In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import re

## Data Processing and Feature Engineering

In [15]:
# Load the training data; 'train.csv' is resolved relative to the working directory.
df = pd.read_csv('train.csv')

In [16]:
# Preview the first rows to see the raw columns (including the free-text
# 'engine' and 'transmission' fields) before any feature engineering.
df.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


In [17]:
def preprocess_engine_data(df, column_name='engine'):
    """
    Split free-text engine descriptions into structured feature columns.

    Each description (for example
    "375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel") is scanned with
    independent, case-insensitive regular expressions, so a description that
    lacks one attribute still yields the others.

    Parameters:
    df (DataFrame): The DataFrame containing the engine data. It is not
        modified; a new DataFrame is returned.
    column_name (str): The name of the column with the engine descriptions.

    Returns:
    DataFrame: ``df`` without ``column_name`` and with these columns appended,
    row-aligned with ``df`` (the original index is preserved):
        - 'Horsepower': horsepower truncated to a whole number.
        - 'Engine Displacement (L)': displacement in litres as a float.
        - 'Number of Cylinders': the number preceding the word "Cylinder".
        - 'Engine Configuration': the matched layout text, e.g. "V6".
        - 'Turbo': 'Yes' when "turbo" appears in the text, otherwise 'No'.
        - 'Technology': comma-separated matches of DOHC/SOHC/MPFI/GDI/OHV/PDI.
    Attributes that cannot be found are left missing (None/NaN). No fuel-type
    column is produced.

    Raises:
    KeyError: If ``column_name`` is not a column of ``df``.
    """
    # One pattern per attribute so that each is extracted independently.
    hp_pattern = re.compile(r'(\d+\.\d+|\d+)\s*HP', re.IGNORECASE)
    disp_pattern = re.compile(r'(\d+\.\d+|\d+)L', re.IGNORECASE)
    cyl_pattern = re.compile(r'(\d+)\s*Cylinder', re.IGNORECASE)
    conf_pattern = re.compile(r'(I\d+|V\d+|Flat \d+|Straight \d+)', re.IGNORECASE)
    turbo_pattern = re.compile(r'turbo', re.IGNORECASE)
    technology_pattern = re.compile(r'DOHC|SOHC|MPFI|GDI|OHV|PDI', re.IGNORECASE)

    # Column-wise accumulators; building the frame from this dict guarantees
    # the output columns exist even when ``df`` has no rows.
    extracted = {
        'Horsepower': [],
        'Engine Displacement (L)': [],
        'Number of Cylinders': [],
        'Engine Configuration': [],
        'Turbo': [],
        'Technology': [],
    }

    for desc in df[column_name]:
        # Missing values (NaN/None) and any other non-string entry are treated
        # as an empty description: nothing matches, so every extracted
        # attribute is missing and 'Turbo' is 'No', instead of raising
        # TypeError inside re.search.
        text = desc if isinstance(desc, str) else ''

        hp_match = hp_pattern.search(text)
        extracted['Horsepower'].append(
            int(float(hp_match.group(1))) if hp_match else None)

        # Stored as a float so the column is directly usable as a numeric feature.
        disp_match = disp_pattern.search(text)
        extracted['Engine Displacement (L)'].append(
            float(disp_match.group(1)) if disp_match else None)

        cyl_match = cyl_pattern.search(text)
        extracted['Number of Cylinders'].append(
            int(cyl_match.group(1)) if cyl_match else None)

        conf_match = conf_pattern.search(text)
        extracted['Engine Configuration'].append(
            conf_match.group(1) if conf_match else None)

        extracted['Turbo'].append('Yes' if turbo_pattern.search(text) else 'No')

        tech_terms = technology_pattern.findall(text)
        extracted['Technology'].append(", ".join(tech_terms) if tech_terms else None)

    # Reuse the caller's index: with a default RangeIndex here, concat(axis=1)
    # would align on labels and pair rows incorrectly (or add all-NaN rows)
    # whenever ``df`` has been filtered, shuffled or otherwise re-indexed.
    new_data = pd.DataFrame(extracted, index=df.index)

    # drop() without inplace leaves the caller's frame untouched.
    return pd.concat([df.drop(columns=[column_name]), new_data], axis=1)

In [18]:
def preprocess_transmission_data(df, column_name='transmission'):
    """
    Derive gear count and transmission type from free-text transmission labels.

    The number of speeds ("10-Speed ...") and the type marker ("A/T" or "M/T")
    are searched for independently, so a label such as "A/T" still yields a
    type and a label such as "8-Speed ..." without a type marker still yields
    a speed count.

    Parameters:
    df (DataFrame): The DataFrame containing the transmission labels. It is
        not modified; a new DataFrame is returned.
    column_name (str): The name of the column with the transmission labels.

    Returns:
    DataFrame: ``df`` without ``column_name`` and with two columns appended:
        - 'Number of Speeds': the integer before "-Speed", missing if absent.
        - 'Transmission Type': 'A/T' or 'M/T' (upper-cased), missing if absent.

    Raises:
    KeyError: If ``column_name`` is not a column of ``df``.
    """
    speeds_pattern = re.compile(r'(\d+)-Speed', re.IGNORECASE)
    type_pattern = re.compile(r'\b(M/T|A/T)\b', re.IGNORECASE)

    num_speeds = []
    transmission_type = []

    for entry in df[column_name]:
        # Missing values (NaN/None) and other non-string entries are treated
        # as an empty label so they produce missing features instead of a
        # TypeError inside re.search.
        text = entry if isinstance(entry, str) else ''

        speeds_match = speeds_pattern.search(text)
        num_speeds.append(int(speeds_match.group(1)) if speeds_match else None)

        # Matching is case-insensitive; upper-casing keeps a single spelling
        # per category so one-hot encoding does not split 'a/t' from 'A/T'.
        type_match = type_pattern.search(text)
        transmission_type.append(type_match.group(1).upper() if type_match else None)

    # drop() without inplace returns a new frame, leaving the caller's intact;
    # list assignment is positional, so any index on ``df`` is preserved.
    result = df.drop(columns=[column_name])
    result['Number of Speeds'] = num_speeds
    result['Transmission Type'] = transmission_type
    return result

In [19]:
def preprocess_data(df):
    """
    Drop rows without a price and build the feature-preprocessing transformer.

    Parameters:
    df (DataFrame): The feature-engineered data. Rows whose 'price' is missing
        are removed from this frame IN PLACE.

    Returns:
    ColumnTransformer: An unfitted transformer that
        - median-imputes and standard-scales the numeric features, and
        - fills missing categorical values with the constant 'missing' and
          one-hot encodes them, ignoring categories unseen during fit.
    """
    # Deliberately in place: callers that go on to train on the same ``df``
    # object rely on the rows with a missing target having been removed.
    df.dropna(subset=["price"], inplace=True)

    # The dataset spells the odometer column 'milage'. Prefer 'mileage' when a
    # frame actually has it, otherwise use the dataset's spelling, so that the
    # ColumnTransformer does not fail at fit time on a non-existent column.
    mileage_column = 'mileage' if 'mileage' in df.columns else 'milage'

    # 'Horsepower', 'Number of Cylinders' and 'Transmission Type' are produced
    # by the engine/transmission feature-engineering steps.
    numeric_features = [mileage_column, 'model_year', 'Horsepower', 'Number of Cylinders']
    categorical_features = ['brand', 'model', 'fuel_type', 'Transmission Type']

    # Creating transformers for numerical and categorical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Combine transformers into a preprocessor step
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    return preprocessor

In [6]:
# Feature engineering: parse the engine text first, then the transmission text.
df = df.pipe(preprocess_engine_data).pipe(preprocess_transmission_data)

In [20]:
# Keep the returned transformer so it can be passed to train_model later;
# the bare name on the last line still shows it as the cell output.
preprocessor = preprocess_data(df)
preprocessor

## Model Training

In [21]:
def train_model(df, preprocessor, model, test_size=0.2, random_state=42):
    """
    Fit ``preprocessor`` + ``model`` on a train split and score it on a held-out split.

    Parameters:
    df (DataFrame): Data containing the feature columns and a 'price' target column.
    preprocessor: Transformer placed before the model in the pipeline
        (e.g. the ColumnTransformer returned by preprocess_data).
    model: Estimator used as the final 'regressor' step.
    test_size (float): Fraction of rows held out for evaluation. Defaults to 0.2.
    random_state (int): Seed for the train/test shuffle, so the split and the
        reported score are reproducible. Defaults to 42.

    Returns:
    tuple: ``(clf, rmse)`` where ``clf`` is the fitted Pipeline and ``rmse`` is
    the root mean squared error of its predictions on the held-out split.

    Raises:
    KeyError: If ``df`` has no 'price' column.
    """
    # Define the features and target
    X = df.drop('price', axis=1)
    y = df['price']

    # Hold out part of the data so the score reflects unseen rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Preprocessing lives inside the pipeline so it is fitted on the training
    # split only and then re-applied unchanged to the test split.
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('regressor', model)])

    # Train the model
    clf.fit(X_train, y_train)

    # Predict and evaluate the model
    y_pred = clf.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return clf, rmse

# Main function to run the whole process
def main(filepath):
    """
    Run the full workflow: load, engineer features, build the preprocessor, train, report.

    Parameters:
    filepath (str): Path to the CSV file with the training data.

    Returns:
    Pipeline: The fitted preprocessing + regression pipeline.
    """
    # Read the CSV directly with pandas, the same way the notebook loads
    # 'train.csv'; no separate loader function is defined in this notebook.
    df = pd.read_csv(filepath)

    # The preprocessor selects 'Horsepower', 'Number of Cylinders' and
    # 'Transmission Type', which only exist after these two steps have run.
    df = preprocess_engine_data(df)
    df = preprocess_transmission_data(df)

    # Also drops rows with a missing price from ``df`` in place.
    preprocessor = preprocess_data(df)
    model = LinearRegression()  # Starting with a simple linear regression model
    clf, rmse = train_model(df, preprocessor, model)
    print(f"Root Mean Squared Error: {rmse}")
    return clf
