In [106]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import re
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

## Data Processing and Feature Engineering

In [128]:
df = pd.read_csv('train.csv')
df.set_index('id', inplace=True)
# df.drop("clean_title", axis=1, inplace=True)
# # df.drop("Engine Displacement (L)", axis=1, inplace=True)

In [130]:
df = df[['brand', 'model', 'model_year', 'milage', 'fuel_type', 'price']]

In [131]:
def preprocess_engine_data(df, column_name='engine'):
    """
    Preprocesses the engine data in the specified column of a DataFrame.
    
    Parameters:
    df (DataFrame): The DataFrame containing the engine data.
    column_name (str): The name of the column with the engine descriptions.
    debug (bool): If True, prints the entries that fail regex match.
    
    Returns:
    DataFrame: The DataFrame with the original column replaced by structured columns.
    """
       
    # Prepare regex patterns for data extraction
    hp_pattern = re.compile(r'(\d+\.\d+|\d+)\s*HP', re.IGNORECASE)
    disp_pattern = re.compile(r'(\d+\.\d+|\d+)L', re.IGNORECASE)
    cyl_pattern = re.compile(r'(\d+)\s*Cylinder', re.IGNORECASE)
    conf_pattern = re.compile(r'(I\d+|V\d+|Flat \d+|Straight \d+)', re.IGNORECASE)
    turbo_pattern = re.compile(r'turbo', re.IGNORECASE)
    technology_pattern = re.compile(r'DOHC|SOHC|MPFI|GDI|OHV|PDI', re.IGNORECASE)
    
    # Fuel patterns dictionary
    fuel_patterns = {
        'Gasoline': re.compile(r'Gasoline', re.IGNORECASE),
        'Diesel': re.compile(r'Diesel', re.IGNORECASE),
        'Hybrid': re.compile(r'Hybrid', re.IGNORECASE),
        'Electric': re.compile(r'Electric', re.IGNORECASE),
        'Flex Fuel': re.compile(r'Flex Fuel', re.IGNORECASE),
        'Plug-In Electric/Gas': re.compile(r'Plug-In Electric/Gas', re.IGNORECASE)
    }
    
    # Lists to hold extracted data
    horsepower = []
    displacement = []
    cylinders = []
    configuration = []
    fuel_type = []
    turbo = []
    technology = []
    
    # Process each entry in the specified column
    for desc in df[column_name]:
        # Extract and append horsepower
        hp_match = hp_pattern.search(desc)
        horsepower.append(int(float(hp_match.group(1))) if hp_match else None)
    
        # Extract and append displacement
        disp_match = disp_pattern.search(desc)
        displacement.append(disp_match.group(1) if disp_match else None)
    
        # Extract and append cylinder count
        cyl_match = cyl_pattern.search(desc)
        cylinders.append(int(cyl_match.group(1)) if cyl_match else None)
    
        # Extract and append engine configuration
        conf_match = conf_pattern.search(desc)
        configuration.append(conf_match.group(1) if conf_match else None)
    
        # Determine and append fuel type
        detected_fuel_type = None
        for fuel, pattern in fuel_patterns.items():
            if pattern.search(desc):
                detected_fuel_type = fuel
                break
        fuel_type.append(detected_fuel_type if detected_fuel_type else 'Other')
    
        # Check and append turbo presence
        turbo.append('Yes' if turbo_pattern.search(desc) else 'No')
    
        # Extract and append technology terms
        tech_match = technology_pattern.findall(desc)
        technology.append(", ".join(tech_match) if tech_match else None)
    
    # Create a DataFrame from the lists
    new_data = pd.DataFrame({
        'Horsepower': horsepower,
        'Engine Displacement (L)': displacement,
        'Number of Cylinders': cylinders,
        'Engine Configuration': configuration,
        # 'Fuel Type': fuel_type,
        # 'Turbo': turbo,
        # 'Technology': technology
    })
    df.drop(column_name, axis=1, inplace=True)
    # Concatenate the new data with the original DataFrame
    return pd.concat([df, new_data], axis=1)

In [132]:
def preprocess_transmission_data(df, column_name='transmission'):
    # Regex pattern to extract number of speeds and type of transmission
    pattern = re.compile(r'(\d+)-Speed\s+(M/T|A/T)', re.IGNORECASE)
    
    # Lists to hold extracted data
    num_speeds = []
    transmission_type = []
    
    # Process each entry in the specified column
    for entry in df[column_name]:
        match = pattern.search(entry)
        if match:
            # Append the number of speeds and transmission type if pattern matches
            num_speeds.append(int(match.group(1)))
            transmission_type.append(match.group(2))
        else:
            # Handle entries that do not match the pattern
            num_speeds.append(None)
            transmission_type.append(None)
    
    # Add the extracted data as new columns in the DataFrame
    # df['Number of Speeds'] = num_speeds
    df['Transmission Type'] = transmission_type
    df.drop(column_name, axis=1, inplace=True)
    return df

In [137]:
def preprocess_data(df):
    # Drop rows where the target (price) is missing
    df.dropna(subset=["price"], inplace=True)
    df['model_year'] = pd.to_datetime(df['model_year'], format='%Y')
    
    # Selecting features: Here we should focus on features that are numeric and categorical columns that need encoding
    numeric_features = ['milage']
                        # 'Horsepower', 'Number of Cylinders']
    categorical_features = ['brand', 'model', 'fuel_type', 'model_year']
                            # 'Transmission Type', ]
    
    # Creating transformers for numerical and categorical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    
    # Combine transformers into a preprocessor step
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])
    
    return preprocessor

In [138]:
# df = preprocess_engine_data(df)
# df = preprocess_transmission_data(df)

# Model Training

In [139]:
def train_model(df, preprocessor, m= Ridge()):
    X = df.drop('price', axis=1)
    y = df['price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', m)  # Using Ridge regression with default alpha
    ])
    
    param_grid = {'regressor__alpha': [0.1, 1.0, 10.0]}  # Hyperparameter tuning for Ridge
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    return best_model, rmse


## Linear regression

In [140]:
# Main function to run the whole process
# def main(df):
#     preprocessor = preprocess_data(df)
#     model = LinearRegression() 
#     model = Ridge() # Starting with a simple linear regression model
#     clf, rmse, features  = train_model(df, preprocessor, model)
#     print(f"Root Mean Squared Error: {rmse}")
#     print("Feature Importances (if applicable):")
#     print(features)
#     return [model, rmse, clf]

def main(df):
    preprocessor = preprocess_data(df)
    clf, rmse = train_model(df, preprocessor)
    print(f"Root Mean Squared Error: {rmse}")
    return clf, rmse

# Example usage (assuming df is your DataFrame)
clf, rmse = main(df)

# trained = main(df)
# model = trained[0]
# clf = trained[2]
# print(trained[1])

test = pd.read_csv('test.csv')
test = preprocess_engine_data(test)
test = preprocess_transmission_data(test)

test['price'] = clf.predict(test)
test['price'] = test['price'].round(3)
test[['id', 'price']].to_csv('submission.csv', index=False)
# 50993
#50224.33

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=["price"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['model_year'] = pd.to_datetime(df['model_year'], format='%Y')


Root Mean Squared Error: 48424.563600782305


## Decision tree

In [141]:
def train_model(df, preprocessor, model):
    X = df.drop('price', axis=1)
    y = df['price']
    # print(X.info())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('regressor', model)])

    # Setup Grid Search Parameters
    param_grid = {
        'regressor__max_depth': [None, 10, 20, 30],
        'regressor__min_samples_split': [2, 10, 20],
        'regressor__min_samples_leaf': [1, 5, 10]
    }

    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    best_clf = grid_search.best_estimator_

    if hasattr(best_clf.named_steps['regressor'], 'feature_importances_'):
        feature_names = get_feature_names(best_clf.named_steps['preprocessor'])
        feature_importances = best_clf.named_steps['regressor'].feature_importances_
        features = pd.DataFrame({
            'Feature': feature_names,
            'Importance': feature_importances
        }).sort_values(by='Importance', ascending=False)
    else:
        features = pd.DataFrame(columns=['Feature', 'Importance'])

    y_pred = best_clf.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return best_clf, rmse, features

def main(df):
    preprocessor = preprocess_data(df)
    model = DecisionTreeRegressor(random_state=42)
    clf, rmse, features = train_model(df, preprocessor, model)
    print(f"Root Mean Squared Error with Decision Tree: {rmse}")

    return [model, rmse, clf]


In [142]:
trained = main(df)
model = trained[0]
clf = trained[2]
print(trained[1])

test = pd.read_csv('test.csv')
test = preprocess_engine_data(test)
test = preprocess_transmission_data(test)

test['price'] = clf.predict(test)
test['price'] = test['price'].round(3)
test[['id', 'price']].to_csv('submission.csv',index=False)

Root Mean Squared Error with Decision Tree: 52572.412067574725
52572.412067574725


## Random Forrest

In [None]:
def main(df):
    preprocessor = preprocess_data(df)
    model = RandomForestRegressor(random_state=42)  # Using a Random Forest Regressor
    clf, rmse, features = train_model(df, preprocessor, model)
    print(f"Root Mean Squared Error with Random Forest: {rmse}")
  
    return [model, rmse, clf]

# Assuming df is your training DataFrame
trained = main(df)
model = trained[0]
clf = trained[2]
print(trained[1])

# Load and preprocess test data
test = pd.read_csv('test.csv')
# Assuming preprocess_engine_data and preprocess_transmission_data are applicable and defined correctly
test = preprocess_engine_data(test)  # Make sure this function is defined and operates correctly
test = preprocess_transmission_data(test)  # Make sure this function is defined and operates correctly

# Predict using the trained clf (pipeline with preprocessor and regressor)
test['price'] = clf.predict(test)
test['price'] = test['price'].round(3)
test[['id', 'price']].to_csv('submission.csv', index=False)