In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
import os

# Load the train data
cwd = os.getcwd()
train_data = pd.read_csv(os.path.join(cwd, 'data', 'train.csv'))

# Step 1: Handling Missing Values (none in this case)

# Step 2: Encoding Categorical Variables
categorical_cols = ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']


# One-Hot Encoding for 'brand'
ohe = OneHotEncoder()
brand_ohe = ohe.fit_transform(train_data[['brand']])
brand_ohe_cols = [f'brand_{i}' for i in range(brand_ohe.shape[1])]
brand_ohe_df = pd.DataFrame(brand_ohe.toarray(), columns=brand_ohe_cols)
train_data = pd.concat([train_data, brand_ohe_df], axis=1)
train_data.drop('brand', axis=1, inplace=True)

# Label Encoding for 'model'
le = LabelEncoder()
train_data['model'] = le.fit_transform(train_data['model'])

# Label Encoding for other categorical columns
for col in categorical_cols[1:]:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])

# Step 3: Scaling/Normalizing Numerical Features
numerical_cols = ['id', 'model_year', 'milage', 'price']

scaler = StandardScaler()
train_data[numerical_cols] = scaler.fit_transform(train_data[numerical_cols])


In [2]:
train_data.shape

(54273, 65)

In [3]:

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
import numpy as np

# **Feature Selection or Extraction Methods:**
# 1. Correlation Analysis
corr_matrix = train_data.corr()
corr_with_price = corr_matrix['price'].abs().sort_values(ascending=False)
print("Correlation with price:")
print(corr_with_price)

# 2. Recursive Feature Elimination (RFE)
X = train_data.drop('price', axis=1)
y = train_data['price']
lr_model = LinearRegression()
rfe = RFE(lr_model, n_features_to_select=10)
rfe.fit(X, y)
print("RFE selected features:")
print(X.columns[rfe.support_])

# 3. Principal Component Analysis (PCA)
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)
print("PCA explained variance ratio:")
print(pca.explained_variance_ratio_)

# Choose the best features based on the results
best_features = corr_with_price.nlargest(5).index.tolist() + X.columns[rfe.support_].tolist()
best_features = list(set(best_features))  # remove duplicates
print("Best features:")
print(best_features)

# Create a new DataFrame with the best features
train_data_best = train_data[best_features + ['price']]
print("Best features DataFrame:")
print(train_data_best.head())


Correlation with price:
price           1.000000
milage          0.248927
engine          0.217236
model_year      0.216150
accident        0.114705
                  ...   
fuel_type       0.001832
transmission    0.000860
id              0.000587
brand_28        0.000156
clean_title          NaN
Name: price, Length: 65, dtype: float64
RFE selected features:
Index(['brand_12', 'brand_17', 'brand_30', 'brand_36', 'brand_37', 'brand_39',
       'brand_40', 'brand_45', 'brand_46', 'brand_48'],
      dtype='object')
PCA explained variance ratio:
[7.87459597e-01 1.84312029e-01 2.54057685e-02 2.46809857e-03
 3.43458366e-04 3.83000168e-06 2.68059607e-06 1.05310570e-06
 6.13156801e-07 4.66086632e-07]
Best features:
['brand_12', 'brand_17', 'brand_48', 'brand_30', 'accident', 'brand_36', 'brand_45', 'price', 'brand_40', 'milage', 'brand_37', 'model_year', 'brand_46', 'engine', 'brand_39']
Best features DataFrame:
   brand_12  brand_17  brand_48  brand_30  accident  brand_36  brand_45  \
0     

In [4]:
train_data_best

Unnamed: 0,brand_12,brand_17,brand_48,brand_30,accident,brand_36,brand_45,price,brand_40,milage,brand_37,model_year,brand_46,engine,brand_39,price.1
0,0.0,0.0,0.0,0.0,1,0.0,0.0,-0.387479,0.0,0.031759,0.0,0.520325,0.0,719,0.0,-0.387479
1,0.0,0.0,0.0,0.0,1,0.0,0.0,-0.425241,0.0,0.143728,0.0,-1.447877,0.0,534,0.0,-0.425241
2,0.0,0.0,0.0,0.0,1,0.0,0.0,-0.332554,0.0,0.371412,0.0,-1.090022,0.0,541,0.0,-0.332554
3,0.0,0.0,0.0,0.0,1,0.0,0.0,0.333420,0.0,-1.393115,0.0,1.236035,0.0,646,0.0,0.333420
4,0.0,0.0,0.0,0.0,1,0.0,0.0,-0.430733,1.0,0.757966,0.0,-2.521442,0.0,219,0.0,-0.430733
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54268,0.0,0.0,0.0,0.0,1,0.0,0.0,-0.140314,0.0,-0.866793,0.0,0.341397,0.0,856,0.0,-0.140314
54269,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.449271,0.0,0.433688,0.0,-0.016458,0.0,266,0.0,-0.449271
54270,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.278315,0.0,-0.629241,0.0,-0.374313,0.0,817,0.0,-0.278315
54271,0.0,0.0,0.0,0.0,1,0.0,0.0,2.138622,0.0,-1.411325,0.0,1.414962,0.0,762,0.0,2.138622


In [6]:
train_data_best.shape

(54273, 16)

In [57]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

X = train_data_best.drop('price', axis=1)
y = train_data_best['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

algorithms = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(n_estimators=100),
    XGBRegressor(n_estimators=100),
    MLPRegressor(hidden_layer_sizes=(50, 50))
]

results = {}
for algo in algorithms:
    algo_name = algo.__class__.__name__
    print(f"Training {algo_name}...")
    algo.fit(X_train, y_train)
    y_pred = algo.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    results[algo_name] = {'MSE': mse, 'RMSE': rmse, 'R2': r2}
    print(f"{algo_name} - MSE: {mse:.2f}, RMSE: {rmse:.2f}, R2: {r2:.2f}\n")

print("Results:")
for alg, metrics in results.items():
    print(f"{alg}:")
    print(f"  MSE: {metrics['MSE']:.2f}")
    print(f"  RMSE: {metrics['RMSE']:.2f}")
    print(f"  R2: {metrics['R2']:.2f}\n")

best_algo = max(results, key=lambda x: results[x]['R2'])
print(f"Best algorithm: {best_algo}")


Training LinearRegression...
LinearRegression - MSE: 0.47, RMSE: 0.68, R2: 0.16

Training DecisionTreeRegressor...
DecisionTreeRegressor - MSE: 2.09, RMSE: 1.44, R2: -2.77

Training RandomForestRegressor...
RandomForestRegressor - MSE: 0.60, RMSE: 0.78, R2: -0.09

Training XGBRegressor...
XGBRegressor - MSE: 0.74, RMSE: 0.86, R2: -0.33

Training MLPRegressor...
MLPRegressor - MSE: 0.55, RMSE: 0.74, R2: 0.01

Results:
LinearRegression:
  MSE: 0.47
  RMSE: 0.68
  R2: 0.16

DecisionTreeRegressor:
  MSE: 2.09
  RMSE: 1.44
  R2: -2.77

RandomForestRegressor:
  MSE: 0.60
  RMSE: 0.78
  R2: -0.09

XGBRegressor:
  MSE: 0.74
  RMSE: 0.86
  R2: -0.33

MLPRegressor:
  MSE: 0.55
  RMSE: 0.74
  R2: 0.01

Best algorithm: LinearRegression


In [60]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

X = train_data_best.drop('price', axis=1)
y = train_data_best['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# K-Fold Cross-Validation
kf = KFold(n_splits=5, random_state=42, shuffle=True)

scores = []
for train_index, test_index in kf.split(X_train):
    X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]

    model = LinearRegression()
    model.fit(X_train_cv, y_train_cv)
    y_pred_cv = model.predict(X_test_cv)

    r2 = r2_score(y_test_cv, y_pred_cv)
    scores.append(r2)

print(f"Average R2 score from K-fold cross-validation: {np.mean(scores):.2f}")

# Grid Search for Hyperparameter Tuning
param_grid = {
    'fit_intercept': [True, False],
}

grid_search = GridSearchCV(estimator=LinearRegression(), param_grid=param_grid, scoring='r2', cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best parameters found by grid search: {best_params}")

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
rmse_best = np.sqrt(mse_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Best model - MSE: {mse_best:.2f}, RMSE: {rmse_best:.2f}, R2: {r2_best:.2f}")


Average R2 score from K-fold cross-validation: 0.09
Best parameters found by grid search: {'fit_intercept': True}
Best model - MSE: 0.47, RMSE: 0.68, R2: 0.16


In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

X = train_data_best.drop('price', axis=1)
y = train_data_best['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# K-Fold Cross-Validation
kf = KFold(n_splits=5, random_state=42, shuffle=True)

scores = []
for train_index, test_index in kf.split(X_train):
    X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]

    model = XGBRegressor(n_estimators=100)
    model.fit(X_train_cv, y_train_cv)
    y_pred_cv = model.predict(X_test_cv)

    r2 = r2_score(y_test_cv, y_pred_cv)
    scores.append(r2)

print(f"Average R2 score from K-fold cross-validation: {np.mean(scores):.2f}")

# Grid Search for Hyperparameter Tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(estimator=XGBRegressor(), param_grid=param_grid, scoring='r2', cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best parameters found by grid search: {best_params}")

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
rmse_best = np.sqrt(mse_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Best model - MSE: {mse_best:.2f}, RMSE: {rmse_best:.2f}, R2: {r2_best:.2f}")


Average R2 score from K-fold cross-validation: -0.12
Best parameters found by grid search: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best model - MSE: 0.45, RMSE: 0.67, R2: 0.19


In [7]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

X = train_data_best.drop('price', axis=1)
y = train_data_best['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# K-Fold Cross-Validation
kf = KFold(n_splits=5, random_state=42, shuffle=True)

scores = []
for train_index, test_index in kf.split(X_train):
    X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]

    model = RandomForestRegressor(n_estimators=100)
    model.fit(X_train_cv, y_train_cv)
    y_pred_cv = model.predict(X_test_cv)

    r2 = r2_score(y_test_cv, y_pred_cv)
    scores.append(r2)

print(f"Average R2 score from K-fold cross-validation: {np.mean(scores):.2f}")

# Grid Search for Hyperparameter Tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
}

grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, scoring='r2', cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best parameters found by grid search: {best_params}")

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
rmse_best = np.sqrt(mse_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Best model - MSE: {mse_best:.2f}, RMSE: {rmse_best:.2f}, R2: {r2_best:.2f}")


Average R2 score from K-fold cross-validation: -0.06


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Split the data into features and target
X = train_data_best.drop('price', axis=1)
y = train_data_best['price']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# K-Fold Cross-Validation
kf = KFold(n_splits=5, random_state=42, shuffle=True)

scores = []
for train_index, test_index in kf.split(X_train):
    X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]

    # Create a pipeline that scales the data then applies MLPRegressor
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=1000, random_state=42))
    ])

    # Fit the model
    pipeline.fit(X_train_cv, y_train_cv)
    y_pred_cv = pipeline.predict(X_test_cv)

    # Calculate and store the R2 score
    r2 = r2_score(y_test_cv, y_pred_cv)
    scores.append(r2)

print(f"Average R2 score from K-fold cross-validation: {np.mean(scores):.2f}")

# Grid Search for Hyperparameter Tuning
param_grid = {
    'mlp__hidden_layer_sizes': [(50, 50), (100, 100), (150, 150)],
    'mlp__activation': ['relu', 'tanh'],
    'mlp__solver': ['adam', 'lbfgs'],
    'mlp__alpha': [0.0001, 0.001, 0.01],
    'mlp__max_iter': [1000]  # Set a higher max_iter to ensure convergence
}

# Create a pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPRegressor(random_state=42))
])

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='r2', cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best parameters found by grid search: {best_params}")

# Use the best model to predict on the test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
rmse_best = np.sqrt(mse_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Best model - MSE: {mse_best:.2f}, RMSE: {rmse_best:.2f}, R2: {r2_best:.2f}")


