LinearRegression

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

df = pd.read_csv('Cardheko_final.csv')
df.fillna(method='ffill', inplace=True)
df['Age_of_the_Car'] = 2024 - df['modelYear'] #Car_age_calculating

categorical_cols = df.select_dtypes(include=['object']).columns

le = LabelEncoder()

for col in categorical_cols:
    df[col] = df[col].astype(str)
    df[col] = le.fit_transform(df[col])

# Separate features and target variable
x = df.drop(['price'], axis=1)
y = df['price']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(x_train, y_train)

train_prediction = model.predict(x_train)
test_prediction = model.predict(x_test)

print('-----Train-----')
train_mse = mean_squared_error(y_train, train_prediction)
train_r2 = r2_score(y_train, train_prediction)
print(f"train_mse: {train_mse}")
print(f"train_r2: {train_r2}")

print('-----Test-----')
test_mse = mean_squared_error(y_test, test_prediction)
test_r2 = r2_score(y_test, test_prediction)
print(f"test_mse: {test_mse}")
print(f"test_r2: {test_r2}")
print("\n")

  df.fillna(method='ffill', inplace=True)


-----Train-----
train_mse: 416128323682.7289
train_r2: 0.6118384601804481
-----Test-----
test_mse: 567205982446.4929
test_r2: 0.5813106721528956




In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

df = pd.read_csv('Cardheko_final.csv')


df['Age_of_the_Car'] = 2024 - df['modelYear'] #Car_age_calculating

categorical_cols = df.select_dtypes(include=['object']).columns #encoding
le = LabelEncoder()
for col in categorical_cols:
    df[col] = df[col].astype(str)
    df[col] = le.fit_transform(df[col])

df.fillna(method='ffill', inplace=True)

x = df.drop(['price'], axis=1)
y = df['price']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

# Define models
models = {
    'DecisionTree': DecisionTreeRegressor(),
    'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor()
}

# Hyperparameters for tuning
param_grid = {
    'DecisionTree': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 10, 20],
        'min_samples_leaf': [1, 5, 10]
    },
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 5]
    },
    'GradientBoosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
}

# Perform GridSearchCV for each model
best_models = {}
for name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[name], cv=5, scoring='r2')
    grid_search.fit(x_train, y_train)
    best_models[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")

# Evaluate the best models
for name, model in best_models.items():
    train_prediction = model.predict(x_train)
    test_prediction = model.predict(x_test)
    print(f"\nModel: {name}")
    
    print('-----Train-----')
    train_mae = mean_absolute_error(y_train, train_prediction)
    train_mse = mean_squared_error(y_train, train_prediction)
    train_r2 = r2_score(y_train, train_prediction)
    print(f"train_mae: {train_mae}")
    print(f"train_mse: {train_mse}")
    print(f"train_r2: {train_r2}")
    
    print('-----Test-----')
    test_mae = mean_absolute_error(y_test, test_prediction)
    test_mse = mean_squared_error(y_test, test_prediction)
    test_r2 = r2_score(y_test, test_prediction)
    print(f"test_mae: {test_mae}")
    print(f"test_mse: {test_mse}")
    print(f"test_r2: {test_r2}")



  df.fillna(method='ffill', inplace=True)


Best parameters for DecisionTree: {'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 20}
Best parameters for RandomForest: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best parameters for GradientBoosting: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}

Model: DecisionTree
-----Train-----
train_mae: 147830.1533556213
train_mse: 121258845581.59732
train_r2: 0.8919805085883908
-----Test-----
test_mae: 189913.90399999978
test_mse: 241571121633.35797
test_r2: 0.7961877696491189

Model: RandomForest
-----Train-----
train_mae: 53371.16787144558
train_mse: 18993580467.222473
train_r2: 0.9830801877395886
-----Test-----
test_mae: 129735.07914036594
test_mse: 103717951364.5865
test_r2: 0.9124937333067316

Model: GradientBoosting
-----Train-----
train_mae: 57745.91421653722
train_mse: 6356767192.761627
train_r2: 0.9943372810792426
-----Test-----
test_mae: 124833.78578816922
test_mse: 73570567759.75194
test_r2: 0.9379289155015245


In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor

df = pd.read_csv('Cardheko_final.csv')

df['Age_of_the_Car'] = 2024 - df['modelYear']  # Car_age_calculating

categorical_cols = df.select_dtypes(include=['object']).columns  # encoding
le = LabelEncoder()
for col in categorical_cols:
    df[col] = df[col].astype(str)
    df[col] = le.fit_transform(df[col])

df.fillna(method='ffill', inplace=True)

x = df.drop(['price'], axis=1)
y = df['price']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

gb = GradientBoostingRegressor(learning_rate=0.2, max_depth=5, n_estimators=200)
gb.fit(x_train, y_train)

train_pred = gb.predict(x_train)
test_pred = gb.predict(x_test)

  df.fillna(method='ffill', inplace=True)


In [14]:
import pickle

# Save the model to a .pkl file
with open('CDmodel.pkl', 'wb') as file:
    pickle.dump(gb, file)

In [16]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
import pickle

df = pd.read_csv('Cardheko_final.csv')

df['Age_of_the_Car'] = 2024 - df['modelYear']  # Car_age_calculating

categorical_cols = df.select_dtypes(include=['object']).columns  # encoding
le = LabelEncoder()
for col in categorical_cols:
    df[col] = df[col].astype(str)
    df[col] = le.fit_transform(df[col])

df.fillna(method='ffill', inplace=True)

x = df.drop(['price'], axis=1)
y = df['price']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Load the model from the .pkl file
with open('CDmodel.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Use the loaded model for prediction
train_pred = loaded_model.predict(x_train)
test_pred = loaded_model.predict(x_test)

print(f"Train MSE: {mean_squared_error(y_train, train_pred)}")
print(f"Test MSE: {mean_squared_error(y_test, test_pred)}")
print(f"Train R^2: {r2_score(y_train, train_pred)}")
print(f"Test R^2: {r2_score(y_test, test_pred)}")

Train MSE: 25648773981.119724
Test MSE: 24749895672.33688
Train R^2: 0.9772853584479851
Test R^2: 0.9780470414264774


  df.fillna(method='ffill', inplace=True)


In [None]:

df = pd.read_csv('Cardheko_final.csv')

# Calculate the age of the car
df['Age_of_the_Car'] = 2024 - df['modelYear']

# Encode categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    df[col] = df[col].astype(str)
    df[col] = le.fit_transform(df[col])

# Fill NaN values
df.fillna(method='ffill', inplace=True)

# Splitting the data
X = df.drop('price', axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Define a pipeline with scaling and model
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', ElasticNet())
])

# Define the parameter grid for Bayesian Optimization
param_grid = {
    'model__alpha': (0.1, 100.0, 'log-uniform'),
    'model__l1_ratio': (0.0, 1.0)
}

# Implementing k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Bayesian Optimization with cross-validation
bayes_search = BayesSearchCV(pipeline, param_grid, cv=kf, scoring='neg_mean_absolute_error', n_jobs=-1, n_iter=50, random_state=42)
bayes_search.fit(X_train_poly, y_train)

# Best model from Bayesian Optimization
best_model = bayes_search.best_estimator_

# Predictions
y_train_pred = best_model.predict(X_train_poly)
y_test_pred = best_model.predict(X_test_poly)

# Evaluation
train_mae = mean_absolute_error(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

test_mae = mean_absolute_error(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("-----Train-----")
print(f"train_mae: {train_mae}")
print(f"train_mse: {train_mse}")
print(f"train_r2: {train_r2}")

print("-----Test-----")
print(f"test_mae: {test_mae}")
print(f"test_mse: {test_mse}")
print(f"test_r2: {test_r2}")

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [6]:
# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    return mae, mse, r2

In [7]:
df = pd.read_csv('Cardheko_final.csv')

# Calculate the age of the car
df['Age_of_the_Car'] = 2024 - df['modelYear']

# Encode categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    df[col] = df[col].astype(str)
    df[col] = le.fit_transform(df[col])

# Fill NaN values
df.fillna(method='ffill', inplace=True)

X = df.drop('price', axis=1)  
y = df['price']  
# Split the dataset 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
mae_lr, mse_lr, r2_lr = evaluate_model(lr, X_test, y_test)
print(f"Linear Regression - MAE: {mae_lr}, MSE: {mse_lr}, R2: {r2_lr}")


Linear Regression - MAE: 396614.11778851406, MSE: 567204007529.4952, R2: 0.581312129959576


  df.fillna(method='ffill', inplace=True)


In [8]:
# 2. Decision Tree with Grid Search
param_grid_dt = {
    'max_depth': [None, 10, 20,30,40,50],
    'min_samples_split': [2, 5, 10,30,50]
}
dt = DecisionTreeRegressor(random_state=42)
grid_search_dt = GridSearchCV(dt, param_grid_dt, cv=5, scoring='neg_mean_squared_error')
grid_search_dt.fit(X_train, y_train)
best_dt = grid_search_dt.best_estimator_
mae_dt, mse_dt, r2_dt = evaluate_model(best_dt, X_test, y_test)
print(f"Decision Tree - MAE: {mae_dt}, MSE: {mse_dt}, R2: {r2_dt}")
print(f"Best Decision Tree Params: {grid_search_dt.best_params_}")

Decision Tree - MAE: 185259.1253427112, MSE: 250263789890.8981, R2: 0.8152650338384368
Best Decision Tree Params: {'max_depth': 20, 'min_samples_split': 10}


In [9]:
# 3. Gradient Boosting with Grid Search
param_grid_gb = {
    'n_estimators': [300,400,800,1000, 1500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3,7,10]
}
gb = GradientBoostingRegressor(random_state=42)
grid_search_gb = GridSearchCV(gb, param_grid_gb, cv=5, scoring='neg_mean_squared_error')
grid_search_gb.fit(X_train, y_train)
best_gb = grid_search_gb.best_estimator_
mae_gb, mse_gb, r2_gb = evaluate_model(best_gb, X_test, y_test)
print(f"Gradient Boosting - MAE: {mae_gb}, MSE: {mse_gb}, R2: {r2_gb}")
print(f"Best Gradient Boosting Params: {grid_search_gb.best_params_}")

KeyboardInterrupt: 

In [None]:
#4. Random Forest with Grid Search
param_grid_rf = {
    'n_estimators': [400,800,1000, 1500],
    'max_depth': [None, 10, 20,40,50],
    'min_samples_split': [2, 5, 10,30,50]
}
rf = RandomForestRegressor(random_state=42)
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='neg_mean_squared_error')
grid_search_rf.fit(X_train, y_train)
best_rf = grid_search_rf.best_estimator_
mae_rf, mse_rf, r2_rf = evaluate_model(best_rf, X_test, y_test)
print(f"Random Forest - MAE: {mae_rf}, MSE: {mse_rf}, R2: {r2_rf}")
print(f"Best Random Forest Params: {grid_search_rf.best_params_}")

In [None]:
### 5. K-Nearest Neighbors (KNN) with Grid Search
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1 for Manhattan, 2 for Euclidean
}
knn = KNeighborsRegressor()
grid_search_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring='neg_mean_squared_error')
grid_search_knn.fit(X_train, y_train)
best_knn = grid_search_knn.best_estimator_
mae_knn, mse_knn, r2_knn = evaluate_model(best_knn, X_test, y_test)
print(f"KNN - MAE: {mae_knn}, MSE: {mse_knn}, R2: {r2_knn}")
print(f"Best KNN Params: {grid_search_knn.best_params_}")

In [None]:
# Collect results for each model
results = {
    'Model': ['Linear Regression', 'Decision Tree', 'Random Forest', 'Gradient Boosting', 'KNN'],
    'MAE': [mae_lr, mae_dt, mae_rf, mae_gb, mae_knn],
    'MSE': [mse_lr, mse_dt, mse_rf, mse_gb, mse_knn],
    'R2': [r2_lr, r2_dt, r2_rf, r2_gb, r2_knn]
}
# Create DataFrame for comparison
results_df = pd.DataFrame(results)
print(results_df)