In [1]:
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import matplotlib.pyplot as plt

# # Setup logging
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the training data
df = pd.read_csv(r'C:\Users\Lenovo\Downloads\pc1_small_train_data_v1.csv')

# Handle missing values
imputer = SimpleImputer(strategy='mean')
df.fillna(df.mean(), inplace=True)

# Encode categorical features
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Separate features and target
X = df.drop(columns=['price'])
y = np.log(df['price'])  # Predict the natural log of the price

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features)
    ]
)

# Define models
models = {
    'linear_regression': LinearRegression(),
    'lasso': Lasso(),
    'ridge': Ridge(),
    'decision_tree': DecisionTreeRegressor()
}

# Create pipelines
pipelines = {}
for model_name, model in models.items():
    pipelines[model_name] = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

def objective(trial, model_name):
    params = {}
    if model_name == 'lasso':
        params['model__alpha'] = trial.suggest_float('model__alpha', 1e-3, 10.0)
    elif model_name == 'ridge':
        params['model__alpha'] = trial.suggest_float('model__alpha', 1e-3, 10.0)
    elif model_name == 'decision_tree':
        params['model__max_depth'] = trial.suggest_int('model__max_depth', 3, 20)
        params['model__min_samples_split'] = trial.suggest_int('model__min_samples_split', 2, 20)
        params['model__min_samples_leaf'] = trial.suggest_int('model__min_samples_leaf', 1, 20)

    model_pipeline = pipelines[model_name].set_params(**params)
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(model_pipeline, X_train, y_train, scoring='r2', cv=kf)
    return scores.mean()

# Hyperparameter tuning for each model
best_params = {}
for model_name in models.keys():
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, model_name), n_trials=20)
    best_params[model_name] = study.best_params

# Train and evaluate each model with the best parameters
best_model_name = None
best_model = None
best_r2 = -np.inf
best_mse = np.inf

for model_name, model in models.items():
    model_pipeline = pipelines[model_name].set_params(**best_params[model_name])
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f"{model_name} R2 Score: {r2:.4f}")
    print(f"{model_name} MSE: {mse:.4f}")

    if r2 > best_r2:
        best_r2 = r2
        best_mse = mse
        best_model_name = model_name
        best_model = model_pipeline

print(f"Best Model: {best_model_name}")
print(f"Best Model R2 Score: {best_r2:.4f}")
print(f"Best Model MSE: {best_mse:.4f}")



[I 2024-07-16 15:34:34,580] A new study created in memory with name: no-name-ee78ed37-cc51-47ec-9ab5-85b4179d5024
[I 2024-07-16 15:34:34,960] Trial 0 finished with value: 0.4095493025674763 and parameters: {}. Best is trial 0 with value: 0.4095493025674763.
[I 2024-07-16 15:34:35,031] Trial 1 finished with value: 0.4095493025674763 and parameters: {}. Best is trial 0 with value: 0.4095493025674763.
[I 2024-07-16 15:34:35,118] Trial 2 finished with value: 0.4095493025674763 and parameters: {}. Best is trial 0 with value: 0.4095493025674763.
[I 2024-07-16 15:34:35,200] Trial 3 finished with value: 0.4095493025674763 and parameters: {}. Best is trial 0 with value: 0.4095493025674763.
[I 2024-07-16 15:34:35,271] Trial 4 finished with value: 0.4095493025674763 and parameters: {}. Best is trial 0 with value: 0.4095493025674763.
[I 2024-07-16 15:34:35,355] Trial 5 finished with value: 0.4095493025674763 and parameters: {}. Best is trial 0 with value: 0.4095493025674763.
[I 2024-07-16 15:34:35

linear_regression R2 Score: 0.3886
linear_regression MSE: 0.0196
lasso R2 Score: 0.2923
lasso MSE: 0.0227
ridge R2 Score: 0.3886
ridge MSE: 0.0196
decision_tree R2 Score: 0.5879
decision_tree MSE: 0.0132
Best Model: decision_tree
Best Model R2 Score: 0.5879
Best Model MSE: 0.0132


In [2]:
# Load new dataset for predictions
new_df = pd.read_csv(r'C:\Users\Lenovo\Downloads\pc1_test_without_response_variable_data_v1.csv')

# Handle missing values in the new dataset
new_df.fillna(new_df.mean(), inplace=True)

# Encode categorical features in the new dataset
for column in new_df.select_dtypes(include=['object']).columns:
    if column in label_encoders:
        new_df[column] = label_encoders[column].transform(new_df[column])

# Separate features
X_new = new_df

# Predict with the best model
predictions = best_model.predict(X_new)

# Output predictions
predicted_price = pd.DataFrame(predictions)
print(predicted_price)

# Save the predictions to a new CSV file
predicted_price.to_csv(r'C:\Users\Lenovo\Downloads\869 assignment1.resubmit.csv', index=False,header= False)


              0
0      9.630929
1      9.539975
2      9.689454
3      9.317583
4      9.444575
...         ...
19995  9.865901
19996  9.825225
19997  9.756903
19998  9.569725
19999  9.356007

[20000 rows x 1 columns]


In [4]:
response=pd.read_csv(r'C:\Users\Lenovo\Downloads\pc1_test_with_response_variable_data_v1.csv')
response['price']=np.log(response['price'])
r2 = r2_score(predicted_price, response['price'])
mse = mean_squared_error(predicted_price, response['price'])
print(r2)
print(mse)

0.3548276740868045
0.014052327661407552


In [7]:
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import matplotlib.pyplot as plt

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the training data
df = pd.read_csv(r'C:\Users\Lenovo\Downloads\pc1_small_train_data_v1.csv')

# Handle missing values
imputer = SimpleImputer(strategy='mean')
df.fillna(df.mean(), inplace=True)

# Encode categorical features
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

#To add new features 
df['avg_fuel_economy']=(df['city_fuel_economy']+df['highway_fuel_economy'])/2
df['age']=2024-df['year']
df['mileage_per_year']=(df['mileage']/df['age']).round()
df['horsepower_per_cc']=(df['horsepower']/df['engine_displacement']).round(4)
df['economy_per_cc']=(df['avg_fuel_economy']/df['engine_displacement']).round(4)

# Separate features and target
X = df.drop(columns=['price'])
y = np.log(df['price'])  # Predict the natural log of the price

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features)
    ]
)

# Define models
models = {
    'linear_regression': LinearRegression(),
    'lasso': Lasso(),
    'ridge': Ridge(),
    'decision_tree': DecisionTreeRegressor()
}

# Create pipelines
pipelines = {}
for model_name, model in models.items():
    pipelines[model_name] = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

def objective(trial, model_name):
    params = {}
    if model_name == 'lasso':
        params['model__alpha'] = trial.suggest_float('model__alpha', 1e-3, 10.0)
    elif model_name == 'ridge':
        params['model__alpha'] = trial.suggest_float('model__alpha', 1e-3, 10.0)
    elif model_name == 'decision_tree':
        params['model__max_depth'] = trial.suggest_int('model__max_depth', 3, 20)
        params['model__min_samples_split'] = trial.suggest_int('model__min_samples_split', 2, 20)
        params['model__min_samples_leaf'] = trial.suggest_int('model__min_samples_leaf', 1, 20)

    model_pipeline = pipelines[model_name].set_params(**params)
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(model_pipeline, X_train, y_train, scoring='r2', cv=kf)
    return scores.mean()

# Hyperparameter tuning for each model
best_params = {}
for model_name in models.keys():
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, model_name), n_trials=20)
    best_params[model_name] = study.best_params

# Train and evaluate each model with the best parameters
best_model_name = None
best_model = None
best_r2 = -np.inf
best_mse = np.inf

for model_name, model in models.items():
    model_pipeline = pipelines[model_name].set_params(**best_params[model_name])
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f"{model_name} R2 Score: {r2:.4f}")
    print(f"{model_name} MSE: {mse:.4f}")

    if r2 > best_r2:
        best_r2 = r2
        best_mse = mse
        best_model_name = model_name
        best_model = model_pipeline

print(f"Best Model: {best_model_name}")
print(f"Best Model R2 Score: {best_r2:.4f}")
print(f"Best Model MSE: {best_mse:.4f}")



[I 2024-07-16 15:46:41,751] A new study created in memory with name: no-name-51f8c2cc-343b-4a06-b7e5-d4246bd10a4d
[I 2024-07-16 15:46:42,192] Trial 0 finished with value: 0.4326905298826893 and parameters: {}. Best is trial 0 with value: 0.4326905298826893.
[I 2024-07-16 15:46:42,328] Trial 1 finished with value: 0.4326905298826893 and parameters: {}. Best is trial 0 with value: 0.4326905298826893.
[I 2024-07-16 15:46:42,453] Trial 2 finished with value: 0.4326905298826893 and parameters: {}. Best is trial 0 with value: 0.4326905298826893.
[I 2024-07-16 15:46:42,575] Trial 3 finished with value: 0.4326905298826893 and parameters: {}. Best is trial 0 with value: 0.4326905298826893.
[I 2024-07-16 15:46:42,727] Trial 4 finished with value: 0.4326905298826893 and parameters: {}. Best is trial 0 with value: 0.4326905298826893.
[I 2024-07-16 15:46:42,844] Trial 5 finished with value: 0.4326905298826893 and parameters: {}. Best is trial 0 with value: 0.4326905298826893.
[I 2024-07-16 15:46:42

linear_regression R2 Score: 0.4082
linear_regression MSE: 0.0190
lasso R2 Score: -0.0002
lasso MSE: 0.0321
ridge R2 Score: 0.4082
ridge MSE: 0.0190
decision_tree R2 Score: 0.5841
decision_tree MSE: 0.0133
Best Model: decision_tree
Best Model R2 Score: 0.5841
Best Model MSE: 0.0133


In [9]:
# Load new dataset for predictions
new_df = pd.read_csv(r'C:\Users\Lenovo\Downloads\pc1_test_without_response_variable_data_v1.csv')

# Handle missing values in the new dataset
new_df.fillna(new_df.mean(), inplace=True)

# Encode categorical features in the new dataset
for column in new_df.select_dtypes(include=['object']).columns:
    if column in label_encoders:
        new_df[column] = label_encoders[column].transform(new_df[column])

#To add new features 
new_df['avg_fuel_economy']=(new_df['city_fuel_economy']+new_df['highway_fuel_economy'])/2
new_df['age']=2024-new_df['year']
new_df['mileage_per_year']=(new_df['mileage']/new_df['age']).round()
new_df['horsepower_per_cc']=(new_df['horsepower']/new_df['engine_displacement']).round(4)
new_df['economy_per_cc']=(new_df['avg_fuel_economy']/new_df['engine_displacement']).round(4)

# Separate features
X_new = new_df

# Predict with the best model
predictions = best_model.predict(X_new)

# Output predictions
predicted_price = pd.DataFrame(predictions)
print(predicted_price)


              0
0      9.601483
1      9.530390
2      9.689454
3      9.309289
4      9.505539
...         ...
19995  9.762481
19996  9.821125
19997  9.757472
19998  9.600304
19999  9.435229

[20000 rows x 1 columns]


In [10]:
response=pd.read_csv(r'C:\Users\Lenovo\Downloads\pc1_test_with_response_variable_data_v1.csv')
response['price']=np.log(response['price'])
r2 = r2_score(predicted_price, response['price'])
mse = mean_squared_error(predicted_price, response['price'])
print(r2)
print(mse)

0.3214467815275548
0.014326316804538544


In [11]:
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import matplotlib.pyplot as plt

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the training data
df = pd.read_csv(r'C:\Users\Lenovo\Downloads\pc1_small_train_data_v1.csv')

# Handle missing values
imputer = SimpleImputer(strategy='mean')
df.fillna(df.mean(), inplace=True)

# Encode categorical features
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Separate features and target
X = df.drop(columns=['price'])
y = np.log(df['price'])  # Predict the natural log of the price

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features)
    ]
)

# Define models
models = {
    'linear_regression': LinearRegression(),
    'lasso': Lasso(),
    'ridge': Ridge(),
    'decision_tree': DecisionTreeRegressor()
}

# Create pipelines
pipelines = {}
for model_name, model in models.items():
    pipelines[model_name] = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

def objective(trial, model_name):
    params = {}
    if model_name == 'lasso':
        params['model__alpha'] = trial.suggest_float('model__alpha', 1e-3, 10.0)
    elif model_name == 'ridge':
        params['model__alpha'] = trial.suggest_float('model__alpha', 1e-3, 10.0)
    elif model_name == 'decision_tree':
        params['model__max_depth'] = trial.suggest_int('model__max_depth', 3, 20)
        params['model__min_samples_split'] = trial.suggest_int('model__min_samples_split', 2, 20)
        params['model__min_samples_leaf'] = trial.suggest_int('model__min_samples_leaf', 1, 20)

    model_pipeline = pipelines[model_name].set_params(**params)
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(model_pipeline, X_train, y_train, scoring='r2', cv=kf)
    return scores.mean()

# Hyperparameter tuning for each model
best_params = {}
for model_name in models.keys():
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, model_name), n_trials=20)
    best_params[model_name] = study.best_params

# Train and evaluate each model with the best parameters
best_model_name = None
best_model = None
best_r2 = -np.inf
best_mse = np.inf

for model_name, model in models.items():
    model_pipeline = pipelines[model_name].set_params(**best_params[model_name])
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f"{model_name} R2 Score: {r2:.4f}")
    print(f"{model_name} MSE: {mse:.4f}")

    if r2 > best_r2:
        best_r2 = r2
        best_mse = mse
        best_model_name = model_name
        best_model = model_pipeline

print(f"Best Model: {best_model_name}")
print(f"Best Model R2 Score: {best_r2:.4f}")
print(f"Best Model MSE: {best_mse:.4f}")



[I 2024-07-16 15:53:41,694] A new study created in memory with name: no-name-bba17b61-0919-479f-8c95-7c45070cd950
[I 2024-07-16 15:53:42,270] Trial 0 finished with value: 0.4588067516999164 and parameters: {}. Best is trial 0 with value: 0.4588067516999164.
[I 2024-07-16 15:53:42,462] Trial 1 finished with value: 0.4588067516999164 and parameters: {}. Best is trial 0 with value: 0.4588067516999164.
[I 2024-07-16 15:53:42,649] Trial 2 finished with value: 0.4588067516999164 and parameters: {}. Best is trial 0 with value: 0.4588067516999164.
[I 2024-07-16 15:53:42,884] Trial 3 finished with value: 0.4588067516999164 and parameters: {}. Best is trial 0 with value: 0.4588067516999164.
[I 2024-07-16 15:53:43,099] Trial 4 finished with value: 0.4588067516999164 and parameters: {}. Best is trial 0 with value: 0.4588067516999164.
[I 2024-07-16 15:53:43,300] Trial 5 finished with value: 0.4588067516999164 and parameters: {}. Best is trial 0 with value: 0.4588067516999164.
[I 2024-07-16 15:53:43

linear_regression R2 Score: 0.4401
linear_regression MSE: 0.0180
lasso R2 Score: -0.0002
lasso MSE: 0.0321
ridge R2 Score: 0.4369
ridge MSE: 0.0181
decision_tree R2 Score: 0.5564
decision_tree MSE: 0.0142
Best Model: decision_tree
Best Model R2 Score: 0.5564
Best Model MSE: 0.0142


In [12]:
# Load new dataset for predictions
new_df = pd.read_csv(r'C:\Users\Lenovo\Downloads\pc1_test_without_response_variable_data_v1.csv')

# Handle missing values in the new dataset
new_df.fillna(new_df.mean(), inplace=True)

# Encode categorical features in the new dataset
for column in new_df.select_dtypes(include=['object']).columns:
    if column in label_encoders:
        new_df[column] = label_encoders[column].transform(new_df[column])

# #To add new features 
# new_df['avg_fuel_economy']=(new_df['city_fuel_economy']+new_df['highway_fuel_economy'])/2
# new_df['age']=2024-new_df['year']
# new_df['mileage_per_year']=(new_df['mileage']/new_df['age']).round()
# new_df['horsepower_per_cc']=(new_df['horsepower']/new_df['engine_displacement']).round(4)
# new_df['economy_per_cc']=(new_df['avg_fuel_economy']/new_df['engine_displacement']).round(4)

# Separate features
X_new = new_df

# Predict with the best model
predictions = best_model.predict(X_new)

# Output predictions
predicted_price = pd.DataFrame(predictions)
print(predicted_price)


              0
0      9.709994
1      9.453267
2      9.700043
3      9.408087
4      9.469351
...         ...
19995  9.771404
19996  9.820669
19997  9.661565
19998  9.545772
19999  9.333764

[20000 rows x 1 columns]


In [13]:
response=pd.read_csv(r'C:\Users\Lenovo\Downloads\pc1_test_with_response_variable_data_v1.csv')
response['price']=np.log(response['price'])
r2 = r2_score(predicted_price, response['price'])
mse = mean_squared_error(predicted_price, response['price'])
print(r2)
print(mse)

0.3428313202129727
0.014094593481944594


In [14]:
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import matplotlib.pyplot as plt

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the training data
df = pd.read_csv(r'C:\Users\Lenovo\Downloads\pc1_small_train_data_v1.csv')

# Handle missing values
imputer = SimpleImputer(strategy='mean')
df.fillna(df.mean(), inplace=True)

# Encode categorical features
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

#To add new features 
df['avg_fuel_economy']=(df['city_fuel_economy']+df['highway_fuel_economy'])/2
df['age']=2024-df['year']
df['mileage_per_year']=(df['mileage']/df['age']).round()
df['horsepower_per_cc']=(df['horsepower']/df['engine_displacement']).round(4)
df['economy_per_cc']=(df['avg_fuel_economy']/df['engine_displacement']).round(4)

# Separate features and target
X = df.drop(columns=['price'])
y = np.log(df['price'])  # Predict the natural log of the price

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features)
    ]
)

# Define models
models = {
    'linear_regression': LinearRegression(),
    'lasso': Lasso(),
    'ridge': Ridge(),
    'decision_tree': DecisionTreeRegressor()
}

# Create pipelines
pipelines = {}
for model_name, model in models.items():
    pipelines[model_name] = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

def objective(trial, model_name):
    params = {}
    if model_name == 'lasso':
        params['model__alpha'] = trial.suggest_float('model__alpha', 1e-3, 10.0)
    elif model_name == 'ridge':
        params['model__alpha'] = trial.suggest_float('model__alpha', 1e-3, 10.0)
    elif model_name == 'decision_tree':
        params['model__max_depth'] = trial.suggest_int('model__max_depth', 3, 20)
        params['model__min_samples_split'] = trial.suggest_int('model__min_samples_split', 2, 20)
        params['model__min_samples_leaf'] = trial.suggest_int('model__min_samples_leaf', 1, 20)

    model_pipeline = pipelines[model_name].set_params(**params)
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(model_pipeline, X_train, y_train, scoring='r2', cv=kf)
    return scores.mean()

# Hyperparameter tuning for each model
best_params = {}
for model_name in models.keys():
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, model_name), n_trials=20)
    best_params[model_name] = study.best_params

# Train and evaluate each model with the best parameters
best_model_name = None
best_model = None
best_r2 = -np.inf
best_mse = np.inf

for model_name, model in models.items():
    model_pipeline = pipelines[model_name].set_params(**best_params[model_name])
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f"{model_name} R2 Score: {r2:.4f}")
    print(f"{model_name} MSE: {mse:.4f}")

    if r2 > best_r2:
        best_r2 = r2
        best_mse = mse
        best_model_name = model_name
        best_model = model_pipeline

print(f"Best Model: {best_model_name}")
print(f"Best Model R2 Score: {best_r2:.4f}")
print(f"Best Model MSE: {best_mse:.4f}")



[I 2024-07-16 15:55:31,532] A new study created in memory with name: no-name-55c4c8f8-b541-4a10-86cf-116a6bef3e8d
[I 2024-07-16 15:55:32,347] Trial 0 finished with value: 0.4938539224801053 and parameters: {}. Best is trial 0 with value: 0.4938539224801053.
[I 2024-07-16 15:55:32,787] Trial 1 finished with value: 0.4938539224801053 and parameters: {}. Best is trial 0 with value: 0.4938539224801053.
[I 2024-07-16 15:55:33,310] Trial 2 finished with value: 0.4938539224801053 and parameters: {}. Best is trial 0 with value: 0.4938539224801053.
[I 2024-07-16 15:55:33,776] Trial 3 finished with value: 0.4938539224801053 and parameters: {}. Best is trial 0 with value: 0.4938539224801053.
[I 2024-07-16 15:55:34,247] Trial 4 finished with value: 0.4938539224801053 and parameters: {}. Best is trial 0 with value: 0.4938539224801053.
[I 2024-07-16 15:55:34,795] Trial 5 finished with value: 0.4938539224801053 and parameters: {}. Best is trial 0 with value: 0.4938539224801053.
[I 2024-07-16 15:55:35

linear_regression R2 Score: 0.4751
linear_regression MSE: 0.0168
lasso R2 Score: -0.0002
lasso MSE: 0.0321
ridge R2 Score: 0.4745
ridge MSE: 0.0169
decision_tree R2 Score: 0.5539
decision_tree MSE: 0.0143
Best Model: decision_tree
Best Model R2 Score: 0.5539
Best Model MSE: 0.0143


In [15]:
# Load new dataset for predictions
new_df = pd.read_csv(r'C:\Users\Lenovo\Downloads\pc1_test_without_response_variable_data_v1.csv')

# Handle missing values in the new dataset
new_df.fillna(new_df.mean(), inplace=True)

# Encode categorical features in the new dataset
for column in new_df.select_dtypes(include=['object']).columns:
    if column in label_encoders:
        new_df[column] = label_encoders[column].transform(new_df[column])

#To add new features 
new_df['avg_fuel_economy']=(new_df['city_fuel_economy']+new_df['highway_fuel_economy'])/2
new_df['age']=2024-new_df['year']
new_df['mileage_per_year']=(new_df['mileage']/new_df['age']).round()
new_df['horsepower_per_cc']=(new_df['horsepower']/new_df['engine_displacement']).round(4)
new_df['economy_per_cc']=(new_df['avg_fuel_economy']/new_df['engine_displacement']).round(4)

# Separate features
X_new = new_df

# Predict with the best model
predictions = best_model.predict(X_new)

# Output predictions
predicted_price = pd.DataFrame(predictions)
print(predicted_price)


              0
0      9.648689
1      9.331216
2      9.695336
3      9.375684
4      9.398536
...         ...
19995  9.660915
19996  9.805384
19997  9.755272
19998  9.596880
19999  9.363615

[20000 rows x 1 columns]


In [16]:
response=pd.read_csv(r'C:\Users\Lenovo\Downloads\pc1_test_with_response_variable_data_v1.csv')
response['price']=np.log(response['price'])
r2 = r2_score(predicted_price, response['price'])
mse = mean_squared_error(predicted_price, response['price'])
print(r2)
print(mse)

0.32724623650545503
0.01453250907290046


In [17]:
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import matplotlib.pyplot as plt

# # Setup logging
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the training data
df = pd.read_csv(r'C:\Users\Lenovo\Downloads\pc1_small_train_data_v1.csv')

# Handle missing values
imputer = SimpleImputer(strategy='mean')
df.fillna(df.mean(), inplace=True)

# Encode categorical features
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Separate features and target
X = df.drop(columns=['price'])
y = np.log(df['price'])  # Predict the natural log of the price

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features)
    ]
)

# Define models
models = {
    'linear_regression': LinearRegression(),
    'lasso': Lasso(),
    'ridge': Ridge(),
    'decision_tree': DecisionTreeRegressor()
}

# Create pipelines
pipelines = {}
for model_name, model in models.items():
    pipelines[model_name] = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

def objective(trial, model_name):
    params = {}
    if model_name == 'lasso':
        params['model__alpha'] = trial.suggest_float('model__alpha', 1e-3, 10.0)
    elif model_name == 'ridge':
        params['model__alpha'] = trial.suggest_float('model__alpha', 1e-3, 10.0)
    elif model_name == 'decision_tree':
        params['model__max_depth'] = trial.suggest_int('model__max_depth', 3, 20)
        params['model__min_samples_split'] = trial.suggest_int('model__min_samples_split', 2, 20)
        params['model__min_samples_leaf'] = trial.suggest_int('model__min_samples_leaf', 1, 20)

    model_pipeline = pipelines[model_name].set_params(**params)
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(model_pipeline, X_train, y_train, scoring='r2', cv=kf)
    return scores.mean()

# Hyperparameter tuning for each model
best_params = {}
for model_name in models.keys():
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, model_name), n_trials=20)
    best_params[model_name] = study.best_params

# Train and evaluate each model with the best parameters
best_model_name = None
best_model = None
best_r2 = -np.inf
best_mse = np.inf

for model_name, model in models.items():
    model_pipeline = pipelines[model_name].set_params(**best_params[model_name])
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f"{model_name} R2 Score: {r2:.4f}")
    print(f"{model_name} MSE: {mse:.4f}")

    if r2 > best_r2:
        best_r2 = r2
        best_mse = mse
        best_model_name = model_name
        best_model = model_pipeline

print(f"Best Model: {best_model_name}")
print(f"Best Model R2 Score: {best_r2:.4f}")
print(f"Best Model MSE: {best_mse:.4f}")



[I 2024-07-16 16:09:47,548] A new study created in memory with name: no-name-5f77cdc6-0cb4-4174-a6d6-4e17881316ab
[I 2024-07-16 16:09:48,058] Trial 0 finished with value: 0.4588067516999164 and parameters: {}. Best is trial 0 with value: 0.4588067516999164.
[I 2024-07-16 16:09:48,264] Trial 1 finished with value: 0.4588067516999164 and parameters: {}. Best is trial 0 with value: 0.4588067516999164.
[I 2024-07-16 16:09:48,478] Trial 2 finished with value: 0.4588067516999164 and parameters: {}. Best is trial 0 with value: 0.4588067516999164.
[I 2024-07-16 16:09:48,667] Trial 3 finished with value: 0.4588067516999164 and parameters: {}. Best is trial 0 with value: 0.4588067516999164.
[I 2024-07-16 16:09:48,871] Trial 4 finished with value: 0.4588067516999164 and parameters: {}. Best is trial 0 with value: 0.4588067516999164.
[I 2024-07-16 16:09:49,071] Trial 5 finished with value: 0.4588067516999164 and parameters: {}. Best is trial 0 with value: 0.4588067516999164.
[I 2024-07-16 16:09:49

linear_regression R2 Score: 0.4401
linear_regression MSE: 0.0180
lasso R2 Score: -0.0002
lasso MSE: 0.0321
ridge R2 Score: 0.4373
ridge MSE: 0.0181
decision_tree R2 Score: 0.5565
decision_tree MSE: 0.0142
Best Model: decision_tree
Best Model R2 Score: 0.5565
Best Model MSE: 0.0142
