In [24]:
# Import libraries
!pip install category_encoders
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
import numpy as np
import re
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
import os



In [25]:
# Read the training and test CSVs into DataFrames (paths point at /content)
train_df, test_df = map(pd.read_csv, ('/content/train.csv', '/content/test.csv'))

In [26]:
# Function to collapse rare categories of a column into a single 'Other' bucket
def limit_categories(df, column, n=5, categories=None):
    """Replace every value of ``df[column]`` outside a kept set with 'Other'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The column is overwritten in place and the same
        frame object is returned.
    column : str
        Name of the column to reduce.
    n : int, default 5
        Number of most frequent values to keep when ``categories`` is not
        given.
    categories : iterable, optional
        Explicit values to keep. Pass the set learned on one frame to apply
        the same vocabulary to another frame.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``column`` reduced.

    Notes
    -----
    Missing values are never in the kept set (``value_counts`` skips them),
    so they also become 'Other'.
    """
    if categories is None:
        # value_counts() sorts by descending frequency, so the first n index
        # entries are the n most common values.
        categories = df[column].value_counts().index[:n]

    keep_mask = df[column].isin(categories)
    # Vectorised replacement: keep the value where the mask is True.
    df[column] = df[column].where(keep_mask, 'Other')
    return df

In [27]:
# Collapse rare interior/exterior colours into 'Other' in both datasets.
# The kept colours are learned from the training data only and reused for the
# test data, so both frames share one vocabulary. Computing the top 5 on each
# frame separately could keep a colour in test that train had already folded
# into 'Other'.
for colour_col in ('int_col', 'ext_col'):
    # Must be computed before train_df is reduced, otherwise 'Other' itself
    # could appear among the most frequent values.
    kept_colours = train_df[colour_col].value_counts().index[:5]
    test_df[colour_col] = test_df[colour_col].where(
        test_df[colour_col].isin(kept_colours), 'Other'
    )
    train_df = limit_categories(train_df, colour_col)

In [None]:
# Extract the horsepower figure from the free-text engine column
def extract_engine_power(engine_str):
    """Return the horsepower found in an engine description as a float.

    The function looks for the first number (integer or decimal) immediately
    followed by the literal ``HP``, e.g. ``'172.0HP 1.6L ...'`` -> ``172.0``.

    Parameters
    ----------
    engine_str : object
        Engine description. Anything that is not a ``str`` (for example the
        float NaN pandas uses for a missing cell, or ``None``) is treated as
        having no horsepower information.

    Returns
    -------
    float
        The parsed horsepower, or ``0.0`` when none can be found.
    """
    # re.search raises TypeError on non-string input such as NaN or None.
    if not isinstance(engine_str, str):
        return 0.0

    match = re.search(r'(\d+(\.\d+)?)HP', engine_str)
    return float(match.group(1)) if match else 0.0

In [28]:
# Derive a numeric engine_power column from the engine text in both frames
for frame in (train_df, test_df):
    frame['engine_power'] = frame['engine'].map(extract_engine_power)

In [29]:
# The raw engine text is discarded now that engine_power has been derived
train_df = train_df.drop(columns=['engine'])
test_df = test_df.drop(columns=['engine'])

In [30]:
# Car age in years relative to the fixed reference year 2024
for frame in (train_df, test_df):
    # rsub(2024) computes 2024 - model_year element-wise
    frame['age'] = frame['model_year'].rsub(2024)

In [31]:
# Keep only training rows whose price lies within 1.5 * IQR of the quartiles
q1, q3 = (train_df['price'].quantile(level) for level in (0.25, 0.75))
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr
# between() is inclusive on both ends by default
train_df = train_df[train_df['price'].between(lower_bound, upper_bound)]

In [32]:
# Split the training frame into the feature matrix and the price target.
# NOTE(review): every remaining column is kept as a feature; if the frame
# carries a row identifier such as 'id', it is used as a feature too - confirm
# that is intended.
X_train_full = train_df.drop('price', axis=1)
y = train_df.loc[:, 'price']

In [33]:
# One-hot encode the categorical columns. The encoder is fitted on the
# training features only and then applied unchanged to the test frame.
categorical_features = ['brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
encoder = ce.OneHotEncoder(
    cols=categorical_features,
    # category_encoders accepts 'error', 'return_nan', 'value' or 'indicator'
    # here; 'ignore' is a scikit-learn option. 'value' encodes a category
    # unseen during fit as 0 in every dummy column.
    handle_unknown='value',
    use_cat_names=True,  # name dummy columns <column>_<category>
)
X_train_encoded = encoder.fit_transform(X_train_full)
X_test_encoded = encoder.transform(test_df)

In [34]:
# Ensure the test set has the same columns as the train set
train_columns = set(X_train_encoded.columns)
test_columns = set(X_test_encoded.columns)

missing_train_cols = test_columns - train_columns  # present only in test
missing_test_cols = train_columns - test_columns   # present only in train

# Iterate in sorted order: the iteration order of a set of strings can change
# between interpreter runs, which would change the order of the columns
# appended to the training matrix.
for col in sorted(missing_train_cols):
    X_train_encoded[col] = 0
for col in sorted(missing_test_cols):
    X_test_encoded[col] = 0

In [35]:
# Put the test columns in exactly the same order as the training columns
X_test_encoded = X_test_encoded.loc[:, X_train_encoded.columns]

In [36]:
# Scale the numeric columns to zero mean and unit variance. The scaler is
# fitted on the training matrix, then applied to both matrices.
# NOTE(review): the fit happens before the train/validation split in a later
# cell, so validation rows contribute to the scaling statistics.
numerical_features = ['model_year', 'milage', 'engine_power', 'age']
scaler = StandardScaler()
scaler.fit(X_train_encoded[numerical_features])
for matrix in (X_train_encoded, X_test_encoded):
    matrix[numerical_features] = scaler.transform(matrix[numerical_features])

In [37]:
# Hold out 20% of the encoded training rows for validation (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_encoded,
    y,
    random_state=42,
    test_size=0.2,
)

In [38]:
# Shuffled 3-fold splitter with a fixed seed, used by the grid search
kf = KFold(
    shuffle=True,
    random_state=42,
    n_splits=3,
)

In [39]:
# Base XGBoost regressor, constructed with no arguments; the grid search in a
# later cell receives it as its estimator and supplies the tuned values.
xgb_model = xgb.XGBRegressor()

In [40]:
# Candidate values for each tuned hyperparameter: 2 options for each of the
# 5 parameters gives 32 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 5],
    subsample=[0.7, 0.8],
    colsample_bytree=[0.7, 0.8],
)

In [None]:
# Exhaustive search over param_grid, scored by negative MSE on the kf folds
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=-1,   # use every available core
    verbose=2,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


  pid = os.fork()


In [None]:
# Take the estimator that the grid search exposes as its best candidate
best_xgb_model = grid_search.best_estimator_

In [None]:
# Score the selected model on the held-out validation rows
val_predictions = best_xgb_model.predict(X_val)
mse, r2 = mean_squared_error(y_val, val_predictions), r2_score(y_val, val_predictions)

for report_line in (f'MSE: {mse}', f'R2 Score: {r2}'):
    print(report_line)

In [None]:
# Predict prices for the encoded test matrix
test_predictions = best_xgb_model.predict(X_test_encoded)

# Pair each test id with its predicted price (columns: id, price)
submission_df = test_df[['id']].assign(price=test_predictions.flatten())

# Write the submission without the index column.
# NOTE(review): the file name mentions Keras, but the predictions above come
# from the XGBoost model.
submission_df.to_csv('Test_Predictions_Keras_Final_Best.csv', index=False)