In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import mean_squared_error

In [None]:
# Read the training and test tables from the working directory
train_df, test_df = pd.read_csv('train.csv'), pd.read_csv('test.csv')

In [None]:
# Inspect the data
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after each summary.
train_df.info()
test_df.info()

In [None]:
# Feature Engineering
# Working assumption: 'price' is the target and 'id' is a row identifier.

# Placeholder list of columns to discard -- replace with the actual column names.
columns_to_drop = ['irrelevant_column1', 'irrelevant_column2']

# errors='ignore' skips any listed name that a frame does not contain.
train_df = train_df.drop(labels=columns_to_drop, axis='columns', errors='ignore')
test_df = test_df.drop(labels=columns_to_drop, axis='columns', errors='ignore')

In [None]:
# Transform 'engine' and 'transmission' columns
def transform_engine(engine):
    """Collapse a free-text engine description into a coarse shape label.

    Parameters
    ----------
    engine : str or any
        Raw engine description. Non-string values (e.g. NaN or None coming
        from an empty cell) are accepted and treated as unrecognised.

    Returns
    -------
    str
        'V6', 'V8' or 'Straight' when that substring occurs in ``engine``
        (checked in that order, case-sensitively), otherwise 'Other'.
    """
    # A missing value is a float/None, and `'V6' in <float>` raises TypeError,
    # so anything that is not text goes straight to the catch-all label.
    if not isinstance(engine, str):
        return 'Other'
    # Order matters: the first matching label wins, as in the original chain.
    for shape in ('V6', 'V8', 'Straight'):
        if shape in engine:
            return shape
    return 'Other'

def transform_transmission(transmission):
    """Reduce a free-text transmission description to a two-way label.

    Parameters
    ----------
    transmission : str or any
        Raw transmission description. Non-string values (e.g. NaN or None
        coming from an empty cell) are accepted.

    Returns
    -------
    str
        'Automatic' when the text contains the substring 'Automatic'
        (case-sensitive); 'Manual' for everything else, including
        non-string input.
    """
    # The isinstance guard avoids the TypeError that `'Automatic' in <float>`
    # raises for missing values; they fall into the existing catch-all label.
    if isinstance(transmission, str) and 'Automatic' in transmission:
        return 'Automatic'
    return 'Manual'

In [None]:
# Derive the simplified engine / transmission categories for the training rows
train_df['engine_shape'], train_df['transmission_type'] = (
    train_df['engine'].apply(transform_engine),
    train_df['transmission'].apply(transform_transmission),
)

In [None]:
# Derive the same simplified categories for the test rows
test_df['engine_shape'], test_df['transmission_type'] = (
    test_df['engine'].apply(transform_engine),
    test_df['transmission'].apply(transform_transmission),
)


In [None]:
# The raw text columns have been replaced by 'engine_shape' and
# 'transmission_type', so remove them from both frames
train_df = train_df.drop(columns=['engine', 'transmission'])
test_df = test_df.drop(columns=['engine', 'transmission'])


In [None]:
# Split the training frame into target (y) and predictors (X);
# 'id' is excluded from both feature matrices
y = train_df['price']
X = train_df.drop(columns=['price', 'id'])
X_test = test_df.drop(columns=['id'])

In [None]:
# Preprocessing
# Split the feature columns by dtype so each group gets its own transformer.
# 'number' selects every numeric dtype (e.g. int32 or float32 as well as
# int64/float64); a column that appears in neither list is not passed to the
# ColumnTransformer built from these lists.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object']).columns

In [None]:
# Numeric columns: fill missing values with the column mean, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fitting are ignored instead
# of raising an error
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# Models
models = {
    "Linear Regression": Pipeline(steps=[('preprocessor', preprocessor),
                                         ('regressor', LinearRegression())]),
    "Polynomial Regression": Pipeline(steps=[('preprocessor', preprocessor),
                                             ('poly', PolynomialFeatures(degree=2)),
                                             ('regressor', LinearRegression())]),
    "SVR": Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', SVR())]),
    "Random Forest": Pipeline(steps=[('preprocessor', preprocessor),
                                     ('regressor', RandomForestRegressor())]),
    "XGBoost": Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', XGBRegressor(objective='reg:squarederror'))])
}

In [None]:
# Score every candidate with 5-fold cross-validation.
# The scorer returns negated RMSE values, so the mean is negated back to get
# a positive RMSE (lower is better).
results = {}
for name, model in models.items():
    fold_scores = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=5)
    rmse = -np.mean(fold_scores)
    results[name] = rmse
    print(f'{name}: RMSE = {rmse}')

In [None]:

# Pick the candidate with the lowest cross-validated RMSE
best_model_name = min(results.items(), key=lambda item: item[1])[0]
best_model = models[best_model_name]
print(f'Best model: {best_model_name}')


In [None]:
# Train the best model on the entire training set
# best_model is one of the Pipeline objects from `models`, so this single call
# fits the preprocessing steps and the regressor together on all of X and y.
best_model.fit(X, y)


In [None]:

# Predict on test set
# X_test goes through the same fitted pipeline, so it is preprocessed with the
# transformers fitted on the training data before the regressor predicts.
predictions = best_model.predict(X_test)

In [None]:
# Build the submission table (test ids alongside their predicted prices)
# and write it to disk without the index column
submission = pd.DataFrame(
    data={
        'id': test_df['id'],
        'price': predictions,
    }
)
submission.to_csv('testing.csv', index=False)
print('Submission file created successfully.')
