In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Show every column when a DataFrame is displayed. The fully qualified option
# name is required: the short pattern 'max_columns' matches more than one
# registered option in newer pandas releases and raises OptionError.
pd.set_option('display.max_columns', None)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# **Exploratory Data Analysis**

In [None]:
# Read the raw CSV (path is relative to the notebook's working directory).
data = pd.read_csv("../input/cardataset/data.csv")
# Bare last expression so the notebook renders the frame as the cell output.
data

In [None]:
# Rows that exactly repeat an earlier row (`duplicated()` marks every
# occurrence after the first).
duplicate_rows_df = data[data.duplicated()]
# Report the row count itself rather than the (rows, columns) shape tuple,
# so the printed value matches its label.
print("number of duplicate rows:", duplicate_rows_df.shape[0])

In [None]:
# Non-null count per column, taken before duplicates are removed.
data.count()

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# NOTE: this rebinds `data`, so the unfiltered frame is no longer available
# to later cells.
data = data.drop_duplicates()
# Preview the first five rows of the de-duplicated frame.
data.head(5)

In [None]:
# Non-null count per column after duplicate removal; columns whose count is
# below the others contain missing values.
data.count()

In [None]:
# Drop every row that has a missing value in any column.
# NOTE: this rebinds `data` again; rows removed here are not recoverable later.
data = data.dropna()
# After dropna every column should report the same non-null count.
data.count()

In [None]:
# Number of missing values per column.
print(data.isnull().sum())  

In [None]:
# Box plot of 'Engine HP' to inspect its spread and outliers visually.
sns.boxplot( x=data['Engine HP'] )

In [None]:
# Box plot of 'Engine Cylinders' to inspect its spread and outliers visually.
sns.boxplot( x=data['Engine Cylinders'] )

In [None]:
# First and third quartiles of every numeric column. `numeric_only=True` is
# passed explicitly because `data` also holds string columns, for which a
# quantile is undefined and which recent pandas versions refuse to skip
# silently.
Q1 = data.quantile(0.25, numeric_only=True)
Q3 = data.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1  # interquartile range, one value per numeric column
print(IQR)

In [None]:
# Bounds 1.5 * IQR below the first quartile and above the third quartile,
# one pair per column present in Q1/Q3/IQR.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is an outlier if any of its values lies outside those bounds.
is_outlier_row = ((data < lower_bound) | (data > upper_bound)).any(axis=1)

data = data[~is_outlier_row]
data.shape

In [None]:
# Bar chart of the 40 most frequent makes.
make_counts = data['Make'].value_counts().nlargest(40)
ax = make_counts.plot(kind='bar', figsize=(10, 5))
ax.set_title("Number of cars by make")
ax.set_ylabel('Number of cars')
ax.set_xlabel('Make');

In [None]:
plt.figure(figsize=(10,5))
# Correlate numeric columns only: `data` still contains string columns, and
# recent pandas versions raise instead of silently skipping them in `corr()`.
c = data.select_dtypes('number').corr()
# Annotated heatmap of the pairwise correlation matrix.
sns.heatmap(c,cmap='BrBG', annot= True)
# Also show the matrix itself as the cell output.
c

In [None]:
# Scatter plot of price (MSRP) against engine horsepower.
horsepower = data['Engine HP']
price = data['MSRP']

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(horsepower, price)
ax.set(xlabel='HP', ylabel='Price')
plt.show()

In [None]:
# One box plot per column, each on its own figure, drawn from the current
# (filtered) `data`.
cols_new = ['Engine HP', 'Engine Cylinders']
for column in cols_new:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(x=data[column], ax=ax)
    ax.set_title(f'Boxplot for {column}')
    plt.show()

In [None]:
# Preview the first five rows of the filtered frame.
data.head(5)

In [None]:
# Ten rows with the lowest 'city mpg', restricted to a few descriptive columns.
summary_columns = ['Make','Model','Year','highway MPG','city mpg','MSRP']
labels_by_city_mpg = data['city mpg'].sort_values(ascending=True).index
data.loc[labels_by_city_mpg, summary_columns].head(10)

In [None]:
# Column dtypes and non-null counts; the object-dtype columns listed here are
# the ones the encoding helpers below operate on.
data.info()

In [None]:
def multihot_encode(df, column):
    """Replace a comma-separated multi-label column with 0/1 indicator columns.

    Each value of ``df[column]`` is split on ``','``. For every distinct label
    found, a new integer column named ``f"{column}_{label}"`` is appended,
    holding 1 where the row contains that label and 0 otherwise. The original
    column is then dropped.

    Args:
        df: Input DataFrame. It is copied, so the caller's frame is untouched.
        column: Name of the column to encode. Its values must be strings;
            fill missing values before calling.

    Returns:
        A new DataFrame with ``column`` removed and the indicator columns
        appended in sorted label order. An empty input simply yields the
        frame without ``column``.
    """
    df = df.copy()

    # One list of labels per row.
    labels_per_row = df[column].apply(lambda value: value.split(','))

    # Collect distinct labels with a set union. Summing the lists
    # (`Series.sum()`) re-copies the growing list for every row and returns
    # the integer 0 for an empty column, which then breaks name construction.
    all_categories = sorted(set().union(*labels_per_row))

    for category in all_categories:
        # Element-wise test on the one Series instead of a row-wise
        # DataFrame.apply, which builds a full row object per record.
        df[column + '_' + category] = labels_per_row.apply(
            lambda labels: int(category in labels)
        )

    df = df.drop(column, axis=1)

    return df

In [None]:
def onehot_encode(df, column):
    """Replace a categorical column with one indicator column per category.

    Args:
        df: Input DataFrame; it is not modified.
        column: Name of the column to encode.

    Returns:
        A new DataFrame holding every other column of ``df`` followed by the
        indicator columns from ``pd.get_dummies``, named with ``column`` as
        the prefix.
    """
    indicator_columns = pd.get_dummies(df[column], prefix=column)
    other_columns = df.drop(columns=column)
    return pd.concat([other_columns, indicator_columns], axis=1)

In [None]:
def preprocess_inputs(df):
    """Encode, split, impute and scale the data for modelling.

    Steps, in order:
      1. Fill missing 'Market Category' values with the literal "Missing" and
         multi-hot encode that column.
      2. One-hot encode every remaining object-dtype column.
      3. Separate the target 'MSRP' from the features.
      4. Make a shuffled 70/30 train/test split (random_state=1).
      5. Impute missing 'Engine HP' with the mean, and missing
         'Engine Cylinders' / 'Number of Doors' with the mode.
      6. Standardize the features with ``StandardScaler``.

    The imputation values and the scaler are derived from the training rows
    only and then applied to both splits, so nothing about the test rows
    influences the preprocessing.

    Args:
        df: DataFrame containing at least the columns 'Market Category',
            'Engine HP', 'Engine Cylinders', 'Number of Doors' and 'MSRP'.
            It is copied and not modified.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The X frames keep the
        original row index and the encoded column names; the y Series are the
        unscaled 'MSRP' values.
    """
    df = df.copy()

    # Fill multi-hot column missing values so the split on ',' sees strings only
    df['Market Category'] = df['Market Category'].fillna("Missing")

    # Multi-hot encoding
    df = multihot_encode(df, column='Market Category')

    # One-hot encoding of every remaining string column
    for column in df.select_dtypes('object').columns:
        df = onehot_encode(df, column=column)

    # Split df into X and y
    y = df['MSRP']
    X = df.drop('MSRP', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # Fill remaining missing values using statistics of the training rows
    # only; computing them before the split would leak test information.
    fill_values = {'Engine HP': X_train['Engine HP'].mean()}
    for column in ['Engine Cylinders', 'Number of Doors']:
        fill_values[column] = X_train[column].mode()[0]
    X_train = X_train.fillna(fill_values)
    X_test = X_test.fillna(fill_values)

    # Scale X: fit on the training split, apply the same transform to both
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the scaled train/test feature frames and the matching target Series
# from the cleaned `data`.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Display the scaled training features returned by preprocess_inputs.
X_train

In [None]:
# Display the training target values ('MSRP', not scaled).
y_train

In [None]:
# Keep at most 100 principal components. PCA cannot extract more components
# than min(n_samples, n_features), so cap the request at the training shape.
n_components = min(100, *X_train.shape)

# Fixed seed: depending on the input size, PCA's default solver choice may be
# a randomized SVD, whose output otherwise changes from run to run.
pca = PCA(n_components=n_components, random_state=1)
# Fit on the training split only; the same projection is applied to both.
pca.fit(X_train)

# Column labels PC1..PCn, shared by the train and test frames.
component_names = ["PC" + str(i) for i in range(1, n_components + 1)]

X_train_reduced = pd.DataFrame(pca.transform(X_train), index=X_train.index, columns=component_names)
X_test_reduced = pd.DataFrame(pca.transform(X_test), index=X_test.index, columns=component_names)

In [None]:
# Display the training features projected onto the principal components.
X_train_reduced

In [None]:
# Seed shared by every estimator that uses randomness, so repeated runs give
# the same fitted models and scores. It matches the seed of the train/test
# split.
RANDOM_STATE = 1

# Candidate regressors with default hyperparameters. The keys are left-padded
# to equal width so the printed reports line up.
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(random_state=RANDOM_STATE),
    "Support Vector Machine (Linear Kernel)": LinearSVR(random_state=RANDOM_STATE),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "                         Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "                     Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_STATE)
}

# Fit every model on the PCA-reduced training features.
for name, model in models.items():
    model.fit(X_train_reduced, y_train)
    print(name + " trained.")

In [None]:
# Report each fitted model's R^2 on the held-out, PCA-reduced test split.
for name, model in models.items():
    r2 = model.score(X_test_reduced, y_test)
    print(f"{name} R^2 Score: {r2:.5f}")