In [None]:
#Import all the necessary packages
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the dataset. The path is relative, so the notebook must be run from a
# directory where ../input/car-price-prediction/ exists.
data = pd.read_csv("../input/car-price-prediction/CarPrice_Assignment.csv")

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics of the columns.
data.describe()

In [None]:
# Check whether the data has any null values (count of nulls per column).
data.isnull().sum()

In [None]:
# Number of distinct values in each column.
data.nunique()

In [None]:
# Drop car_ID and CarName so they are not used in the analysis below.
# Re-assigning with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
data = data.drop(columns=['car_ID', 'CarName'], errors='ignore')

In [None]:
# Pairwise correlation between the numeric columns.
# The frame still contains string (object) columns at this point, and
# DataFrame.corr() raises on those in pandas >= 2.0. Selecting numeric dtypes
# explicitly works on every pandas version (the numeric_only= keyword does not
# exist on older releases).
data_corr = data.select_dtypes(include='number').corr()

# A heatmap is easier to read than the raw correlation matrix.
fig, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(data_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation between numeric columns')
plt.show()

In [None]:
# Partition the column names by dtype: object columns are treated as
# categorical, everything else as numeric.
column_dtypes = data.dtypes
cat_cols = [name for name, kind in column_dtypes.items() if kind == 'object']
num_cols = [name for name, kind in column_dtypes.items() if kind != 'object']

# 'price' is the prediction target, so it must not appear among the features.
num_cols.remove('price')


In [None]:
# Preview the frame again now that car_ID and CarName have been dropped.
data.head()

In [None]:
# Histogram of price, with the bars coloured by fuel type.
sns.histplot(data=data, x="price", hue="fueltype")

In [None]:
# For every categorical column draw two panels side by side:
#   left  - number of rows in each category,
#   right - distribution of price within each category.
# The grid is sized from cat_cols instead of a hard-coded 10 rows, so no blank
# panels are left over and the cell keeps working if the column list changes.
# The per-row height (10 in) matches the original 100 in / 10 rows layout.
n_rows = max(len(cat_cols), 1)
fig, axes = plt.subplots(n_rows, 2, figsize=(30, 10 * n_rows), squeeze=False)

for (count_ax, box_ax), col in zip(axes, cat_cols):
    # Pass the column as x= explicitly: handing it over positionally is
    # deprecated in seaborn 0.11 and is interpreted as `data` from 0.12 on.
    sns.countplot(x=data[col], ax=count_ax)
    count_ax.set_xlabel(col, fontsize=15)
    count_ax.set_ylabel('count', fontsize=15)

    sns.boxplot(x=data[col], y=data['price'], ax=box_ax)
    box_ax.set_xlabel(col, fontsize=15)
    box_ax.set_ylabel('price', fontsize=15)

    # Category labels can be long, so rotate them to avoid overlap.
    for ax in (count_ax, box_ax):
        ax.tick_params(axis='x', labelrotation=90, labelsize=15)
        ax.tick_params(axis='y', labelsize=15)

plt.show()

In [None]:
# For every numeric feature draw two panels side by side:
#   left  - distribution of the feature (histogram + KDE),
#   right - scatter of the feature against price.
# The grid is sized from num_cols instead of a hard-coded 16 rows; the per-row
# height (6.25 in) matches the original 100 in / 16 rows layout.
n_rows = max(len(num_cols), 1)
fig, axes = plt.subplots(n_rows, 2, figsize=(25, 6.25 * n_rows), squeeze=False)

for (dist_ax, scatter_ax), col in zip(axes, num_cols):
    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # is the replacement that draws the same histogram-plus-density picture.
    sns.histplot(x=data[col], kde=True, stat='density', ax=dist_ax)
    dist_ax.set_xlabel(col, fontsize=15)
    dist_ax.tick_params(axis='x', labelsize=10)

    sns.scatterplot(x=data[col], y=data['price'], ax=scatter_ax)
    scatter_ax.set_xlabel(col, fontsize=15)
    scatter_ax.set_ylabel('price', fontsize=15)
    scatter_ax.tick_params(axis='both', labelsize=10)

plt.show()

In [None]:
from scipy import stats

# Pearson correlation of every numeric feature with price, together with its
# p-value; a p-value below 0.05 is reported as significant.
for feature in num_cols:
    pearson_coeff, p_value = stats.pearsonr(data[feature], data['price'])
    verdict = 'Correlation is Significant' if p_value<0.05 else 'Correlation is Insignificant'

    print(feature.capitalize())
    print(f'Pearson Co-relation: {pearson_coeff}')
    print(f'P-Value: {p_value}')
    print(verdict)
    print('')

## **Data Preprocessing**

In [None]:
# Remove 'symboling' from both the frame and the numeric-feature list
# (presumably because its correlation with price was weak - TODO confirm).
# Both steps are guarded so the cell can be re-run: the original inplace drop
# raised KeyError and list.remove raised ValueError on a second execution.
data = data.drop(columns='symboling', errors='ignore')
if 'symboling' in num_cols:
    num_cols.remove('symboling')

In [None]:
# Preview the frame after 'symboling' was removed.
data.head()

In [None]:
# Names of the object-dtype columns; these are label-encoded in the next step.
cat_cols

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Replace every categorical column with integer codes.
# The single encoder is re-fitted for each column in turn, so once the loop
# finishes it only remembers the classes of the last column.
for categorical_col in cat_cols:
    data[categorical_col] = le.fit_transform(data[categorical_col])

In [None]:
# Preview the frame after label encoding of the categorical columns.
data.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric feature columns in place; 'price' is not part of
# num_cols, so the target keeps its original scale.
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split, so test rows contribute to the mean/std used for training.
ss = StandardScaler()
ss.fit(data[num_cols])
data[num_cols] = ss.transform(data[num_cols])

In [None]:
# Preview the frame after the numeric feature columns were standardised.
data.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Separate the target ('price') from the predictor columns.
y = data['price']
X = data.drop(columns='price')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
def regression(X_train,X_test,y_train,y_test ):
    """Fit six regressors on the training split and print their metrics.

    The models are fitted in this order: LinearRegression,
    DecisionTreeRegressor, RandomForestRegressor, SVR (RBF kernel),
    XGBRegressor and AdaBoostRegressor. For each one the function prints the
    RMSE on the training split (except for linear regression, which only
    reports test metrics), the RMSE on the test split and the R2 score on the
    test split.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Target values matching ``X_train`` and ``X_test``.

    Returns
    -------
    None
        Results are only printed.
    """

    def _fit_and_score(model):
        """Fit ``model`` and return (train RMSE, test RMSE, test R2).

        Every metric is computed from this model's own predictions. The
        original code skipped ``predict`` for SVR and AdaBoost and therefore
        printed the numbers of the previously fitted model.
        """
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse_train = mean_squared_error(y_train, model.predict(X_train)) ** (1/2)
        rmse_test = mean_squared_error(y_test, y_pred) ** (1/2)
        return rmse_train, rmse_test, r2_score(y_test, y_pred)

    _, rmse, r2 = _fit_and_score(LinearRegression())
    print("Linear Regression RMSE :" , rmse)
    print("Linear Regression R2 :" ,r2)
    print("\n")

    # criterion is left at its default (squared error): the old spelling 'mse'
    # was removed in scikit-learn 1.2, and the new spelling 'squared_error'
    # does not exist before 1.0, so omitting it is the portable choice.
    dt = DecisionTreeRegressor(splitter='best', max_depth=100, min_samples_leaf=5, random_state=42)
    rmse_train, rmse, r2 = _fit_and_score(dt)
    print("Decision Tree Regressor RMSE train :",rmse_train)
    print("Decision Tree Regressor RMSE :",rmse)
    print("Decision Tree Regressor R2 :",r2)
    print("\n")

    # Same remark about criterion as for the decision tree above.
    rf = RandomForestRegressor(max_depth=10, min_samples_leaf=2, random_state=42, verbose=1)
    rmse_train, rmse, r2 = _fit_and_score(rf)
    print("Random Forest Regressor RMSE train :",rmse_train)
    print("Random Forest Regressor RMSE :",rmse)
    print("Random Forest Regressor R2 :",r2)
    print("\n")

    rmse_train, rmse, r2 = _fit_and_score(SVR(kernel='rbf'))
    print("Support Vector Regressor RMSE train :",rmse_train)
    print("Support Vector Regressor RMSE :",rmse)
    print("Support Vector Regressor R2 :",r2)
    print("\n")

    rmse_train, rmse, r2 = _fit_and_score(XGBRegressor())
    print("XG Regressor RMSE train :",rmse_train)
    print("XG Regressor RMSE  :",rmse)
    print("XG Regressor R2 :",r2)
    print("\n")

    # Seeded like the tree models so repeated runs print the same numbers.
    rmse_train, rmse, r2 = _fit_and_score(AdaBoostRegressor(random_state=42))
    print("Ada Boost Regressor RMSE train :",rmse_train)
    print("Ada Boost Regressor RMSE  :",rmse)
    print("Ada Boost Regressor R2 :",r2)
    
    

In [None]:
# Fit and evaluate all models on the train/test split; regression() prints the metrics.
regression(X_train,X_test,y_train,y_test)