In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

        
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation warnings from numpy/pandas/scikit-learn that flag code
# which will break on newer library versions.
warnings.filterwarnings('ignore')

In [None]:
# Load the car-price CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/car-price-prediction/CarPrice_Assignment.csv'
)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Print each column's dtype and non-null count.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Distinct engine-type labels, inspected before they are encoded as integers.
df['enginetype'].unique()

In [None]:
# Distinct cylinder-count labels, inspected before they are converted to numbers.
df['cylindernumber'].unique()

In [None]:
# How many rows carry each engine-type label.
df['enginetype'].value_counts()

In [None]:
# Replace the spelled-out cylinder counts with their integer values so the
# column can be used as a numeric feature. Labels missing from the dict become NaN.
df['cylindernumber'] = df['cylindernumber'].map({
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'eight': 8,
    'twelve': 12,
})

In [None]:
# Check the column's dtype after mapping the cylinder words to numbers.
df['cylindernumber'].dtype

In [None]:
# Encode each engine-type label as an integer code.
# Series.map turns any label that is not a key of this dict into NaN. The key
# was previously misspelled 'rotar', so rotary-engine rows (labelled 'rotor' in
# the CSV -- confirm with df['enginetype'].unique()) silently became NaN.
df['enginetype']=df['enginetype'].map({
    'ohc':1,
    'ohcf':2,
    'ohcv':3,
    'dohc':4,
    'l':5,
    'rotor':6,
    'dohcv':7,
})

In [None]:
# Price distribution for each cylinder count.
sns.boxplot(x='cylindernumber', y='price', data=df)

In [None]:
# Price distribution for each engine location.
sns.boxplot(x='enginelocation', y='price', data=df)

In [None]:
# Price distribution for each body style.
sns.boxplot(x='carbody', y='price', data=df)

In [None]:
# Scatter of price against compression ratio; the strings are looked up as
# column names in `df` via the `data` argument.
plt.scatter('compressionratio', 'price', data=df)

In [None]:
# Show only the first row of the frame.
df.head(1)

In [None]:
# Price distribution for each value of the symboling column.
sns.boxplot(x='symboling', y='price', data=df)

In [None]:
# Curb weight against price, with points coloured by number of doors.
sns.scatterplot(
    data=df,
    x='price',
    y='curbweight',
    hue='doornumber',
)

In [None]:
# Enlarge figures so the annotated correlation matrix stays legible.
plt.rcParams['figure.figsize']=(12,12)
# Correlate the numeric columns only. The frame still contains text columns at
# this point, and pandas >= 2.0 raises on them instead of silently skipping them.
corr=df.select_dtypes(include='number').corr()
sns.heatmap(corr,fmt='.2f',annot=True,cmap=plt.cm.Blues)

In [None]:
# Absolute pairwise correlations between the numeric columns.
# Text columns are excluded explicitly because pandas >= 2.0 raises on them
# rather than dropping them automatically.
df_corr=df.select_dtypes(include='number').corr().abs()
df_corr



In [None]:
# Keep only the strict upper triangle of the correlation matrix (k=1 excludes
# the diagonal) so each column pair appears once; everything else becomes NaN.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper=df_corr.where(np.triu(np.ones(df_corr.shape),k=1).astype(bool))
upper


In [None]:
# Collect every column whose correlation with some earlier column exceeds 0.95
# in the upper-triangle matrix; NaN entries compare as False and are ignored.
to_drop = [
    col
    for col in upper.columns
    if (upper[col] > 0.95).any()
]
print('----------------------------')
print(to_drop)



In [None]:
# Build the modelling frame without the highly correlated columns, then show
# which columns remain.
df1 = df.drop(columns=to_drop)
df1.columns

In [None]:
# Remove the car_ID column from the modelling frame.
df1 = df1.drop(columns='car_ID')

In [None]:
# Print each remaining column's dtype and non-null count.
df1.info()

In [None]:
# Number of distinct values in each remaining column.
df1.nunique()

In [None]:
# One-hot encode the categorical columns; each category becomes its own column
# named '<column>_<category>'.
dummies = pd.get_dummies(
    df1[[
        'fueltype',
        'aspiration',
        'doornumber',
        'carbody',
        'drivewheel',
        'enginelocation',
        'fuelsystem',
    ]]
)

In [None]:
# Preview the first two rows of the one-hot encoded columns.
dummies.head(2)

In [None]:
# Append the one-hot columns to the modelling frame side by side.
df1 = pd.concat(objs=[df1, dummies], axis='columns')

In [None]:
# Drop the original categorical columns now that their one-hot encoded
# versions are part of the frame.
df1 = df1.drop(
    columns=[
        'fueltype',
        'aspiration',
        'doornumber',
        'carbody',
        'drivewheel',
        'enginelocation',
        'fuelsystem',
    ]
)

In [None]:
# Show the rows whose enginetype is missing.
df1.loc[df1['enginetype'].isna()]

In [None]:
# Fill missing enginetype values with code 1, noted here as the most common
# engine type. NaN appears when a label is absent from the dict passed to .map.
# The integer 1 is used instead of the string '1' so the column stays numeric
# rather than being coerced to a mixed object column.
df1['enginetype']=df1['enginetype'].fillna(1)

In [None]:
# Re-run the null check on enginetype after the fill.
df1[df1['enginetype'].isnull()]

In [None]:
# Store the engine-type codes as integers.
df1['enginetype'] = df1['enginetype'].astype(int)

In [None]:
# Print the dtypes and non-null counts of the finished modelling frame.
df1.info()

In [None]:
# Separate the target (price) from the feature columns.
X = df1.drop(columns='price')
y = df1['price']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

In [None]:
# CarName is free text and cannot be fed to the estimators, so remove it from
# both feature sets.
X_train = X_train.drop(columns='CarName')
X_test = X_test.drop(columns='CarName')

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error


In [None]:
# Fit an ordinary least-squares model on the training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
# R^2 of the linear model on the data it was trained on.
lr.score(X_train,y_train)

In [None]:
# Evaluate the linear model on the held-out test split.
lr_pred=lr.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). MSE and MAE are symmetric, but
# R^2 is not: with the arguments swapped it is normalised by the variance of
# the predictions instead of the variance of the true prices.
print('MSE:',mean_squared_error(y_test,lr_pred))
print('MAE:',mean_absolute_error(y_test,lr_pred))
print('r2_score:',r2_score(y_test,lr_pred))

In [None]:
# Put actual and predicted test prices side by side, indexed like y_test.
prediction = pd.DataFrame(
    data={
        'Actual': y_test,
        'Predicted': lr_pred,
    }
)

In [None]:
# Show the first ten actual/predicted pairs.
prediction.head(10)

In [None]:
# Fitted coefficients of the linear model, one per feature column.
lr.coef_

# Support Vector Regression (SVR)

In [None]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit a support vector regressor with default settings on the unscaled features.
svc = SVR()
svc.fit(X=X_train, y=y_train)

In [None]:
# R^2 of the unscaled SVR on the training data.
svc.score(X_train,y_train)

In [None]:
# Evaluate the unscaled SVR on the held-out test split.
svc_pred=svc.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). MSE and MAE are symmetric, but
# R^2 is not, so the true prices must be passed first.
print('MSE:',mean_squared_error(y_test,svc_pred))
print('MAE:',mean_absolute_error(y_test,svc_pred))
print('r2_score:',r2_score(y_test,svc_pred))

### Scaling the data before training the SVR

In [None]:
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit a second support vector regressor, this time on the standardised features.
svr = SVR()
svr.fit(X=X_train_scaled, y=y_train)

In [None]:
# R^2 of the scaled-feature SVR on the training data.
svr.score(X_train_scaled,y_train)

In [None]:
# Predict test-set prices with the SVR trained on standardised features.
svr_pred=svr.predict(X_test_scaled)


In [None]:
# Test-set metrics for the scaled-feature SVR.
# scikit-learn metrics take (y_true, y_pred). MSE and MAE are symmetric, but
# R^2 is not, so the true prices must be passed first.
print('MSE:',mean_squared_error(y_test,svr_pred))
print('MAE:',mean_absolute_error(y_test,svr_pred))
print('r2_score:',r2_score(y_test,svr_pred))

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a random forest on the training split.
# A fixed random_state (the same seed as the train/test split) makes the
# bootstrap samples and feature choices repeatable, so the scores below do not
# change from one run of the notebook to the next.
rf=RandomForestRegressor(random_state=1234)
rf.fit(X_train,y_train)

In [None]:
# R^2 of the random forest on the training data.
rf.score(X_train,y_train)

In [None]:
# Evaluate the random forest on the held-out test split.
rf_pred=rf.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). MSE and MAE are symmetric, but
# R^2 is not, so the true prices must be passed first.
print('MSE:',mean_squared_error(y_test,rf_pred))
print('MAE:',mean_absolute_error(y_test,rf_pred))
print('r2_score:',r2_score(y_test,rf_pred))

In [None]:
# Put actual and random-forest-predicted test prices side by side and show the
# first ten pairs.
prediction_rf = pd.DataFrame(
    data={
        'Actual': y_test,
        'Predicted': rf_pred,
    }
)
prediction_rf.head(10)

# Bagging Regressor

In [None]:
from sklearn.ensemble import BaggingRegressor

In [None]:
# Fit a bagging ensemble on the training split and report its training R^2.
# A fixed random_state (the same seed as the train/test split) makes the
# bootstrap resampling repeatable, so the scores do not change between runs.
bag=BaggingRegressor(random_state=1234)
bag.fit(X_train,y_train)
bag.score(X_train,y_train)

In [None]:
# Evaluate the bagging ensemble on the held-out test split.
bag_pred=bag.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). MSE and MAE are symmetric, but
# R^2 is not, so the true prices must be passed first.
print('MSE:',mean_squared_error(y_test,bag_pred))
print('MAE:',mean_absolute_error(y_test,bag_pred))
print('r2_score:',r2_score(y_test,bag_pred))

In [None]:
# Support Vector Regression performs poorly here, and it is less commonly used for regression problems.
# Linear regression and random forest give good prediction accuracy on this data.