In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Load the car price dataset from the Kaggle input directory into `df`.
CAR_PRICE_CSV = '/kaggle/input/carprice-assignment/CarPrice_Assignment.csv'
df = pd.read_csv(CAR_PRICE_CSV)

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes) as a first look at the data.
df.info()

In [None]:
# Preview the first ten rows of the dataset.
df.head(n=10)

In [None]:
# Summary statistics of the dataset, as produced by DataFrame.describe().
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# For every low-cardinality column (fewer than 25 distinct values),
# print how many distinct values it has and what they are.

for feature in df.columns:
    unique_count = df[feature].nunique()
    if unique_count >= 25:
        continue
    print('Total No. of unique features of ', feature,' are -', unique_count, df[feature].unique())

In [None]:
# The car_ID and CarName columns are not used in the rest of the analysis, so drop both.
df = df.drop(columns=['car_ID', 'CarName'])

In [None]:
# The unique-value listing shows an enginetype value 'l' that the author considered
# out of place; display the rows that carry it to see how many there are.
# NOTE(review): 'l' may be a legitimate engine type code rather than bad data - confirm.
df.query("enginetype == 'l'")

In [None]:
# Recode enginetype 'l' as 'ohc' (described by the author as the most frequent engine type).
df['enginetype'] = df['enginetype'].replace({'l': 'ohc'})

**Let's look at the numerical and categorical features in the dataset**


In [None]:
# Split the column names into numerical (any non-object dtype) and
# categorical (object dtype) groups and report the size of each group.
column_dtypes = df.dtypes

numerical_feature = [name for name, dtype in column_dtypes.items() if dtype != 'object']
categorical_feature = [name for name, dtype in column_dtypes.items() if dtype == 'object']

print('total numerical features are', len(numerical_feature))
print('total categorical features are', len(categorical_feature))

In [None]:
# Scatter-plot every numerical feature against the target (price) to see
# what kind of relationship, if any, each one has with it.

for feature in numerical_feature:
    ax = sns.scatterplot(data=df, x=feature, y='price')
    ax.set_xlabel(feature)
    ax.set_ylabel('Car Price')
    ax.set_title('Car Price relation with different features')
    plt.show()

In [None]:
# Box plot of each numerical feature on its own, used to spot outliers.
# Each plot shows a single variable along the x-axis, so the y-axis carries no
# data and is left unlabelled, and the title names the feature being shown
# (the previous labels wrongly described these as price-relation plots).
for feature in numerical_feature:
    ax = sns.boxplot(x=df[feature])
    ax.set_xlabel(feature)
    ax.set_ylabel('')
    ax.set_title(f'Distribution of {feature} (outlier check)')
    plt.show()

In [None]:
# Bar plot of price against every categorical feature.
for feature in categorical_feature:
    ax = sns.barplot(data=df, x=feature, y='price')
    ax.set_xlabel(feature)
    ax.set_ylabel('Car Price')
    ax.set_title('price relation with different features')
    plt.show()

In [None]:
# Univariate analysis: histogram with a KDE overlay for each numerical feature.
# The title names the feature being shown; these plots show one variable's
# distribution, not its relation to price as the previous title claimed.
for feature in numerical_feature:
    ax = sns.histplot(x=df[feature], kde=True)
    ax.set_xlabel(feature)
    ax.set_ylabel('frequency')
    ax.set_title(f'Distribution of {feature}')
    plt.show()

In [None]:
# Drop six numerical columns that the author judged to be negatively correlated
# or uncorrelated with the target.
# NOTE(review): a strongly negative correlation is still predictive - confirm that
# every column listed here really is uninformative before dropping it.
weak_columns = [
    'citympg',
    'compressionratio',
    'stroke',
    'symboling',
    'carheight',
    'peakrpm',
]
df = df.drop(columns=weak_columns)

In [None]:
# Annotated correlation heatmap of the numerical columns.
# The frame still contains object (string) columns at this point, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so restrict the
# frame to numeric columns explicitly before computing the correlations.
plt.figure(figsize=(15,25))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

# As we can see, many columns are strongly correlated with one another, so some of them can be removed:
- wheelbase – carlength
- carlength – curbweight
- carwidth – curbweight
- curbweight – enginesize
- enginesize – price
- highwaympg – citympg

In [None]:
# Drop four columns that the author found to be more than 85% correlated with other columns.
# NOTE(review): enginesize is listed in this notebook as highly correlated with the
# target (price) itself - confirm that dropping it is intended.
highly_correlated_columns = ['carlength', 'curbweight', 'enginesize', 'highwaympg']
df = df.drop(columns=highly_correlated_columns)

In [None]:
  # The price column has some outliers, which can badly influence the model,
  # so compute the IQR fences that are used to remove them.
  # Series.quantile is used instead of np.percentile(..., interpolation=...):
  # NumPy deprecated the `interpolation` keyword (renamed to `method` in 1.22),
  # whereas pandas keeps `interpolation='midpoint'` as a supported argument.
  price = df['price']
  Q1_price = price.quantile(0.25, interpolation='midpoint')
  Q2_price = price.quantile(0.50, interpolation='midpoint')
  Q3_price = price.quantile(0.75, interpolation='midpoint')

  print('Q1,Q2,Q3 =', Q1_price, Q2_price, Q3_price)

  IQR_price = Q3_price - Q1_price
  print('IQR for price is', IQR_price)

  # Fences at 1.5 * IQR beyond the first and third quartiles.
  upper_bound = Q3_price + 1.5 * IQR_price
  lower_bound = Q1_price - 1.5 * IQR_price
  print('upper bound for price is', upper_bound)
  print('lower bound for price is', lower_bound)

In [None]:
# Remove the rows whose price lies on or beyond the IQR fences.
# The rows are selected by index *label* (df.index[...]) rather than by position:
# np.where returns positions, which only coincide with labels while the index is
# an untouched 0..n-1 range, so re-running the cell after rows were removed would
# drop the wrong rows or raise a KeyError. With labels the cell is idempotent.
upper_array = df.index[df['price'] >= upper_bound]
lower_array = df.index[df['price'] <= lower_bound]

# Build a new frame instead of mutating in place, dropping both groups in one step.
df = df.drop(index=upper_array.union(lower_array))

In [None]:
# Recompute the numerical / categorical column lists now that columns and rows
# have been removed, and report how many of each remain.
numerical_feature = []
categorical_feature = []
for feature in df.columns:
    if df[feature].dtypes == 'object':
        categorical_feature.append(feature)
    else:
        numerical_feature.append(feature)

print('total numerical features are', len(numerical_feature))
print('total categorical features are', len(categorical_feature))

In [None]:
# Separate the target (price) from the predictor columns.
y = df['price']
x = df.drop(columns='price')

In [None]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=25,
)

In [None]:
# Column names of the predictors by dtype, used below to route columns through
# the preprocessing ColumnTransformer.
numerical_feature = x.select_dtypes(include=['number']).columns
categorical_feature = x.select_dtypes(include=['object']).columns

In [None]:
# Preprocessing: standard-scale the numerical columns and one-hot encode the
# categorical ones (categories unseen at fit time are ignored at transform time).
pipeline_num = Pipeline(steps=[
    ('scalar', StandardScaler()),
])
pipeline_cat = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

transformer = ColumnTransformer(
    transformers=[
        ('numeric', pipeline_num, numerical_feature),
        ('categorical', pipeline_cat, categorical_feature),
    ]
)

In [None]:
# Standardise the target: the scaler is fitted on the training prices only and
# then applied to the test prices. StandardScaler expects a 2-D array, hence the reshape.
sc = StandardScaler()
y_train_sc = sc.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_test_sc = sc.transform(y_test.to_numpy().reshape(-1, 1))

In [None]:
# First model: linear regression chained behind the preprocessing transformer,
# fitted on the training features and the standardised training prices.
model1 = LinearRegression()
pipeline_model = Pipeline(
    steps=[
        ('transformer', transformer),
        ('model', model1),
    ]
)
prepared_model = pipeline_model.fit(x_train, y_train_sc)


In [None]:
# Predict the (standardised) prices of the held-out test rows with the fitted pipeline.
predicted_prices = pipeline_model.predict(X=x_test)

In [None]:
# Evaluate the linear-regression pipeline: R^2 of the predictions against the
# standardised test prices.
from sklearn.metrics import r2_score, mean_squared_error
r2_score(y_true=y_test_sc, y_pred=predicted_prices)

In [None]:
# Second approach: encode the categorical columns manually with pandas
# (one-hot via get_dummies) instead of using the sklearn pipeline.
# The encoded frame gets its own name so that `x` keeps the raw predictors and
# the pipeline cells above still work when re-run.
x_encoded = pd.get_dummies(x)
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x_encoded, y, test_size=0.20, random_state=20
)

In [None]:
# Manually standardise the second split. Each scaler is fitted on the training
# portion only and then applied to the matching test portion.
# Note: y_train_sc / y_test_sc are reassigned here and now refer to this split.
sc1 = StandardScaler()
x_train_sc = sc1.fit_transform(x_train2)
x_test_sc = sc1.transform(x_test2)

sc2 = StandardScaler()
y_train_sc = sc2.fit_transform(y_train2.to_numpy().reshape(-1, 1))
y_test_sc = sc2.transform(y_test2.to_numpy().reshape(-1, 1))

In [None]:
# Second model: plain linear regression on the manually scaled training data.
model2 = LinearRegression()
model2.fit(X=x_train_sc, y=y_train_sc)

In [None]:
# Predict on the scaled test features and report R^2 against the scaled test prices.
predicted_prices_2 = model2.predict(X=x_test_sc)
r2_score(y_true=y_test_sc, y_pred=predicted_prices_2)

In [None]:
# Third model: decision tree regressor on the manually scaled data.
# A fixed random_state makes the fitted tree (and therefore the score) reproducible.
model3 = DecisionTreeRegressor(random_state=25)
model3.fit(x_train_sc, y_train_sc)
predicted_prices_dt = model3.predict(x_test_sc)
# r2_score is not symmetric: its signature is r2_score(y_true, y_pred), so the
# true test prices must come first and the predictions second.
r2_score(y_test_sc, predicted_prices_dt)

In [None]:
# Hyperparameter tuning: search grid for the decision tree model.
from sklearn.model_selection import GridSearchCV

param = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'splitter': ['best', 'random'],
    'max_depth': [3, 6, 9, 12, 34],
    'min_samples_split': [2, 6, 10, 12],
}

In [None]:
# 5-fold cross-validated grid search over `param`, scored by negative mean squared
# error. It is fitted on the unscaled second split (x_train2 / y_train2).
hype_tuning = GridSearchCV(
    estimator=model3,
    param_grid=param,
    cv=5,
    scoring='neg_mean_squared_error',
)
hype_tuning.fit(x_train2, y_train2)

In [None]:
# Parameter combination that achieved the best cross-validated score.
best_params = hype_tuning.best_params_
print(best_params)

In [None]:
# Take the best estimator found by the grid search and score it (via its
# .score method) on the unscaled test split, then print the estimator and the score.
best_model = hype_tuning.best_estimator_
test_score = best_model.score(x_test2, y_test2)
print(best_model, test_score, sep='\n')

In [None]:
# Final decision tree built with explicit hyperparameters.
# NOTE(review): these values are hard-coded rather than read from `best_params`;
# presumably they were copied from an earlier grid-search run - confirm they still match.
# splitter='random' picks split points at random, so a fixed random_state is
# required for the fitted tree and its score to be reproducible.
best_model_final = DecisionTreeRegressor(
    criterion='absolute_error',
    max_depth=34,
    min_samples_split=6,
    splitter='random',
    random_state=25,
)

In [None]:
# Fit the final tree on the scaled training data and evaluate it on the scaled test data.
best_model_final.fit(x_train_sc, y_train_sc)
predicted_prices_final = best_model_final.predict(x_test_sc)
# r2_score expects (y_true, y_pred); swapping them yields a different, incorrect value.
score = r2_score(y_test_sc, predicted_prices_final)
print(score)