### Load libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
sns.set_style('darkgrid')

### Read the dataset

In [None]:
# Load the CarDekho vehicle dataset (note: the file name contains a space).
df = pd.read_csv('../input/vehicle-dataset-from-cardekho/car data.csv')
# Preview the first rows to sanity-check the load.
df.head()

**Car_Name** : This column should be filled with the name of the car.

**Year** : This column should be filled with the year in which the car was bought.

**Selling_Price** : This column should be filled with the price the owner wants to sell the car at.

**Present_Price** : This is the current ex-showroom price of the car.

**Kms_Driven** : This is the distance completed by the car in km.

**Fuel_Type**: Fuel type of the car i.e Diesel,Petrol,CNG

**Seller_Type**: Defines whether the seller is a dealer or an individual.

**Transmission** : Defines whether the car is manual or automatic.

**Owner** : Defines the number of owners the car has previously had.

In [None]:
# Preview the first rows again, alongside the column descriptions above.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

### EDA

#### Checking null values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

* There are no null values present in any of the columns.

#### Visualizing and Analyzing data

In [None]:
# Columns treated as categorical in the cells below.
a=['Car_Name','Fuel_Type','Seller_Type','Transmission']

In [None]:
# Report the number of distinct values in each categorical column
# (dropna=False so that a missing value would be counted as its own category).
for col in a:
    print(col, df[col].nunique(dropna=False))
    

In [None]:
# Car_Name has 98 distinct values, so the column is dropped rather than encoded.
df = df.drop(columns=['Car_Name'])

In [None]:
# Share of each category in Fuel_Type, Seller_Type and Transmission, one pie per column
fig, axes = plt.subplots(1, 3, figsize=(15, 10))
for ax, column in zip(axes, ['Fuel_Type', 'Seller_Type', 'Transmission']):
    shares = df[column].value_counts()
    shares.plot(kind='pie', autopct='%.3f%%', ax=ax, textprops={'fontsize': 13})
plt.show()

#### Inference:
* Petrol cars account for 79.4% of the listings, far more than diesel and CNG cars.
* Cars sold by dealers (64.8%) outnumber cars sold by individuals.
* Manual-transmission cars (86.7%) outnumber automatic-transmission cars.
    

In [None]:
# Number of listings for each count of previous owners.
# The column is named via x= and the frame via data=: seaborn >= 0.12 treats the
# first positional argument as `data`, so passing the Series positionally no
# longer plots the intended per-category counts.
sns.countplot(x='Owner', data=df, palette='husl')
plt.show()

* Cars with no previous owner far outnumber cars that have had one or more previous owners.

In [None]:
# Correlation heatmap of the numeric columns.
# Fuel_Type, Seller_Type and Transmission are not yet encoded at this point, and
# pandas >= 2.0 raises when DataFrame.corr() meets non-numeric columns (older
# versions silently dropped them), so the numeric columns are selected explicitly.
plt.figure(figsize=(8,6))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
plt.show()

* Selling price has a very high positive correlation with present price, which implies that the higher the present price, the higher the selling price.

In [None]:
# Distribution of Selling_Price, Year, Present_Price, Kms_Driven on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 5))
numeric_cols = ['Selling_Price', 'Year', 'Present_Price', 'Kms_Driven']
# axes.ravel() walks the grid row by row, matching the column order above.
for ax, column in zip(axes.ravel(), numeric_cols):
    sns.distplot(df[column], ax=ax)

plt.show()

#### Inference:
  * The majority of selling prices are in the low range, except for a few models.
  * Most of the cars are from the years 2010 - 2018.
  * The present price follows a similar trend to the selling price.
  * For most of the cars, the Kms driven lie below 100000.
        
    

In [None]:
# Plot 'Year', 'Present_Price', 'Kms_Driven', 'Owner' against the target variable (selling price) to see how they relate
predictors = ['Year', 'Present_Price', 'Kms_Driven', 'Owner']
sns.pairplot(df, x_vars=predictors, y_vars=['Selling_Price'], height=4)
plt.show()

#### Inference:

* The newer models have higher selling price as compared to old models. 

* The present price and the selling price have a linear relationship, as the present price increases the selling price also increases.

* Selling price is a bit higher when the Kms driven are low.

* The cars with no owner have much higher selling price as compared with used cars.  

In [None]:
# Selling price vs present price: one panel per transmission type, points coloured by fuel type
sns.lmplot(
    data=df,
    x='Present_Price',
    y='Selling_Price',
    col='Transmission',
    hue='Fuel_Type',
    fit_reg=False,
    height=4,
    aspect=1.5,
)
plt.show()

#### Inference: 
   *  There are no CNG cars with automatic transmission.
   *  Cars with automatic transmission and with fuel type diesel have higher selling price.
   
   

### Outliers

In [None]:
# Count of outliers in each numerical column, using the 1.5 * IQR fences
a=['Year','Selling_Price','Present_Price','Kms_Driven','Owner']

for col in a:
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1

    UL = q3 + 1.5 * iqr   # upper fence
    LL = q1 - 1.5 * iqr   # lower fence
    is_outlier = (values > UL) | (values < LL)
    print(col, is_outlier.sum())


In [None]:
# Rows whose Selling_Price lies outside the 1.5 * IQR fences
price = df['Selling_Price']
q1 = price.quantile(0.25)
q3 = price.quantile(0.75)
iqr = q3 - q1
UL = q3 + 1.5 * iqr
LL = q1 - 1.5 * iqr
df[(price > UL) | (price < LL)].sort_index()


In [None]:
# Rows whose Present_Price lies outside the 1.5 * IQR fences
present = df['Present_Price']
q1 = present.quantile(0.25)
q3 = present.quantile(0.75)
iqr = q3 - q1
UL = q3 + 1.5 * iqr
LL = q1 - 1.5 * iqr
outside_fences = (present > UL) | (present < LL)
df.loc[outside_fences].sort_index()


* Note that Selling_Price and Present_Price have almost the same data points as outliers, which means that these outliers represent rare but important information. Hence we cannot ignore or remove these outliers.


In [None]:
# Visual representation of outliers: one box plot per numeric column on a 2x2 grid.
# The column is passed as x=: seaborn >= 0.12 takes `data` as the first positional
# argument, so a positional column name combined with data=df raises a TypeError.
fig, axes = plt.subplots(2, 2, figsize=(15, 8))
for ax, column in zip(axes.ravel(), ['Selling_Price', 'Year', 'Present_Price', 'Kms_Driven']):
    sns.boxplot(x=column, data=df, ax=ax)
plt.show()


### Changing the categorical value to numerical

In [None]:
# One-hot encode the remaining categorical columns; drop_first=True drops the
# first level of each so the dummy columns are not redundant with one another.
df=pd.get_dummies(df,drop_first=True)

In [None]:
# Check the columns and dtypes after encoding.
df.info()

### Model building 

* The dependent variable will be 'Selling_Price'; all the remaining variables will be considered independent variables.

In [None]:
# Separate the target (Selling_Price) from the predictor columns
y = df['Selling_Price']
X = df.drop(columns=['Selling_Price'])
X.head()

### 1. Linear Regression


In [None]:
# Standardise every predictor to mean 0 and standard deviation 1.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# fit_transform returns a plain array, so wrap it back into a frame with the original column names.
X_std = pd.DataFrame(ss.fit_transform(X), columns=X.columns)

In [None]:
import statsmodels.api as sm
# statsmodels does not add an intercept on its own, so prepend a constant column.
Xc = sm.add_constant(X_std)
model = sm.OLS(y, Xc).fit()
model.summary()

#### Inference: 

* R2 = 0.883 implies that 88.3% of the variation in selling price is explained by the independent variables.

* Probability of F-stat = 0 implies that at least one of the features plays a significant role in predicting selling price.

* From the p-values we can also say that, except for fuel type and owner, all the other variables play a significant role in predicting the selling price.


In [None]:
# In-sample predictions: the OLS model is evaluated on the same rows it was fitted on.
y_pred=model.predict(Xc)

In [None]:
# Predicted vs actual selling price; the red line is the y = x reference (perfect prediction)
plt.scatter(x=y_pred, y=y)
plt.plot(y_pred, y_pred, color='r')
plt.ylabel('y actual')
plt.xlabel('y predicted')
plt.show()

### Checking assumptions

#### 1. Multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# One VIF per standardised predictor; column order (VIF, feature) is kept because
# the next cell indexes this frame by position.
vif = pd.DataFrame({
    'VIF': [variance_inflation_factor(X_std.values, idx) for idx in range(X_std.shape[1])],
    'feature': X_std.columns,
})
vif.sort_values('VIF', ascending=False)

There are high VIF values for a few features. To remove multicollinearity we can remove features one by one until all the VIF values lie below 10.


In [None]:
# Repeatedly drop the feature with the largest VIF and recompute, until every VIF is at most 10.
# Starts from the `vif` table computed in the previous cell.
X_vif = X_std.copy()
while vif['VIF'].max() > 10:
    worst_feature = vif.loc[vif['VIF'].idxmax(), 'feature']
    X_vif = X_vif.drop(columns=worst_feature)

    vif = pd.DataFrame({
        'VIF': [variance_inflation_factor(X_vif.values, idx) for idx in range(X_vif.shape[1])],
        'feature': X_vif.columns,
    })
vif

# We can see that one column has been removed, which brings all the VIF values below 10.

#### 2. Linearity

In [None]:
# Actual selling price (x) against the model's fitted values (y), with a red regression line.
# x/y are passed by keyword: seaborn >= 0.12 makes them keyword-only, so the
# positional form raises a TypeError.
sns.regplot(x=y, y=model.predict(), line_kws={'color':'red'})
plt.show()

It shows a linear trend but to confirm we further check using rainbow test.

In [None]:
from statsmodels.stats.diagnostic import linear_rainbow
# Rainbow test for linearity on the fitted OLS model, using the central 50% of the observations.
linear_rainbow(res=model,frac=0.5) 
# Since the p-value > 0.05 we fail to reject the null hypothesis, i.e. there is no evidence against linearity.

#### 3. Normality

In [None]:
from scipy.stats import norm
# Histogram/KDE of the OLS residuals with a fitted normal curve overlaid for comparison.
sns.distplot(model.resid,fit=norm)
# NOTE(review): the fitted normal parameters returned here are discarded (not the
# last expression of the cell and not assigned), so this line has no visible effect.
norm.fit(model.resid)
plt.show()

* The light blue curve is the actual distribution of the residuals, and the dark line is the curve the residuals would follow if they were perfectly normal.  
* The graph shows that the residuals do not follow a normal curve. 
* We can apply transformations to make them normal.

In [None]:
import scipy.stats as stats
# Shapiro-Wilk test on the residuals. Null hypothesis: the residuals are normally distributed.
stats.shapiro(model.resid)
# The p-value is approximately 0 (< 0.05), so we reject the null hypothesis: the residuals are not normally distributed.

In [None]:
# Q-Q (quantile-quantile) plot of the residuals against the normal distribution

import scipy.stats as stats
stats.probplot(model.resid,plot=plt)
plt.show()

# Only the extreme values (the tails) deviate from the normal reference line.

#### 4. Autocorrelation
  
From the model summary, Durbin-Watson = 1.795, which is very close to 2. Hence we can say that there is very low / negligible autocorrelation. 

In [None]:
import statsmodels.tsa.api as smt
# Autocorrelation of the OLS residuals for lags 0-40 with a 95% confidence band.
acf = smt.graphics.plot_acf(model.resid, lags=40 , alpha=0.05)
# Use plt.show() like the other plotting cells: Figure.show() expects a GUI
# backend and only emits a warning under the notebook's inline backend.
plt.show()


ACF: The correlation between the observation at the current time spot and the observations at previous time spots.
    
The blue shade is the significance threshold; the autocorrelation is large at lag 0 (where it is always 1) and small at the other lags.
    

#### 5. Homoscedasticity : test of variance

In [None]:
# Residuals against predicted values, with a LOWESS trend line in red, to inspect
# whether the residual variance is constant.
# x/y are passed by keyword: seaborn >= 0.12 makes them keyword-only, so the
# positional form raises a TypeError.
sns.residplot(x=model.predict(Xc), y=model.resid, lowess=True, line_kws={'color': 'red'})
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.show()
# From the graph the residuals look heteroscedastic (their spread varies a lot across predicted values).
# This is checked formally with the Goldfeld-Quandt test.

In [None]:
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
# Goldfeld-Quandt test for heteroscedasticity on the OLS residuals.
# Labels for the first two returned values (F statistic and p-value); the
# original label read 'p=value', a typo. lzip pairs labels with values and
# stops at the shorter sequence, so any further return values are ignored.
name=['F-stat','p-value']
test=sms.het_goldfeldquandt(y=model.resid,x=Xc)
lzip(name, test)

H0: Residuals have constant variance

H1: Residual variance is not constant

Since p-value > 0.05, we fail to reject the null hypothesis and conclude that the variance of the residuals is constant. Hence heteroscedasticity is not present.


### Basic Model

In [None]:
from sklearn.model_selection import train_test_split
# 70/30 train/test split with a fixed seed for reproducibility.
# NOTE(review): X_std was standardised using all rows before this split, so
# test-set statistics leak into the scaler — consider fitting StandardScaler on
# the training rows only.
X_train, X_test, y_train, y_test = train_test_split(X_std,y,test_size=0.3,random_state=0)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Baseline: ordinary least squares fitted on the training split, scored on the held-out split.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

test_r2 = r2_score(y_test, y_pred)
print('R^2 on the test data', test_r2)


### Regularisation

### Lasso 

In [None]:
from sklearn.linear_model import Lasso, Ridge

In [None]:
# L1-regularised linear regression with a small penalty (alpha=0.01), fitted on the training split.
lasso=Lasso(alpha=0.01)
lasso.fit(X_train,y_train)

In [None]:
# Fitted lasso coefficients, labelled by feature name.
pd.DataFrame(lasso.coef_,index=X_train.columns,columns=['coefs'])

In [None]:
# R^2 of the lasso model on the held-out test split.
y_pred = lasso.predict(X_test)
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

### Ridge

In [None]:
# L2-regularised linear regression with a small penalty (alpha=0.01), fitted on the training split.
ridge=Ridge(alpha=0.01)
ridge.fit(X_train,y_train)

In [None]:
# Fitted ridge coefficients, labelled by feature name.
pd.DataFrame(ridge.coef_,index=X_train.columns,columns=['coefs'])

In [None]:
# R^2 of the ridge model on the held-out test split.
y_pred = ridge.predict(X_test)
r2_score(y_test,y_pred)

In [None]:
# There is not much difference in the scores. Lasso and ridge need many columns to make
# a difference to the prediction, so the data is reloaded and every column is kept this
# time (Car_Name is not dropped, so it is one-hot encoded along with the other text columns).
cars = pd.read_csv("../input/vehicle-dataset-from-cardekho/car data.csv")
cars=pd.get_dummies(cars,drop_first=True)

In [None]:
# Rebind X and y to the all-columns dataset (this overwrites the X/y used in the earlier sections).
X=cars.drop('Selling_Price',axis=1)
y=cars['Selling_Price']

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the all-columns predictors and keep the original column names.
ss = StandardScaler()
X_std = pd.DataFrame(ss.fit_transform(X), columns=X.columns)

In [None]:
from sklearn.model_selection import train_test_split
# Same 70/30 split and seed as before, now on the all-columns feature matrix.
# NOTE(review): the scaler was fitted on all rows before this split, so test-set
# statistics leak into the standardisation — consider fitting it on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(X_std,y,test_size=0.3,random_state=0)

In [None]:
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import r2_score


# Lasso on the all-columns feature matrix.
lasso=Lasso(alpha=0.01)
lasso.fit(X_train,y_train)

# A bare expression is only rendered when it is the last statement of a cell, so
# the coefficient table is shown explicitly; otherwise it is built and thrown away.
display(pd.DataFrame(lasso.coef_,index=X_train.columns,columns=['coefs']))

# R^2 on the held-out test split (rendered as the cell's output).
y_pred = lasso.predict(X_test)
r2_score(y_test,y_pred)

In [None]:
# Ridge on the all-columns feature matrix.
ridge=Ridge(alpha=0.01)
ridge.fit(X_train,y_train)

# A bare expression is only rendered when it is the last statement of a cell, so
# the coefficient table is shown explicitly; otherwise it is built and thrown away.
display(pd.DataFrame(ridge.coef_,index=X_train.columns,columns=['coefs']))

# R^2 on the held-out test split (rendered as the cell's output).
y_pred = ridge.predict(X_test)
r2_score(y_test,y_pred)

In [None]:
# With all columns, the lasso score improves to 0.88, compared with 0.85 for the basic model.

#### We will use some other model and compare the results

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# Candidate regressors to compare, keyed by display name.
# The tree-based and ensemble models are randomised; random_state is fixed (0, the
# same seed as the train/test split) so the reported scores are reproducible on
# Restart & Run All.
models = {'Lasso': Lasso(alpha=0.01),
          'Ridge': Ridge(alpha=0.01),
          'RandomForest': RandomForestRegressor(random_state=0),
          'DecisionTree': DecisionTreeRegressor(random_state=0),
          'GradientBoosting': GradientBoostingRegressor(random_state=0),
          'AdaBoost': AdaBoostRegressor(random_state=0)}


def Different_model_scores(models):
    """Fit each model on the training split and score it on the test split.

    Reads the notebook-level X_train, y_train, X_test and y_test. Each estimator
    in ``models`` is fitted in place.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    dict
        Maps each display name to the model's R^2 score on the test split.
    """
    scores = {}
    for label in models:
        estimator = models[label]
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        scores[label] = r2_score(y_test, predictions)
    return scores
# Fit and score every candidate model, then display the name -> R^2 mapping.
model_scores = Different_model_scores(models)
model_scores

* It can be observed that the best model is gradient boosting, with an R2 score of 0.90.