# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Reading the Data

In [None]:
# Load the car listings CSV into a DataFrame (path is relative to the notebook's
# working directory) and display it as the cell output.
cars = pd.read_csv('../input/vehicle-dataset-from-cardekho/car data.csv')
cars

# Visualize

In [None]:
# Pairwise scatter plots (with per-variable distributions on the diagonal)
# for the numeric columns of the raw data.
sns.pairplot(cars)
plt.show()

In [None]:
# Box plots of Selling_Price against each categorical column, laid out on a
# 2x2 grid; panels are filled left-to-right, top-to-bottom.
plt.figure(figsize=(20, 12))
for panel, category in enumerate(
        ['Fuel_Type', 'Seller_Type', 'Transmission', 'Car_Name'], start=1):
    plt.subplot(2, 2, panel)
    sns.boxplot(x=category, y='Selling_Price', data=cars)
plt.show()

In [None]:
# Print column names, dtypes and non-null counts for the raw data.
cars.info()

In [None]:
# Summary statistics of the numeric columns of the raw data.
cars.describe()

In [None]:
# Number of rows per distinct Car_Name value.
cars['Car_Name'].value_counts()

# Preprocess the Data

In [None]:
# Work on a copy of the data without the Car_Name column; `cars` itself is
# left untouched because drop() returns a new frame.
cars_processed = cars.drop(columns=['Car_Name'])
cars_processed

In [None]:
# Number of rows per distinct Year value.
cars_processed['Year'].value_counts()

In [None]:
# Number of rows per distinct Transmission value.
cars_processed['Transmission'].value_counts()

In [None]:
# Number of rows per distinct Fuel_Type value.
cars_processed['Fuel_Type'].value_counts()

In [None]:
# Number of rows per distinct Seller_Type value.
cars_processed['Seller_Type'].value_counts()

Encoding
1. Fuel_Type: Dummy vars - Petrol, Diesel, CNG
2. Seller_Type: Dummy vars - Dealer, Individual
3. Transmission: Dummy vars - Manual, Automatic

In [None]:
# Create dummy (one-hot) columns for the three categorical features.
# This first pass keeps one column per category level; it is shown only for
# inspection, since `status` is reassigned in the next cell.
dum_vars = ['Fuel_Type','Seller_Type','Transmission']

status = pd.get_dummies(cars_processed[dum_vars])
status

In [None]:
# Re-encode with drop_first=True so each categorical keeps k-1 dummy columns;
# the dropped level becomes the baseline and the redundant column is removed.
# dtype='uint8' pins the dummies to a numeric dtype: pandas >= 2.0 returns
# bool columns by default, which turns the mixed design matrix into an object
# array and makes the statsmodels OLS / VIF cells further down raise.
status = pd.get_dummies(cars_processed[dum_vars], drop_first = True, dtype = 'uint8')
status

In [None]:
# Concatenate the dummy columns onto the processed frame, side by side
# (rows are matched on the shared index).
cars_encoded = pd.concat(objs=[cars_processed, status], axis='columns')
cars_encoded

In [None]:
# The original categorical columns are now represented by their dummies,
# so remove them from the encoded frame.
rm_vars = ['Fuel_Type', 'Seller_Type', 'Transmission']
cars_encoded = cars_encoded.drop(columns=rm_vars)
cars_encoded

# Splitting into train and test

In [None]:
# 70% of the rows go to the training frame, the rest to the test frame;
# random_state fixes the shuffle so the split is reproducible.
df_train, df_test = train_test_split(
    cars_encoded,
    train_size=0.7,
    random_state=100,
)
# One shape per line: train first, then test.
print(df_train.shape, df_test.shape, sep='\n')

# Rescaling the train set

In [None]:
scaler = MinMaxScaler()

# numeric vars to rescale (note: the target Selling_Price is scaled together
# with the features, so later predictions and errors are in scaled units)
num_vars = ['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven']

# train_test_split hands back a selection of cars_encoded; take an explicit
# copy so the column assignment below writes to an independent frame instead
# of triggering pandas' SettingWithCopyWarning.
df_train = df_train.copy()

# Fit the scaler on the training rows only and rescale them in place; the
# fitted `scaler` is reused later to transform the test rows.
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])
df_train

In [None]:
# Summary statistics of the training frame after rescaling.
df_train.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of the
# (rescaled) training frame.
plt.figure(figsize=(20, 10))
sns.heatmap(data=df_train.corr(), annot=True)
plt.show()

# Training the Model

In [None]:
# Split the training frame into predictors and target.
# X_train is the same object as df_train; pop() removes Selling_Price from it
# in place and returns that column as the target series.
X_train = df_train
y_train = X_train.pop('Selling_Price')

In [None]:
# statsmodels' OLS does not add an intercept on its own, so prepend a
# constant column to the predictors.
X_train_sm = sm.add_constant(X_train)

# Build the ordinary-least-squares model on all predictors, then fit it.
lr = sm.OLS(endog=y_train, exog=X_train_sm)
lr_model = lr.fit()

# Regression summary table (coefficients, p-values, fit statistics).
lr_model.summary()

### VIF Calculation

In [None]:
# Variance inflation factor for every predictor in X_train, rounded to two
# decimals and listed from highest to lowest.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, col_idx)
            for col_idx in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

VIF > 10
 - Definitely Remove
     1. Fuel_Type_Petrol
     2. Year

In [None]:
# Refit the model without the Fuel_Type_Petrol dummy.
X = X_train.drop(columns='Fuel_Type_Petrol')

# Intercept column for the reduced predictor set.
X_train_sm = sm.add_constant(X)

# Build and fit the reduced OLS model.
lr = sm.OLS(endog=y_train, exog=X_train_sm)
lr_model = lr.fit()

# Regression summary table for the reduced model.
lr_model.summary()

In [None]:
# Recompute the variance inflation factors on the reduced predictor set X,
# rounded to two decimals and listed from highest to lowest.
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, col_idx)
            for col_idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
# Final model: rebuilt from X_train with only Fuel_Type_Petrol removed.
# NOTE(review): this cell was originally labelled "rebuild removing year", but
# the code does not drop Year - it repeats the previous refit. The test-set
# cell likewise drops only Fuel_Type_Petrol, so the fitted model and the test
# design matrix agree as written. If Year is meant to be removed, it must be
# dropped here AND from X_test before predicting - confirm the intent.
X = X_train.drop('Fuel_Type_Petrol', axis = 1)
# adding the constant (intercept column)
X_train_sm = sm.add_constant(X)

# model object
lr = sm.OLS(y_train, X_train_sm)

# fit the model; lr_model is the results object used for all later predictions
lr_model = lr.fit()

# show summary
lr_model.summary()

In [None]:
# Variance inflation factors for the predictors of the final model (X),
# rounded to two decimals and listed from highest to lowest.
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, col_idx)
            for col_idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif


# Residual Analysis

In [None]:
# Fitted values on the training design matrix; kept in y_train_pred because
# the evaluation cell reuses them for the training R-squared.
y_train_pred = lr_model.predict(exog=X_train_sm)

# Residuals = actual minus fitted, aligned on the row index.
res = y_train.sub(y_train_pred)

# Distribution of the residuals.
sns.distplot(res)

# Prediction and Evaluation
- Preprocess the test set

In [None]:
# Display the test frame before rescaling.
df_test

In [None]:
# list of numeric vars (same columns, in the same order, the scaler was fitted on)
num_vars = ['Year','Selling_Price','Present_Price','Kms_Driven']

# train_test_split hands back a selection of cars_encoded; take an explicit
# copy so the column assignment below writes to an independent frame instead
# of triggering pandas' SettingWithCopyWarning.
df_test = df_test.copy()

# Rescale the test rows with the scaler fitted on the training data
# (transform only - the scaler is not refitted on the test rows).
df_test[num_vars] = scaler.transform(df_test[num_vars])
df_test

In [None]:
# Summary statistics of the test frame after rescaling with the train-fitted scaler.
df_test.describe()

In [None]:
# Split the test frame into predictors and target.
# X_test is the same object as df_test; pop() removes Selling_Price from it
# in place and returns that column as the target series.
X_test = df_test
y_test = X_test.pop('Selling_Price')

In [None]:
# Mirror the training design matrix: drop the same dummy that was removed
# before fitting the final model.
X_test = X_test.drop('Fuel_Type_Petrol', axis = 1)
# has_constant='add' forces the intercept column to be added. With the default
# ('skip'), add_constant silently adds nothing when any test column happens to
# be constant, which would leave the matrix one column short of what the
# fitted model expects.
X_test_sm = sm.add_constant(X_test, has_constant = 'add')
X_test_sm

In [None]:
# Predict the (scaled) selling price for the test rows with the final fitted model.
y_test_pred = lr_model.predict(X_test_sm)

# Evaluate

In [None]:
# R-squared of the final model: training set first, then test set.
print(r2_score(y_train, y_train_pred))
print(r2_score(y_test, y_test_pred))

In [None]:
# Root-mean-squared error on the test set. Both series are in the scaler's
# rescaled units, so the value is not in the original price units.
# The value is stored here and displayed by the next cell.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_test_pred))

In [None]:
# Display the test-set RMSE computed in the previous cell.
rmse