# Imports

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

from scipy.stats import pointbiserialr

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Gather and Clean Data

In [None]:
# Read the car-details CSV from the relative ../input directory into a DataFrame.
data_dekho = pd.read_csv("../input/vehicle-dataset-from-cardekho/CAR DETAILS FROM CAR DEKHO.csv")
# Show (rows, columns) of the loaded frame.
data_dekho.shape

In [None]:
# Peek at the last rows of the raw frame.
data_dekho.tail() 

In [None]:
# Column names, dtypes and non-null counts.
data_dekho.info()

* As you can see above, we have 8 columns. 
* We are lucky: there are no null or NaN values in our dataset. 
* We can start analyzing the dataset with the feature called <b>name</b>. 

In [None]:
# Display the `name` column.
data_dekho["name"]

* You can see many different car names in the dataset. 
* I don't want to use this feature, because the goal is to predict prices without relying on the car's name. Using it would require splitting the names into separate categories, which I don't want to do here.  

In [None]:
# Drop the `name` column without mutating the frame in place.
# errors='ignore' keeps this cell re-runnable once the column is already gone
# (the in-place version raised KeyError on a second run).
data_dekho = data_dekho.drop(columns=['name'], errors='ignore')
data_dekho.head()

* Analyze transmission 
* Manual = 0, Automatic = 1 

In [None]:
# List the distinct transmission labels before encoding them.
data_dekho['transmission'].unique()

In [None]:
# Encode transmission as a NumPy integer array: "Manual" -> 0, anything else -> 1.
transmission = data_dekho['transmission']
is_manual = transmission == "Manual"
transmission_clean = np.where(is_manual, 0, 1)

# One entry per row of data_dekho.
transmission_clean.shape

In [None]:
# Swap the string column for its 0/1 encoding: remove the original column in
# place, then assign the encoded array under the same name (it lands last).
del data_dekho['transmission']
data_dekho['transmission'] = transmission_clean

* Analyze owner

In [None]:
# List the distinct owner labels before encoding them.
data_dekho['owner'].unique()

In [None]:
# Build a label -> integer code lookup for `owner`, numbering the labels in
# order of first appearance. The codes are generated from the number of labels
# actually present instead of a hard-coded [0,1,2,3,4], which raised a
# length-mismatch error whenever the data did not contain exactly five labels.
owner_unique_names = data_dekho['owner'].unique()
owner_unique_names = pd.Series(index=owner_unique_names, data=range(len(owner_unique_names)))
dict(owner_unique_names)

In [None]:
# Encode `owner` with the lookup built above and assign the result back to the
# frame. `data_dekho['owner'].replace(..., inplace=True)` acted on an
# intermediate Series, which pandas deprecates and which leaves the frame
# unchanged under copy-on-write.
# Values that are not keys of the lookup (e.g. already-encoded integers on a
# re-run) are passed through unchanged.
owner_codes = dict(owner_unique_names)
data_dekho['owner'] = data_dekho['owner'].map(lambda label: owner_codes.get(label, label))
data_dekho.head()

In [None]:
# Integer-encode `fuel`, numbering the labels in order of first appearance.
# `fuel_unique_names` stays a Series (label -> code).
# The codes come from the number of labels present rather than a hard-coded
# five-element list.
fuel_unique_names = data_dekho['fuel'].unique()
fuel_unique_names = pd.Series(index=fuel_unique_names, data=range(len(fuel_unique_names)))

# Assign the encoded column back explicitly; the chained
# `data_dekho['fuel'].replace(..., inplace=True)` is deprecated and does not
# update the frame under pandas copy-on-write. Unknown values pass through.
fuel_codes = dict(fuel_unique_names)
data_dekho['fuel'] = data_dekho['fuel'].map(lambda label: fuel_codes.get(label, label))
data_dekho.head()

In [None]:
# Integer-encode `seller_type`, numbering the labels in order of first
# appearance. The codes come from the number of labels present rather than a
# hard-coded [0,1,2].
seller_type_unique_names = data_dekho['seller_type'].unique()
seller_type_unique_names = pd.Series(index=seller_type_unique_names,
                                     data=range(len(seller_type_unique_names)))

# Assign the encoded column back explicitly instead of a chained in-place
# replace (deprecated; a no-op under pandas copy-on-write). Unknown values
# pass through unchanged.
seller_type_codes = dict(seller_type_unique_names)
data_dekho['seller_type'] = data_dekho['seller_type'].map(
    lambda label: seller_type_codes.get(label, label))
data_dekho.head()

In [None]:
# Build the modelling frame: every column except `selling_price`, with the
# selling price re-added as the last column under the name `price`.
selling_price = data_dekho['selling_price']
data = data_dekho.drop(columns='selling_price').assign(price=selling_price)

In [None]:
# Check the reordered frame: `price` should now be the last column.
data.head() 

In [None]:
# Dtypes and non-null counts after encoding.
data.info() 

In [None]:
# Non-null count per column.
data.count() 

In [None]:
# Per column: does it contain at least one missing value?
data.isna().any()

* Now we have a clean dataset with 6 explanatory variables and a target variable called price. 

# Visualising Data - Histograms, Distributions and Bar Charts

In [None]:
# Reminder of the frame's layout before plotting.
data.head() 

In [None]:
# Histogram of the raw target (`price`) in 50 bins.
fig, ax = plt.subplots(figsize=(10,6))
ax.hist(data['price'], bins=50, ec='black', color='#2196f3')
ax.set_xlabel('prices', fontsize=14)
ax.set_ylabel('Nr of Prices', fontsize=14)
ax.set_title("The Distribution of The Target Variable", fontsize=14)
plt.show()

* We have a big problem with outliers. 
* If we leave the data as it is above, the outliers will distort our statistics. 
* So, we should deal with them. 

* In the coming cells, we will look at a box plot and the IQR.

In [None]:
# Skewness of the raw price distribution.
data['price'].skew()

In [None]:
# Smallest price in the data.
data["price"].min()

In [None]:
# Largest price in the data.
data["price"].max()

In [None]:
# Mean price.
data['price'].mean() 

# Box Plot

In [None]:
# Horizontal box plot of `price`.
fig, ax = plt.subplots(figsize=(10,7))
sns.boxplot(x=data['price'], ax=ax)
ax.set_xlabel("price", fontsize=14)
plt.show()

In [None]:
# First and third quartiles of `price` and the interquartile range between them.
Q1, Q3 = data['price'].quantile([0.25, 0.75])
IQR = Q3 - Q1
print(IQR)

In [None]:
# Flag prices outside the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
price = data['price']
res = (price < lower_fence) | (price > upper_fence)

# Summing a boolean mask counts its True entries.
print(res.sum(), "outliers")

* In this case, there are two approaches we can try: 
* a. Data Transformation 
* b. Removing Outliers

## a. Data Transformation 

In [None]:
# Copy of `data` whose `price` column holds the natural log of the price;
# `data` itself is left untouched.
log_prices = np.log(data['price'])
data_log_prices = data.drop(columns=['price']).assign(price=log_prices)

In [None]:
# Histogram of the log-transformed price, with its skewness in the title.
fig, ax = plt.subplots(figsize=(10,6))
ax.hist(data_log_prices['price'], bins=50, ec='black', color='#2196f3')
ax.set_xlabel('prices', fontsize=14)
ax.set_ylabel('Nr of Prices', fontsize=14)
ax.set_title(f"The Distribution of The Target Variable skew:{str(round(data_log_prices['price'].skew(),3))}", fontsize=14)
plt.show()

## b. Removing Outliers 

In [None]:
# Show the row at position 12 in full.
data.iloc[12,:] # example of an outlier

In [None]:
# Index labels of the rows flagged in `res`, and a copy of `data` without them.
out_idx = res.index[res.to_numpy()]
data_rem_out = data.drop(index=out_idx)

# Rows and columns left after removing the outliers.
data_rem_out.shape

In [None]:
# Histogram of `price` after outlier removal, with its skewness in the title.
fig, ax = plt.subplots(figsize=(10,6))
ax.hist(data_rem_out['price'], bins=50, ec='black', color='#2196f3')
ax.set_xlabel('prices', fontsize=14)
ax.set_ylabel('Nr of Prices', fontsize=14)
ax.set_title(f"The Distribution of The Target Variable skew:{str(round(data_rem_out['price'].skew(),3))}", fontsize=14)
plt.show()

* The log-transformed prices are much closer to a normal distribution, 
* so we are going to use them from here on. 

In [None]:
# From here on `data` refers to the log-price frame. This rebinds the name to
# the same object (no copy), so later column assignments on `data` also show up
# in `data_log_prices`.
data = data_log_prices

In [None]:
# Bar chart: number of rows per value of `year`.
fig, ax = plt.subplots(figsize=(10,6))
freq = data['year'].value_counts()
ax.bar(x=freq.index, height=freq.values)
ax.set_xlabel('years', fontsize=14)
ax.set_ylabel('Nr of Years', fontsize=14)
ax.set_title(f"The Distribution of The Year Variable", fontsize=14)
plt.show()

In [None]:
# Earliest year in the data.
data['year'].min() 

In [None]:
# Latest year in the data.
data['year'].max() 

In [None]:
# Mean of the `price` column over rows with year strictly before 2000.
data.loc[data['year'] < 2000, 'price'] .mean() 

In [None]:
# Mean of the `price` column over rows with year 2000 or later. Using >= makes
# this the exact complement of the `year < 2000` group; with a strict > the
# rows for year 2000 belonged to neither group.
data.loc[data['year'] >= 2000, 'price'].mean()

In [None]:
plt.figure(figsize=(10,6))
plt.hist(data['km_driven'], bins=50,ec='black', color='#2196f3') 
plt.xlabel('prices', fontsize=14)
plt.ylabel('Nr of Prices', fontsize=14)
plt.title("The Distribution of The Target Variable", fontsize=14)
plt.show() 

In [None]:
# Mean of km_driven.
data['km_driven'].mean() 

In [None]:
# Smallest km_driven value.
data['km_driven'].min() 

In [None]:
# Largest km_driven value.
data['km_driven'].max() 

In [None]:
# Bar chart of row counts per fuel type.
# value_counts() orders its result by frequency, while fuel_unique_names is in
# order of first appearance, so the counts are looked up code by code to make
# sure every bar sits under its own label.
fuel_counts = data['fuel'].value_counts()
fuel_heights = [fuel_counts.get(code, 0) for code in fuel_unique_names.values]
plt.bar(x=fuel_unique_names.index, height=fuel_heights)
plt.show()

In [None]:
# Summary statistics for every column, rounded to whole numbers.
data.describe().round()

# Correlation 

* We care about two things: 
    a. Strength 
    b. Direction 
* We want features whose correlation with the target variable is not close to zero. 
* Also, the correlation among the features themselves shouldn't be too high; if it is, we should suspect "MULTICOLLINEARITY".

* For now, let's begin with <b>Correlation</b>

## $$ \rho _{XY} = corr(X,Y)$$
## $$ -1.0 \leq \rho _{XY} \leq +1.0 $$

In [None]:
# Pairwise correlation table of all columns.
data.corr() # Pearson Correlation Coefficients

In [None]:
# Mask for the heatmap: 1 on and above the diagonal, 0 below it.
corr_shape = data.corr().shape
mask = np.zeros(corr_shape)
triangle_indices = np.triu_indices(corr_shape[0])
mask[triangle_indices] = True
mask

In [None]:
# Annotated heatmap of the correlation table, using `mask` to hide part of it.
# The style is set before the figure is created: seaborn styles only affect
# figures created afterwards, so the original call after sns.heatmap() did not
# style this plot.
sns.set_style('white')
plt.figure(figsize=(16,10))
sns.heatmap(data.corr(), mask=mask, annot=True, annot_kws={"size": 14})
plt.xticks(fontsize=11)
plt.yticks(fontsize=11)
plt.show()

* Note that we are using the Pearson correlation here.
* Pearson's correlation coefficient measures the strength of the linear association between two variables, but it is meant for continuous variables.

* As you can see above, we have six features, but not all of them are continuous. 
* Only two features are continuous: "year" and "km_driven". These can be analyzed with the Pearson correlation, which is the default in the pandas corr function. 

In [None]:
# Scatter plot of price against km_driven with a fitted regression line;
# the title carries their correlation rounded to 3 decimals.
cor = round(data['price'].corr(data['km_driven']),3)
line_style = {'color': 'cyan'}
point_style = {'color': 'purple', 'alpha': 0.7}
sns.lmplot(x="km_driven", y="price", data=data, height=6,
           line_kws=line_style, scatter_kws=point_style)
plt.title(f'price vs km corr:{cor}', fontsize=14)
plt.show()

* The km_driven feature has a low correlation with the target. 
* We should try to fix this. 
* We may be missing an explanatory variable, or something else may be wrong, 
* so let's have a look at the distribution of km_driven.

In [None]:
# Density histogram of km_driven with a KDE curve; skewness is shown in the title.
# sns.distplot is deprecated; histplot with stat="density" and kde=True is the
# replacement seaborn documents for it.
plt.figure(figsize=(10,6))
sns.histplot(data['km_driven'], kde=True, stat="density", kde_kws=dict(cut=3))
plt.title(f"The Histogram of the km_driven skew:{round(data['km_driven'].skew(),3)}")
plt.show()

In [None]:
# Correlation between price and the natural log of km_driven.
data['price'].corr(np.log(data['km_driven']))

In [None]:
# Replace km_driven in `data` with its natural log.
# The log is taken from the raw `data_dekho` column (whose km_driven is never
# modified in the cells above) rather than from `data` itself, so re-running
# this cell cannot take the log of an already-logged column. The assignment
# aligns on the row index, which `data` shares with `data_dekho`.
km_log = np.log(data_dekho['km_driven'])
data['km_driven'] = km_log
data.head()

In [None]:
# Same price-vs-km_driven regression plot as before, drawn from the current
# contents of `data`; the title carries the correlation rounded to 3 decimals.
cor = round(data['price'].corr(data['km_driven']),3)
lmplot_options = dict(
    height=6,
    line_kws={'color': 'cyan'},
    scatter_kws={'color': 'purple', 'alpha': 0.7},
)
sns.lmplot(x="km_driven", y="price", data=data, **lmplot_options)
plt.title(f'price vs km corr:{cor}', fontsize=14)
plt.show()

In [None]:
# Scatter of year against km_driven; the title carries their correlation.
fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
ax.scatter(data['year'], data['km_driven'], color='indigo', s=80, alpha=0.7)
ax.set_title(f"Year vs Km_Driven Corr: {round(data['year'].corr(data['km_driven']),3)}")
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Km_Driven', fontsize=14)
plt.show()

In [None]:
%%time

# Grid of pairwise plots for every column in `data`; the cell is timed via %%time.
sns.pairplot(data)
plt.show()

# Training & Test Dataset Split

In [None]:
# Separate the target column (`price`) from the explanatory columns.
features = data.drop(columns=['price'])
log_target = data['price']

In [None]:
# 80/20 train/test split. A fixed random_state makes the shuffle, and hence
# every score reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(features, log_target, test_size=0.2,
                                                    random_state=42)

# Multivariable Regression 

In [None]:
# Model using log price and log km_driven
regr = LinearRegression()
model_log_price_km = regr.fit(X_train, y_train)

# Score once per split and reuse the numbers; the training R-squared is kept
# for the comparison table in the evaluation section.
train_r2 = regr.score(X_train, y_train)
test_r2 = regr.score(X_test, y_test)
log_price_log_km = train_r2

print('Intercept is', round(regr.intercept_,3))
print('R-squared for training set is', train_r2)
print('R-squared for testing set is', test_r2)

# One fitted coefficient per feature column.
pd.DataFrame({'coef': regr.coef_}, index=features.columns)

In [None]:
# Undo the log transforms to get price and km_driven back on their original
# scale. np.exp is the direct inverse of np.log and replaces the generic power
# expression np.e**x.
target = np.exp(data['price'])
features = data.drop(['price'], axis=1)
features['km_driven'] = np.exp(data['km_driven'])

In [None]:
# 80/20 train/test split with a fixed random_state so the scores below are
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2,
                                                    random_state=42)

In [None]:
# Model using normal price and normal km_driven
regr = LinearRegression()
regr.fit(X_train, y_train)

# Score once per split and reuse the numbers; the training R-squared is kept
# for the comparison table in the evaluation section.
train_r2 = regr.score(X_train, y_train)
test_r2 = regr.score(X_test, y_test)
norm_price_norm_km = train_r2

print('Intercept is', round(regr.intercept_,3))
print('R-squared for training set is', train_r2)
print('R-squared for testing set is', test_r2)

# One fitted coefficient per feature column.
pd.DataFrame({'coef': regr.coef_}, index=features.columns)

In [None]:
# Model using log price and norm km_driven

target = data['price']
features = data.drop(['price'], axis=1)
# np.exp is the direct inverse of the earlier np.log on km_driven.
features['km_driven'] = np.exp(data['km_driven'])

# A fixed random_state keeps the split, and the scores below, reproducible.
# Later cells reuse this X_train / y_train.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2,
                                                    random_state=42)


regr = LinearRegression()
regr.fit(X_train, y_train)

# Training R-squared, kept for the comparison table in the evaluation section.
log_price_norm_km = regr.score(X_train, y_train)

print('Intercept is', round(regr.intercept_,3))
print('R-squared for training set is', log_price_norm_km)
print('R-squared for testing set is', regr.score(X_test, y_test))

# One fitted coefficient per feature column.
pd.DataFrame(regr.coef_, columns=['coef'], index=features.columns)

# Model Evaluation 

## a. R-Squared

In [None]:
# Training R-squared of the three fitted models, one row per model.
arr = np.asanyarray([log_price_log_km, log_price_norm_km, norm_price_norm_km])
model_labels = ['LOG PRICE AND LOG KM', 'LOG PRICE AND NORMAL KM', 'NORMAL PRICE AND NORMAL KM']

pd.DataFrame({'R-Squared': arr}, index=model_labels)

## b. P-Value

In [None]:
# Fit an OLS model on the training split (with an added constant column) and
# show the p-value of each term, rounded to 3 decimals.
X_incl_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_incl_const)
results = model.fit()
results.pvalues.round(3)

In [None]:
# Same OLS fit, but with km_driven log-transformed.
# assign() returns a new frame, so X_train keeps its original km_driven values.
# The previous `log_km_x_train = X_train` only created a second name for the
# same object, so the log was written into X_train itself: every later cell
# that reads X_train got logged values, and re-running this cell logged them
# again.
log_km_x_train = X_train.assign(km_driven=np.log(X_train['km_driven']))
X_incl_const_log_km = sm.add_constant(log_km_x_train)
model = sm.OLS(y_train, X_incl_const_log_km)
results = model.fit()


In [None]:
# P-values of the most recently fitted OLS model, rounded to 3 decimals.
round(results.pvalues, 3)

* As you can see above, km_driven has a p-value of about 0.049, so it is only borderline statistically significant at the 5% level. 
* We tried a data transformation for it using the log function, but in this case the p-value increased.

## c. Multicollinearity

* Based on our correlation table, we didn't suspect multicollinearity. 
* If two or more variables were highly related to one another, they wouldn't provide unique or independent information for our model. 
* But we're going to check it with the VIF. 
* VIF (Variance Inflation Factor). 

In [None]:
# VIF of the column at position 1 of the log-km design matrix.
exog_log_km = X_incl_const_log_km.to_numpy()
variance_inflation_factor(exog=exog_log_km, exog_idx=1)

In [None]:
# VIF for every column of the log-km design matrix, shown as a one-row table.
# The column count and labels are taken from the same matrix the VIFs are
# computed on. Previously they came from X_incl_const and a hard-coded
# reshape(1,7), which fails as soon as X_incl_const has a different number of
# columns (it is rebuilt without km_driven further down).
exog_log_km = np.asanyarray(X_incl_const_log_km)
vifs = [variance_inflation_factor(exog=exog_log_km, exog_idx=i)
        for i in range(exog_log_km.shape[1])]
pd.DataFrame([vifs], columns=X_incl_const_log_km.columns, index=['VIF'])

## d. Model Simplification & the BIC

In [None]:
# Model using log price and norm km_driven

# OLS on all columns of X_train plus a constant; report fit quality and BIC.
X_incl_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_incl_const)
results = model.fit()

for label, value in (("R-squared is", results.rsquared), ("BIC is", results.bic)):
    print(label, value)

In [None]:
# Model using log price without km_driven

# Same OLS fit with the km_driven column left out, for comparison with the
# full model's R-squared and BIC.
X_without_km = X_train.drop(columns=['km_driven'])
X_incl_const = sm.add_constant(X_without_km)
model = sm.OLS(y_train, X_incl_const)
results = model.fit()

for label, value in (("R-squared is", results.rsquared), ("BIC is", results.bic)):
    print(label, value)

# Residual and Residual Plots

* We'll analyze the residuals vs the predicted values and the distribution of the residuals. The residuals vs predicted values plot shouldn't show any pattern. If you spot a pattern in the scatter plot, you may be missing an explanatory variable, or something else may be wrong with your model. 

* For a good linear regression model, the residuals should also be normally distributed. This is important because it shows how well your model is working. 

In [None]:
# Predicted log prices vs Actual Log prices

regr = LinearRegression().fit(X_train, y_train)

# Give the predictions the same index as y_train. Series.corr aligns the two
# Series on their index labels; y_train carries the shuffled row labels from
# train_test_split, so a default 0..n-1 index paired each prediction with an
# unrelated actual value (or dropped it).
predicted_values = pd.Series(regr.predict(X_train), index=y_train.index)
corr = np.round(y_train.corr(predicted_values), 3)

plt.figure(figsize=(10,6))

plt.scatter(x=predicted_values, y=y_train)
plt.plot(y_train, y_train, c='red')  # y = x reference line (perfect prediction)
plt.title(f"Predicted log prices vs Actual Log prices {corr}", fontsize=14)
plt.xlabel('Predicted Price',fontsize=14)
plt.ylabel('Actual Price', fontsize=14)


# residual vs predicted values
plt.figure(figsize=(10,6))
y = np.asanyarray(y_train)
y_hat = np.asanyarray(predicted_values)
resi = y - y_hat

# x carries the predictions and y the residuals; the axis labels were swapped.
plt.scatter(x=predicted_values, y=resi, c="skyblue",alpha=0.7)
plt.xlabel('Predicted Values', fontsize=14)
plt.ylabel('Residual', fontsize=14)
plt.title("Residual vs Predicted Values", fontsize=14)

# Distribution of the residuals. sns.distplot is deprecated; histplot with
# stat="density" and kde=True is the replacement seaborn documents for it.
plt.figure(figsize=(10,6))
sns.histplot(resi, kde=True, stat="density", kde_kws=dict(cut=3))
plt.title(f'The Distribution of the Residuals Skew:{round(pd.Series(resi).skew(), 2)}', fontsize=14)

plt.show()

In [None]:
# Training-set R-squared of the most recently fitted `regr`.
print("R-squared is", regr.score(X_train, y_train))