In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.preprocessing import LabelEncoder

from sklearn import model_selection

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR   ##Support Vector Regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt



import matplotlib.pyplot as plt

In [None]:
# Load the used-car dataset into `data`; the relative path is resolved
# against the notebook's current working directory.
data = pd.read_csv(r'SecondCar.csv')

In [None]:
# Preview the first rows of the freshly loaded data.
data.head()

# Missing value treatment

In [None]:
# Check for missing values: info() lists the non-null count and dtype of every column.
data.info()

In [None]:
# Impute missing values column by column:
#   * numeric columns    -> column median
#   * every other column -> column mode (most frequent value)
#
# The fill values are collected first and applied in ONE DataFrame.fillna
# call. Filling the whole frame with the mode from inside the loop would also
# overwrite the gaps of numeric columns that have not been visited yet, and
# `data[col].fillna(..., inplace=True)` is a chained in-place assignment that
# does not reliably write back into `data` (it is a no-op under pandas
# Copy-on-Write).
fill_values = {}
for col in data.columns:
    column = data[col]
    if not column.isna().any():
        continue  # nothing to impute in this column

    is_numeric = (pd.api.types.is_numeric_dtype(column)
                  and not pd.api.types.is_bool_dtype(column))
    if is_numeric:
        median = column.median()
        if not pd.isna(median):  # an all-NaN column has no median; leave it untouched
            fill_values[col] = median
    else:
        modes = column.mode()
        if not modes.empty:  # an all-NaN column has no mode; leave it untouched
            fill_values[col] = modes.iloc[0]

# A dict passed to fillna maps column name -> replacement value.
data = data.fillna(value=fill_values)

In [None]:
##Check data after missing value treatment

In [None]:
# Re-inspect the non-null counts to confirm no missing values remain.
data.info()

# unique values in each column

In [None]:
# Number of distinct non-null values in each column.
data.nunique(dropna=True)

In [None]:
# Keep the per-column distinct-value counts in a one-column frame
# (the counts land in the column labelled 0).
unique_count = data.nunique(dropna=True).to_frame()
# Every row gets the total number of rows, as the denominator for a percentage.
unique_count['Total_Value_Count'] = len(data)

In [None]:
# Display the distinct-value counts next to the total row count.
unique_count

In [None]:
# Share of distinct values per column, as a percentage of the row count.
distinct_share = unique_count[0].div(unique_count['Total_Value_Count'])
unique_count['Percent'] = distinct_share.mul(100)

# Feature Engineering

In [None]:
# Derive the manufacturer as the first space-separated token of `name`.
name_tokens = data['name'].str.split(' ')
data['Company_Name'] = name_tokens.str.get(0)

data.head(10)

In [None]:
# Distinct-value counts again, now including the derived Company_Name column.
data.nunique(dropna=True)

# Column Type Reidentification : replace category by numerical

In [None]:
# List the distinct fuel categories before they are encoded.
data['fuel'].unique()

In [None]:
# Demonstrate label encoding on one column first: `fuel_Copy` receives the
# integer code that LabelEncoder assigns to each distinct `fuel` value.
lb_make = LabelEncoder()
data["fuel_Copy"] = lb_make.fit_transform(data["fuel"])
# Show the original labels and their codes side by side.
data[["fuel", "fuel_Copy"]].head(11)

In [None]:
# Replace each categorical column by its integer label codes.
# The shared encoder is re-fitted per column, so after the loop `lb_make`
# only remembers the classes of the last column ("owner").
for categorical_col in ("fuel", "Company_Name", "seller_type", "transmission", "owner"):
    data[categorical_col] = lb_make.fit_transform(data[categorical_col])

In [None]:
# Spot-check the first two rows after encoding.
data.head(2)

## Drop remaining categorical columns

In [None]:
# `name` is free text that was not encoded; remove it
# (Company_Name was derived from it earlier).
data = data.drop(columns=['name'])
data.head(2)

# Identify X & Y

In [None]:
# Independent variables: every column except the target.
# `columns=` is used because pandas 2.0 no longer accepts `axis` as a second
# positional argument to DataFrame.drop (drop('selling_price', 1) raises
# TypeError there).
X = data.drop(columns='selling_price')
# Dependent variable: the price the model has to predict.
y = data['selling_price']

In [None]:
# Preview the feature matrix.
X.head(2)

In [None]:
# Preview the target values.
y.head(10)

In [None]:
##Correlation between columns

# A correlation coefficient ranges from -1 to 1

In [None]:
# Pairwise correlation matrix of the features, rendered as a colour-graded
# table using the 'coolwarm' colormap.
corr = X.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# fuel_Copy holds the same label codes as the encoded fuel column, so it adds
# no information; remove it from the features.
X = X.drop(columns=['fuel_Copy'])

X.head(2)

# Feature Scaling

In [None]:
# Summary statistics of the features before scaling.
X.describe()

### Converting max value to 1

In [None]:
# Scale every feature by its column maximum so that each column's largest
# value becomes 1.
all_X = list(X.columns)
column_max = X[all_X].max()
# A column whose maximum is 0 (e.g. a label-encoded column with a single
# category) would give 0/0 = NaN and break every later model fit; divide
# such columns by 1 instead, which leaves them unchanged.
column_max = column_max.replace(0, 1)
# NOTE(review): the maxima are taken over the full data set, i.e. before the
# train/test split, so the test rows influence the scaling.
X[all_X] = X[all_X] / column_max

X.describe()

# Conclude data preprocessing

# Splitting data into 4 parts (X_train, X_test, y_train, y_test)

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=40,
)
# Report the shape of each of the four parts, one per line.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

### Linear Regression

In [None]:
# Create the ordinary-least-squares model and fit it on the training split.
lr = LinearRegression()  #declare &assign model name
lr.fit(X_train, y_train)

In [None]:
# Predict on both splits, then print RMSE followed by R^2:
# first for the training data, then for the test data.
pred_train_lr = lr.predict(X_train)
pred_test_lr = lr.predict(X_test)

# y_test holds the actual values of the held-out 30%.
for actual, predicted in ((y_train, pred_train_lr), (y_test, pred_test_lr)):
    print(np.sqrt(mean_squared_error(actual, predicted)))
    print(r2_score(actual, predicted))

In [None]:
# Use the fitted linear coefficients as a rough measure of feature importance.
importance = lr.coef_
# One line per feature: positional column index and its coefficient.
for column_idx, coefficient in enumerate(importance):
    print('X_Column: %0d, Score: %.5f' % (column_idx, coefficient))

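# final model equation::
Selling_price = 35965154.21 * year + 13615.71 * km_driven - 68778.96 * fuel - 31302.42 * seller_type - 92138.19 * transmission
                + 81.20 * owner - 96371.23 * Rating + 9119096.01 * ExShowroom Price - 21363.75 * Company_Name

Note: every feature in this equation is the max-scaled value (column value / column maximum), and the fitted intercept (`lr.intercept_`) must be added to obtain the prediction.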



# Root mean square error
error -- e.g. errors = (2, -2, 3, 4, -3, -4): the raw errors sum to 0, because positive and negative errors cancel out   
squared error -- (4, 4, 9, 16, 9, 16) -- total squared error = 58   
mean squared error -- 58/6 = 9.667  
root mean squared error -- sqrt(9.667) = 3.11  

# r^2 --- proportion of the variance in y explained by all X

In [None]:
# Look at the feature matrix that the models are trained on.
X.head()

### Ridge Regression

In [None]:
# Ridge regression (L2 penalty, alpha=0.01); fit() returns the fitted estimator.
rr = Ridge(alpha=0.01).fit(X_train, y_train)

pred_train_rr = rr.predict(X_train)
pred_test_rr = rr.predict(X_test)

# Print RMSE then R^2: first for the training split, then for the test split.
for actual, predicted in ((y_train, pred_train_rr), (y_test, pred_test_rr)):
    print(np.sqrt(mean_squared_error(actual, predicted)))
    print(r2_score(actual, predicted))

In [None]:
# Ridge coefficients as a rough measure of feature importance.
importance = rr.coef_
# One line per feature: positional column index and its coefficient.
for feature_idx in range(len(importance)):
    print('Feature: %0d, Score: %.5f' % (feature_idx, importance[feature_idx]))

### Lasso Regression

In [None]:
# Lasso regression (L1 penalty, alpha=0.2); fit() returns the fitted estimator.
model_lasso = Lasso(alpha=0.2).fit(X_train, y_train)

# Training split: RMSE, then R^2.
pred_train_lasso = model_lasso.predict(X_train)
rmse_train_lasso = np.sqrt(mean_squared_error(y_train, pred_train_lasso))
print(rmse_train_lasso)
print(r2_score(y_train, pred_train_lasso))

# Test split: RMSE, then R^2.
pred_test_lasso = model_lasso.predict(X_test)
rmse_test_lasso = np.sqrt(mean_squared_error(y_test, pred_test_lasso))
print(rmse_test_lasso)
print(r2_score(y_test, pred_test_lasso))

In [None]:
# Lasso coefficients as a rough measure of feature importance.
importance = model_lasso.coef_
# One line per feature: positional column index and its coefficient.
for position, weight in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (position, weight))

### ElasticNet Regression
#### ElasticNet combines the properties of both Ridge and Lasso regression

In [None]:
# ElasticNet (alpha=0.0001); fit() returns the fitted estimator.
model_enet = ElasticNet(alpha=0.0001).fit(X_train, y_train)

pred_train_enet = model_enet.predict(X_train)
pred_test_enet = model_enet.predict(X_test)

# Print RMSE then R^2: first for the training split, then for the test split.
for actual, predicted in ((y_train, pred_train_enet), (y_test, pred_test_enet)):
    print(np.sqrt(mean_squared_error(actual, predicted)))
    print(r2_score(actual, predicted))

In [None]:
# ElasticNet coefficients as a rough measure of feature importance.
importance = model_enet.coef_
# One line per feature: positional column index and its coefficient.
for feature_no, coef_value in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (feature_no, coef_value))

### KNeighbors Regression
Predicts each value from the n nearest (most similar) rows of the training set.

In [None]:
# k-nearest-neighbours regression with k=2; fit() returns the fitted estimator.
neigh = KNeighborsRegressor(n_neighbors=2).fit(X_train, y_train)

pred_train_neigh = neigh.predict(X_train)
pred_test_neigh = neigh.predict(X_test)

# Print RMSE then R^2: first for the training split, then for the test split.
for actual, predicted in ((y_train, pred_train_neigh), (y_test, pred_test_neigh)):
    print(np.sqrt(mean_squared_error(actual, predicted)))
    print(r2_score(actual, predicted))

#### 'KNeighborsRegressor' object has no attribute 'coef_'

### Support vector regression

In [None]:
# Support vector regression; with gamma='auto' and no explicit kernel the
# estimator's default kernel is used.
regr = SVR(C=1.0, epsilon=0.3, gamma='auto')
regr.fit(X_train, y_train)

# Training split: RMSE, then R^2.
pred_train_regr = regr.predict(X_train)
print(np.sqrt(mean_squared_error(y_train, pred_train_regr)))
# Score the SVR's own training predictions. Using pred_train_neigh here would
# report the k-NN model's training R^2 under the SVR heading.
print(r2_score(y_train, pred_train_regr))

# Test split: RMSE, then R^2.
pred_test_regr = regr.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, pred_test_regr)))
print(r2_score(y_test, pred_test_regr))

# Compare:
1. Test accuracy -- select the most accurate model (lowest test RMSE / highest test r^2)
2. In case of a tie in accuracy -- check for balanced feature (column) importance