In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


from sklearn.svm import SVR
from sklearn.preprocessing import LabelEncoder
from math import sqrt

import pickle

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw dataset from the working directory and preview the first two rows.
data = pd.read_csv('SecondCar.csv')
data.head(2)

## Data Preprocessing

In [None]:
# Column dtypes and non-null counts of the raw frame.
data.info()

In [None]:
# Show only the rows that contain at least one missing value.
has_missing = data.isna().any(axis='columns')
data[has_missing]

In [None]:
# Impute missing values column by column: median for the listed numeric dtypes,
# most frequent value (mode) for every other column.
NUMERIC_DTYPES = ('float64', 'int64', 'int32', 'float32')

for column in data:
    values = data[column]
    is_numeric = any(values.dtype == dtype_name for dtype_name in NUMERIC_DTYPES)
    fill_value = values.median() if is_numeric else values.mode().iloc[0]
    data[column] = values.fillna(fill_value)

# Re-check the non-null counts after imputation.
data.info()

In [None]:
# Count distinct values per column; the resulting single column is labelled 0.
unique_count = data.nunique().to_frame()
unique_count

In [None]:
# Attach the dataset's row count to every entry and give the distinct-count
# column (labelled 0) a readable name.
unique_count['TotalCount'] = len(data)
unique_count = unique_count.rename(columns={0: 'unique_value'})
unique_count

In [None]:
# Distinct values per column as a percentage of the total row count.
distinct_ratio = unique_count['unique_value'] / unique_count['TotalCount']
unique_count['percentage'] = distinct_ratio * 100
unique_count

### 1. If data type is categorical & unique % == 100% then delete the column

### 2. If the data type is categorical & the unique percentage is greater than 1 percent, then try feature engineering

In [None]:
# Column dtypes, used together with the unique-value table to apply the two rules above.
data.info()

In [None]:
# Distinct-value counts and percentages per column.
unique_count

In [None]:
# Preview two rows before feature engineering.
data.head(2)

### Feature Engineering

In [None]:
# Keep the first space-separated token of 'name' as a new 'company_name' column.
first_token = data['name'].str.split(' ').str.get(0)
data['company_name'] = first_token
data.head()

In [None]:
# Print every distinct company name as a plain Python list.
print(list(data.company_name.unique()))

In [None]:
# Same distinct values, shown as the array returned by unique().
data.company_name.unique()

In [None]:
# Number of distinct company names.
len(data.company_name.unique())

In [None]:
# 'company_name' has been derived from 'name', so drop the original column.
data = data.drop(columns=['name'])
data.head(2)

### Categorical variable encoding

In [None]:
# One LabelEncoder instance, reused for every categorical column in the cells below.
lb_make = LabelEncoder()

In [None]:
# Encode 'fuel' into a new 'fuel_copy' column; the original 'fuel' column is left unchanged here.
data['fuel_copy'] = lb_make.fit_transform(data['fuel'])
data.head(10)

In [None]:
# Replace each categorical column with integer codes. The single encoder is
# re-fitted for every column, so afterwards it only holds the last column's classes.
for column in ('fuel', 'seller_type', 'transmission', 'owner', 'company_name'):
    data[column] = lb_make.fit_transform(data[column])
data.head(10)

In [None]:
data.to_csv('Reg.csv') # export the encoded frame to CSV; the row index is written as well because index=False is not passed

### Identify X & Y

In [None]:
# The target is the selling price; every remaining column is a predictor.
TARGET = 'selling_price'
X = data.drop(columns=TARGET)
y = data[TARGET]

In [None]:
# Summary statistics with one row per predictor; the 'max' column shows the differing scales.
X.describe().transpose()

### Converting all max values to 1 by dividing each column by its own max value

#### This step is performed to offer equal opportunity to all 'X' columns to contribute to 'Y'

In [None]:
# Names of all predictor columns, used by the scaling step.
All_X_column = X.columns.tolist()
print(All_X_column)

In [None]:
# Divide every predictor by its own maximum so that each column tops out at 1.
column_max = X[All_X_column].max()
X[All_X_column] = X[All_X_column].div(column_max, axis='columns')
X.describe().T

In [None]:
# Preview the predictors after max-scaling.
X.head()

In [None]:
# Save the scaled predictors; the row index is written as the first CSV column.
X.to_csv('x.csv')

### Correlation

In [None]:
# Pairwise correlation between predictors, rendered as a colour-graded table.
corr = X.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 8]
import seaborn as sns

# Compute the correlation matrix once and mask its upper triangle (diagonal
# included), so each feature pair is annotated only once.
feature_corr = X.corr()
matrix = np.triu(feature_corr)
sns.heatmap(feature_corr, annot=True, mask=matrix)


In [None]:
# 'fuel_copy' duplicates 'fuel', so its correlation falls outside the
# -0.9..0.9 band and the column is removed.
X = X.drop(columns='fuel_copy')
X.head(2)

In [None]:
# Rule: if the correlation between two columns is above 0.9 or below -0.9, delete one of them

In [None]:
# Final dtype check of the encoded frame (note: this is 'data', not the scaled 'X').
data.info()

### concluded data preprocessing

# Splitting data into 4 Parts

In [None]:
# Hold out 30% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=40
)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

# Linear Regression

In [None]:
lr=LinearRegression()  # create an unfitted ordinary least-squares model
lr.fit(X_train,y_train)  # fit it on the training split

In [None]:
# Score the linear model on both splits: RMSE first, then R^2, for the
# training rows and afterwards for the held-out test rows.
pred_train_lr = lr.predict(X_train)
pred_test_lr = lr.predict(X_test)

for y_true, y_hat in ((y_train, pred_train_lr), (y_test, pred_test_lr)):
    print(np.sqrt(mean_squared_error(y_true, y_hat)))
    print(r2_score(y_true, y_hat))

In [None]:
#get importance
importance=lr.coef_
# summarize feature importance
for i,v in enumerate(importance):
    print('X_coulmn:%0d,Score:%.5f'%(i,v))

In [None]:
X.head() # column order matches the coefficient indices printed above
# A positive coefficient raises the predicted price as the feature grows; a negative one lowers it.
# Negative coefficients are just as informative as positive ones and should not be ignored.

# final model equation
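$$
\begin{aligned}
\text{Selling\_price} ={} & 35965154.21 \cdot \text{year} + 13615.71 \cdot \text{km\_driven} - 68778.96 \cdot \text{fuel} - 31302.42 \cdot \text{seller\_type} - 92138.19 \cdot \text{transmission} \\
& + 81.20 \cdot \text{owner} - 96371.23 \cdot \text{Rating} + 9119096.01 \cdot \text{ExShowroom Price} - 21363.75 \cdot \text{company\_name}
\end{aligned}
$$

Each feature in this equation is the max-scaled value used for training, and the fitted intercept (`lr.intercept_`) must be added to the sum to obtain the prediction.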


## Root mean square error
Error -- e.g. errors = (2, -2, 3, 4, -3, -4): the total error is 0 because positive and negative errors cancel out   
Squared error -- (4, 4, 9, 16, 9, 16) -- total squared error = 58   
Mean squared error -- 58/6 = 9.667  
Root mean squared error -- sqrt(9.667) = 3.11; this error should be as small as possible  

# Ridge Regression

In [None]:
# Ridge regression with penalty strength alpha=0.01, fitted on the training split.
rr=Ridge(alpha=0.01)
rr.fit(X_train,y_train)


In [None]:
# RMSE and R^2 of the ridge model, training split first and test split second.
pred_train_rr = rr.predict(X_train)
pred_test_rr = rr.predict(X_test)

for y_true, y_hat in ((y_train, pred_train_rr), (y_test, pred_test_rr)):
    print(np.sqrt(mean_squared_error(y_true, y_hat)))
    print(r2_score(y_true, y_hat))

In [None]:
#get importance
importance=rr.coef_
# summarize feature importance
for i,v in enumerate(importance):
    print('X_coulmn:%0d,Score:%.5f'%(i,v))

In [None]:
# Feature columns, in the same order as the ridge coefficients printed above.
X.head()

# Lasso Regression

In [None]:
# Lasso regression with penalty strength alpha=0.01, fitted on the training split.
model_lasso=Lasso(alpha=0.01)
model_lasso.fit(X_train,y_train)

In [None]:
# RMSE and R^2 of the lasso model, training split first and test split second.
pred_train_lasso = model_lasso.predict(X_train)
pred_test_lasso = model_lasso.predict(X_test)

for y_true, y_hat in ((y_train, pred_train_lasso), (y_test, pred_test_lasso)):
    print(np.sqrt(mean_squared_error(y_true, y_hat)))
    print(r2_score(y_true, y_hat))

In [None]:
#get importance
importance=model_lasso.coef_
# summarize feature importance
for i,v in enumerate(importance):
    print('X_coulmn:%0d,Score:%.5f'%(i,v))

In [None]:
# Feature columns, in the same order as the lasso coefficients printed above.
X.head()

# ElasticNet Regression
Combination of Ridge (L2 penalty) and Lasso (L1 penalty) regression

In [None]:
# ElasticNet with penalty strength alpha=0.0001; l1_ratio is not passed, so the library default mix is used.
model_enet=ElasticNet(alpha =0.0001)
model_enet.fit(X_train,y_train)

In [None]:
# RMSE and R^2 of the ElasticNet model, training split first and test split second.
# Both test metrics are computed from the ElasticNet predictions (pred_test_enet).
pred_train_enet = model_enet.predict(X_train)
print(np.sqrt(mean_squared_error(y_train, pred_train_enet)))
print(r2_score(y_train, pred_train_enet))

pred_test_enet = model_enet.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, pred_test_enet)))
print(r2_score(y_test, pred_test_enet))

In [None]:
#get importance
importance=model_enet.coef_
# summarize feature importance
for i,v in enumerate(importance):
    print('X_coulmn:%0d,Score:%.5f'%(i,v))

In [None]:
# Feature columns, in the same order as the ElasticNet coefficients printed above.
X.head()

# KNeighbors Regression
Predicts each value from the n nearest (most similar) training samples, averaging their target values

In [None]:
# k-nearest-neighbours regressor that predicts from the 2 closest training rows.
neigh=KNeighborsRegressor(n_neighbors=2)
neigh.fit(X_train,y_train)

In [None]:
# RMSE and R^2 of the nearest-neighbour model, training split first and test split second.
pred_train_neigh = neigh.predict(X_train)
pred_test_neigh = neigh.predict(X_test)

for y_true, y_hat in ((y_train, pred_train_neigh), (y_test, pred_test_neigh)):
    print(np.sqrt(mean_squared_error(y_true, y_hat)))
    print(r2_score(y_true, y_hat))

# Support Vector Regression

In [None]:
# Support vector regressor fitted on the training split.
# NOTE(review): y_train is the unscaled selling price (only X was max-scaled), so
# epsilon=0.3 and C=1.0 look very small relative to the target — confirm these settings are intended.
regr=SVR(C=1.0,epsilon=0.3,gamma='auto')
regr.fit(X_train,y_train)

In [None]:
# RMSE and R^2 of the support vector model, training split first and test split second.
pred_train_regr = regr.predict(X_train)
pred_test_regr = regr.predict(X_test)

for y_true, y_hat in ((y_train, pred_train_regr), (y_test, pred_test_regr)):
    print(np.sqrt(mean_squared_error(y_true, y_hat)))
    print(r2_score(y_true, y_hat))

# Compare:(For best selection of method)
1. Test accuracy (test-set R² and RMSE) -- select the most accurate model
2. In case of a tie in accuracy -- check for balanced feature (column) importance


# Weight File Creation

In [None]:
# Persist the fitted linear regression model to disk.
# The context manager guarantees the file is flushed and closed after writing.
filename = 'Regression_weight_file.sav'
with open(filename, 'wb') as weight_file:
    pickle.dump(lr, weight_file)

In [None]:
# Load the rows to score with the saved model.
# NOTE(review): presumably these columns match X_train's columns, order and max-scaling — verify.
weight_file_check_data=pd.read_csv(r'Weight_File_R.csv')
weight_file_check_data

In [None]:
#loading the weight file
filename='Regression_weight_file.sav'
model_load=pickle.load(open(filename,'rb'))
threshold=0.5
pred_test_lr=lr.predict(weight_file_check_data)
y_pred = (pred_test_lr).astype('int')
print(y_pred)