In [None]:
import pandas as pd 
import numpy as np 
import matplotlib
import matplotlib.pyplot as plt 
import os 
import seaborn as sns 
sns.set_style("darkgrid")

%matplotlib inline 

In [None]:

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# Absolute path of the current working directory.
root_dir = os.path.abspath('.')
print(root_dir)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

# Path of the CSV file that is read into `train_df` in the next cell.
train  = '/kaggle/input/vehicle-dataset-from-cardekho/car data.csv'

In [None]:
# Load the CSV at `train` into a DataFrame and preview its first rows.
train_df = pd.read_csv(train)
train_df.head()

In [None]:
# (rows, columns) of the raw dataset.
train_df.shape

In [None]:
# Show the distinct values of each discrete column:
# Fuel_Type, Seller_Type, Transmission and Owner.
for discrete_column in ('Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'):
    print(train_df[discrete_column].unique())

In [None]:
# Fraction of missing values per column (mean of the boolean null mask).
train_df.isna().mean()

In [None]:
# Summary statistics of the raw dataset.
train_df.describe()

In [None]:
# Column labels of the raw dataset; a subset of them is selected in the next cell.
train_df.columns

In [None]:
# Keep only the modelling columns. An explicit copy is taken because later cells
# add and drop columns on `final_dataset`; doing that on a slice of `train_df`
# triggers pandas' SettingWithCopyWarning.
final_dataset = train_df[[ 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']].copy()

final_dataset.head()

In [None]:
# Add a helper column holding the current calendar year; it is used in the next
# cell to derive the age of each car.
# NOTE(review): the value comes from the system clock, so the derived ages change
# depending on when the notebook is run.
from datetime import datetime
now = datetime.now()
final_dataset['Current_Year'] = now.year
final_dataset['Current_Year'].head()

In [None]:
# Age of the car in years: current year minus the value in the `Year` column.
final_dataset['age_of_car'] = final_dataset['Current_Year'] - final_dataset['Year']


In [None]:
# Preview the frame with the new `Current_Year` and `age_of_car` columns.
final_dataset.head()

In [None]:
# Since we have derived the age of the car, `Year` and `Current_Year` are no longer needed.
# Rebind to a new frame instead of dropping in place: this avoids mutating an
# object other cells already displayed and removes the redundant `axis=1`
# (`columns=` already identifies the axis).
final_dataset = final_dataset.drop(columns=['Year','Current_Year'])

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level of
# each category so the remaining indicator columns are not redundant.
final_dataset = pd.get_dummies(final_dataset, drop_first=True)

In [None]:
# Preview the encoded frame.
final_dataset.head()

In [None]:
# Pairwise correlation between the columns of the encoded frame.
final_dataset.corr()

In [None]:
# Pairwise scatter plots / histograms of every column.
# sns.pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) call only produced an extra empty figure and its
# figsize was never applied to the grid.
sns.pairplot(final_dataset)
plt.show()

In [None]:
# Correlation matrix of the encoded frame, computed once and reused for the plot.
corrmat = final_dataset.corr()
# Labels of the correlated features (kept for any later cell that refers to it).
top_corr_features = corrmat.index
plt.figure(figsize=(15,10))
# Plot the matrix already held in `corrmat`; re-selecting the same columns and
# calling .corr() again recomputed an identical result.
sns.heatmap(corrmat, annot=True, cmap='RdYlGn')
plt.show()

### Dark red indicates that two features are strongly negatively correlated, and dark green indicates that they are strongly positively correlated.

In [None]:
# Separate the target from the features by column name rather than by position,
# so a change in column order cannot silently select the wrong target.
# `Selling_Price` is the column the rest of the notebook treats as the target.
X = final_dataset.drop(columns='Selling_Price')
y = final_dataset['Selling_Price']

In [None]:
### Feature Importance
from sklearn.ensemble import ExtraTreesRegressor
# Seeded (same seed as the train/test split) so the reported feature importances
# are identical on every Restart & Run All.
model = ExtraTreesRegressor(random_state=42)
model.fit(X,y)

In [None]:
# Raw importance score of each feature, in the column order of X.
print(model.feature_importances_)

In [None]:
# Label each importance score with its feature name, then chart the five largest
# as horizontal bars.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
top_five_importances = feat_importances.nlargest(5)
top_five_importances.plot.barh()
plt.show()

# CREATING A BASE MODEL 

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

import pickle 

In [None]:

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train,X_test,y_train, y_test = train_test_split(X,y, test_size =.2, random_state = 42)

In [None]:
# (rows, features) of the training split.
X_train.shape

In [None]:
# Preview the training features.
X_train.head()

In [None]:
# Number of cross-validation folds used by buildModel.
CV=5

# R^2 score on the test split, one entry appended per buildModel call.
R2=[]

# Mean cross-validation score, one entry appended per buildModel call.
MODELCV=[]

# Model name, one entry appended per buildModel call.
# These three lists are combined into the `results` table later in the notebook.
MODELALGO=[]

def buildModel(model_Algorithm, trainX, trainY, testX, testY, model_name=None):
    """Fit an estimator, score it, record the scores and pickle the fitted model.

    Parameters
    ----------
    model_Algorithm : estimator exposing ``fit`` and ``predict``.
    trainX, trainY : training features and target.
    testX, testY : hold-out features and target used for the R^2 score.
    model_name : str, optional
        Label stored in ``MODELALGO`` and used as the pickle file stem
        (``<model_name>.pkl``). Defaults to the estimator's class name.

    Returns
    -------
    pandas.DataFrame
        Columns ``Selling_Price`` (the values of ``testY``) and
        ``Selling_Price_Pred`` (the predictions), on a fresh 0..n-1 index.

    Side effects
    ------------
    Appends one entry to each of the module-level lists ``MODELALGO``, ``R2``
    and ``MODELCV``, prints the scores, and writes ``<model_name>.pkl`` to the
    current working directory.
    """
    # The original concatenated None + '.pkl' when no name was given.
    if model_name is None:
        model_name = type(model_Algorithm).__name__

    model_Algorithm.fit(trainX, trainY)
    predictions = model_Algorithm.predict(testX)
    r2 = r2_score(testY,predictions)

    cross_val=cross_val_score(model_Algorithm,trainX,trainY,cv=CV)

    # Record all three values together, only after everything succeeded, so the
    # lists always have equal length when they are combined into a table.
    MODELALGO.append(model_name)
    R2.append(r2)
    MODELCV.append(cross_val.mean())

    print(model_Algorithm,":\n") 
    print("r_2 score :",r2,"\n")
    print("CV scores:",cross_val,"\n")
    print("CV scores mean:",cross_val.mean())

    # Use the testY argument (not the notebook-global y_test) so the returned
    # frame always matches the data that was actually scored.
    actual = pd.Series(testY, name='Selling_Price').reset_index(drop=True)
    pred = pd.Series(predictions, name='Selling_Price_Pred')
    combined = pd.concat([actual, pred], axis=1)

    print(combined.head())

    # Persist the fitted model; the context manager closes the file even if
    # pickling fails.
    with open(model_name+'.pkl','wb') as pklFile:
        pickle.dump(model_Algorithm, pklFile)

    return combined

# SIMPLE LINEAR REGRESSION

In [None]:
# Fit and evaluate an ordinary least-squares model.
# `model` is rebound here to the DataFrame returned by buildModel
# (actual vs. predicted selling price); the plots below read from it.
linModel = LinearRegression(fit_intercept=True)
model = buildModel(linModel, X_train, y_train, X_test, y_test, 'LinearRegression')

In [None]:
# Line chart of actual and predicted selling price for the linear model,
# one line per column of `model`.
fig, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=model, ax=ax)
plt.show()

In [None]:
# Actual (x) against predicted (y) selling price for the linear model.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=model, x='Selling_Price', y='Selling_Price_Pred', ax=ax)
plt.show()

In [None]:
# Distribution of the residuals (actual minus predicted) for the linear model.
plt.figure(figsize=(15,10))
residuals = model['Selling_Price']-model['Selling_Price_Pred']
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay on a density scale is its documented replacement.
sns.histplot(residuals, kde=True, stat='density')
plt.show()

# RIDGE REGRESSION

In [None]:
#Tuning Alpha for RIDGE
# 20 log-spaced candidate alphas between 0.02 and 2.0.
paramGrid = {
    "alpha": np.logspace(np.log10(0.02) , np.log10(2.0) , num=20),
}
# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, where passing
# it makes the grid search fail with an invalid-parameter error. Only search over
# it when the installed Ridge still exposes the parameter.
if "normalize" in Ridge().get_params():
    paramGrid["normalize"] = [True, False]

ridgeGrid = GridSearchCV(estimator=Ridge(), param_grid=paramGrid, cv=5)
ridgeGrid.fit(X_train, y_train)

print(ridgeGrid.best_score_)
print(ridgeGrid.best_estimator_)


In [None]:
# Rebuild a Ridge model with the best parameters found by the grid search and
# evaluate it; `model` is rebound to the actual-vs-predicted frame buildModel returns.
ridge = Ridge(**ridgeGrid.best_params_)
model = buildModel(ridge, X_train, y_train, X_test, y_test,'RidgeRegression')

In [None]:
# Line chart of actual and predicted selling price for the ridge model,
# one line per column of `model`.
fig, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=model, ax=ax)
plt.show()

In [None]:
# Actual (x) against predicted (y) selling price for the ridge model.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=model, x='Selling_Price', y='Selling_Price_Pred', ax=ax)
plt.show()

In [None]:
# Distribution of the residuals (actual minus predicted) for the ridge model.
plt.figure(figsize=(15,10))
residuals = model['Selling_Price']-model['Selling_Price_Pred']
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay on a density scale is its documented replacement.
sns.histplot(residuals, kde=True, stat='density')
plt.show()

# LASSO REGRESSION

In [None]:
#Tuning Alpha for LASSO
# 20 log-spaced candidate alphas between 0.02 and 2.0.
paramGrid = {
    "alpha": np.logspace(np.log10(0.02) , np.log10(2.0) , num=20),
}
# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, where passing
# it makes the grid search fail with an invalid-parameter error. Only search over
# it when the installed Lasso still exposes the parameter.
if "normalize" in Lasso().get_params():
    paramGrid["normalize"] = [True, False]

lasGrid = GridSearchCV(estimator=Lasso(), param_grid=paramGrid, cv=5)
lasGrid.fit(X_train, y_train)

print(lasGrid.best_score_)
print(lasGrid.best_estimator_)

In [None]:
# Rebuild a Lasso model with the best parameters found by the grid search and
# evaluate it; `model` is rebound to the actual-vs-predicted frame buildModel returns.
lasso = Lasso(**lasGrid.best_params_)
model = buildModel(lasso, X_train, y_train, X_test, y_test,'LassoRegression')

In [None]:
# Line chart of actual and predicted selling price for the lasso model,
# one line per column of `model`.
fig, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=model, ax=ax)
plt.show()

In [None]:
# Actual (x) against predicted (y) selling price for the lasso model.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=model, x='Selling_Price', y='Selling_Price_Pred', ax=ax)
plt.show()

In [None]:
# Distribution of the residuals (actual minus predicted) for the lasso model.
plt.figure(figsize=(15,10))
residuals = model['Selling_Price']-model['Selling_Price_Pred']
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay on a density scale is its documented replacement.
sns.histplot(residuals, kde=True, stat='density')
plt.show()

# RANDOM FOREST REGRESSOR

In [None]:
###HYPERPARAMETERS FOR RANDOMFORESTREGRESSOR
# Number of trees: 100, 200, ..., 1200.
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
# Features considered at each split. 1.0 means "all features", which is what
# 'auto' meant for forest regressors; 'auto' itself was deprecated in
# scikit-learn 1.1 and removed in 1.3, so it is spelled out explicitly here.
max_features = [1.0,'sqrt']
# Maximum tree depth: 5, 10, ..., 30.
max_depth = [int(x) for x in np.linspace(5,30, num=6)]
# Minimum samples required to split an internal node.
min_samples_split = [2,5,10,15,100]
# Minimum samples required at a leaf node.
min_samples_leaf = [1,2,5,10]

In [None]:
# Collect the candidate lists into the search space passed to RandomizedSearchCV.
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
print(param_grid)

In [None]:
# Randomised search over the forest hyperparameters: 10 sampled candidates,
# each scored by 5-fold cross-validated R^2.
# The base estimator is seeded as well: the search's own random_state only fixes
# which candidates are sampled, not the randomness inside each forest, so without
# it the scores and the selected parameters could change between runs.
rf_random = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=42), 
                               param_distributions=param_grid, 
                               scoring = 'r2',
                               n_iter=10,
                               cv=5,
                               random_state=42,
                               n_jobs=1,
                               verbose=1,
                              )

In [None]:
# Run the randomised search on the training split.
rf_random.fit(X_train, y_train)

In [None]:
# Predict the test split with the fitted search object and display the result.
# NOTE(review): `predictions` is rebound to a summary string in the last cells
# of the notebook.
predictions = rf_random.predict(X_test)
predictions

In [None]:
# Rebuild a forest with the best parameters found by the search and evaluate it.
# Seeded so the reported scores and the pickled model are reproducible.
# `model` is rebound to the actual-vs-predicted frame buildModel returns.
rf = RandomForestRegressor(random_state=42, **rf_random.best_params_)
model = buildModel(rf, X_train, y_train, X_test, y_test, 'RandomForestRegression')

In [None]:
# Line chart of actual and predicted selling price for the random forest,
# one line per column of `model`.
fig, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=model, ax=ax)
plt.show()

In [None]:
# Actual (x) against predicted (y) selling price for the random forest.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=model, x='Selling_Price', y='Selling_Price_Pred', ax=ax)
plt.show()

##### The scatter plot above shows a roughly linear relationship between the actual values (y_test) and the predicted values, which suggests that the model's predictions are good.

In [None]:
# Distribution of the residuals (actual minus predicted) for the random forest.
plt.figure(figsize=(15,10))
residuals = model['Selling_Price']-model['Selling_Price_Pred']
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay on a density scale is its documented replacement.
sns.histplot(residuals, kde=True, stat='density')
plt.show()

#### The residual distribution above looks approximately like a normal curve, which suggests that the model we created fits the data well.


In [None]:
# One row per buildModel call: model name, test R^2 and mean cross-validation score.
score_columns = {
    'Model': MODELALGO,
    'R Squared': R2,
    'CV score mean': MODELCV,
}
results = pd.DataFrame(score_columns)
results

# Testing all the above built models 

In [None]:
def _load_pickled_model(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook wrote itself (buildModel writes the four files used below).
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


# Reload the four models saved by buildModel. The original opened each file
# inline and never closed the handle.
model_RF = _load_pickled_model('RandomForestRegression.pkl')
model_LIN = _load_pickled_model('LinearRegression.pkl')
model_RDG = _load_pickled_model('RidgeRegression.pkl')
model_LAS = _load_pickled_model('LassoRegression.pkl')

In [None]:
# Feature values for one hand-made example, used to try out the reloaded models.
# The names mirror the feature columns produced earlier in the notebook.
Present_Price = 9.54
Kms_Driven = 43000
Owner = 0
# Age is derived the same way as in training: current year minus purchase year.
age_of_car = datetime.now().year - 2014
# One-hot flags: 1 selects that category, 0 does not.
Fuel_Type_Diesel = 1
Fuel_Type_Petrol =0 
Seller_Type_Individual = 1
Transmission_Manual = 1

In [None]:
# Build the example once as a one-row DataFrame carrying the training column
# names. The models were fitted on a named DataFrame, so passing a bare nested
# list makes scikit-learn warn about missing feature names and leaves the column
# order unchecked. Constructing the frame also fails early if the number of
# values does not match the number of training features.
sample_features = pd.DataFrame(
    [[Present_Price, Kms_Driven, Owner, age_of_car, Fuel_Type_Diesel, Fuel_Type_Petrol,
      Seller_Type_Individual, Transmission_Manual]],
    columns=X_train.columns,
)

# Predict the selling price of the example with each reloaded model.
prediction_RF = model_RF.predict(sample_features)
prediction_LIN = model_LIN.predict(sample_features)
prediction_RDG = model_RDG.predict(sample_features)
prediction_LAS = model_LAS.predict(sample_features)

# Each predict call returns a one-element array; keep the value rounded to 2 decimals.
output_RF = round(prediction_RF[0], 2)
output_LIN = round(prediction_LIN[0], 2)
output_RDG = round(prediction_RDG[0], 2)
output_LAS = round(prediction_LAS[0], 2)

In [None]:
# Assemble a one-line summary of the four predictions and print it.
labelled_outputs = [
    ('PREDICTIONS RandomForest: ', output_RF),
    (' Linear Regression: ', output_LIN),
    (' Ridge Regression: ', output_RDG),
    (' Lasso Regression: ', output_LAS),
]
predictions = ''.join(label + str(value) for label, value in labelled_outputs)

print(predictions)