In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file found beneath the Kaggle input directory,
# then print them one per line in the order os.walk yields them.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the car listings CSV inside the Kaggle input directory.
CAR_DATA_CSV = '/kaggle/input/vehicle-dataset-from-cardekho/car data.csv'

# Load the listings into a DataFrame; every later cell derives from this frame.
df = pd.read_csv(CAR_DATA_CSV)

In [None]:
# Preview the first rows of the loaded frame to see the column layout.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
df.describe()

In [None]:
# Column dtypes and non-null counts -- a quick check for missing values.
df.info()

In [None]:
# Column labels, used to pick the modelling columns in the next cell.
df.columns

In [None]:
# Keep only the columns used for modelling; any other column of df is left out.
# .copy() makes final_dataset an independent frame, so the column additions and drops
# performed on it later modify final_dataset itself instead of a slice of df
# (which would raise pandas' SettingWithCopyWarning).
final_dataset = df[['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
                    'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']].copy()

In [None]:
# Check the selected modelling columns.
final_dataset.head()

In [None]:
# Reference year against which each car's age is measured (fixed at 2020 in this notebook).
REFERENCE_YEAR = 2020

# Replace the model year with the car's age in years ('no_year').
# assign/drop return a new frame, so nothing is mutated in place (no SettingWithCopyWarning)
# and no temporary 'Current_Year' column has to be created and removed again.
# The new column is appended last, so the column order matches the previous behaviour.
final_dataset = (
    final_dataset
    .assign(no_year=REFERENCE_YEAR - final_dataset['Year'])
    .drop(columns=['Year'])
)

In [None]:
# Confirm that 'Year' has been replaced by the derived 'no_year' column.
final_dataset.head()

## Converting categorical features to one-hot encoding

In [None]:
# One-hot encode the categorical (object/category dtype) columns.
# drop_first=True drops one level per feature so the indicator columns are not
# perfectly collinear.
# dtype=int keeps the indicators as 0/1 integers on every pandas version: pandas >= 2.0
# would otherwise emit bool columns, which the numeric plotting/statistics steps
# further down do not all handle.
final_dataset = pd.get_dummies(final_dataset, drop_first=True, dtype=int)

In [None]:
# Inspect the encoded frame (indicator columns now replace the categorical ones).
final_dataset.head()

## Correlation Matrix

In [None]:
# Pairwise Pearson correlation between the columns of the encoded frame.
corr_matrix = final_dataset.corr(method='pearson')
corr_matrix

## Plot Visualizations

In [None]:
import seaborn as sns

# Scatter-plot matrix: one panel per pair of columns, distributions on the diagonal.
sns.pairplot(data=final_dataset)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Annotated heat map of the correlation matrix.
# The matrix is computed once and plotted directly: re-selecting final_dataset by
# corrmat.index and calling .corr() a second time would only reproduce corrmat.
corrmat = final_dataset.corr()
top_corr_features = corrmat.index  # labels of the correlated columns (kept for later use)
plt.figure(figsize=(20, 20))
# plot heat map
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

## Train-Test Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Select the target by name instead of by position: iloc[:, 0] only works while
# 'Selling_Price' happens to be the first column, and would silently pick the wrong
# target if the column order ever changed.
TARGET_COLUMN = 'Selling_Price'
X = final_dataset.drop(columns=[TARGET_COLUMN])  # every remaining column is a feature
y = final_dataset[TARGET_COLUMN]

# Hold out 20% of the rows for testing; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Feature matrix preview.
X.head()

In [None]:
# Target vector preview.
y.head()

## Feature Importance

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Fit an extra-trees ensemble solely to rank the features by importance.
# random_state pins the randomised tree construction so the ranking is identical on
# every run (the estimator is otherwise seeded differently each time).
# NOTE(review): this fits on all rows (X, y), including the rows held out as the test
# split -- acceptable for exploration, but confirm it is not used for feature selection.
model = ExtraTreesRegressor(random_state=42)
model.fit(X, y)

In [None]:
# Raw importance scores, one per feature, in the same order as X.columns.
print(model.feature_importances_)

In [None]:
# Horizontal bar chart of the five features with the highest importance scores.

feat_importances = pd.Series(data=model.feature_importances_, index=X.columns)
top_five_importances = feat_importances.nlargest(n=5)
top_five_importances.plot.barh()
plt.show()

## Model

In [None]:
# Base model whose hyperparameters are tuned by the randomized search.

from sklearn.ensemble import RandomForestRegressor

# random_state fixes the forest's own randomness (bootstrap samples, feature subsets).
# Seeding only the search object would fix which candidates are tried, but not the
# scores they obtain, so the selected model could differ from run to run.
rf = RandomForestRegressor(random_state=42)

In [None]:
## Hyperparameters
import numpy as np

# Preview of a tree-count grid: 12 evenly spaced values from 100 to 1200 inclusive,
# truncated to plain Python ints.
n_estimators = list(map(int, np.linspace(start=100, stop=1200, num=12)))
print(n_estimators)

In [None]:
# Randomized Search CV: candidate values for each random-forest hyperparameter

# Number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=10)]
# Number of features to consider at every split.
# 1.0 (all features) replaces the legacy 'auto' option: for regressors 'auto' meant
# exactly that, but it was deprecated in scikit-learn 1.1 and is rejected from 1.3 on.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
# (max_depth.append(None) was left disabled, so unlimited depth is not searched.)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]
                

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space for the randomized search: estimator parameter name -> candidate values.

random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
# Randomized hyperparameter search over random_grid: 10 sampled settings, each scored
# with 5-fold cross-validated negative MSE; random_state seeds the sampling of settings.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
    verbose=2,
    n_jobs=1,
)

In [None]:
# Run the randomized search on the training split only; the test split stays unseen.
rf_random.fit(X_train, y_train)

In [None]:
# Predictions of the fitted search object for the held-out test features.
y_pred=rf_random.predict(X_test)
y_pred

## Evaluation

In [None]:
from sklearn.metrics import mean_squared_error
import math

# Test-set error: mean squared error first, then its square root (RMSE), which is in
# the same units as the target.
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)
print(math.sqrt(test_mse))

In [None]:
# Distribution of the test residuals (actual - predicted); a model without systematic
# bias gives a roughly symmetric shape centred on zero.
# sns.distplot is deprecated since seaborn 0.11; histplot with a KDE overlay on a
# density scale draws the equivalent figure.
sns.histplot(y_test - y_pred, kde=True, stat="density")

In [None]:
# Predicted vs. actual target on the test split; points close to the diagonal are
# well-predicted. Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set(
    xlabel="Actual Selling_Price",
    ylabel="Predicted Selling_Price",
    title="Predicted vs. actual (test split)",
);

## R-Squared and Adjusted R-Squared

In [None]:
from sklearn.metrics import r2_score

# R^2 of the tuned model on the held-out test split, measured against the true targets.
# (Fitting an OLS regression of y_pred on X_test, as was done before, reports how
# linear the model's predictions are in the features -- not how well they match y_test.)
r_squared = r2_score(y_test, y_pred)

# Adjusted R^2 penalises the number of predictors p relative to the sample size n:
#   1 - (1 - R^2) * (n - 1) / (n - p - 1)
n_samples, n_features = X_test.shape
adj_r_squared = 1 - (1 - r_squared) * (n_samples - 1) / (n_samples - n_features - 1)
print(r_squared, adj_r_squared)