In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import warnings
# Silences every warning for the rest of the session, including deprecation
# warnings raised by the plotting and modelling libraries used below.
warnings.filterwarnings('ignore')
import seaborn as sns
# Use seaborn's "whitegrid" style for all subsequent figures.
sns.set_style("whitegrid")

In [None]:
# Load the car listings CSV from the Kaggle input directory and preview the first rows.
data = pd.read_csv('/kaggle/input/car-dekho-data/car data.csv')
data.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
data.info()

In [None]:
# Keep only the float64 / int64 columns and list their names, one per line.
numeric_dtypes = [np.float64, np.int64]
df_num = data.select_dtypes(include=numeric_dtypes)
print("Columns with numerical data:")
for numeric_name in df_num.columns:
    print(numeric_name)

In [None]:
# Keep only the object-dtype (string) columns and list their names, one per line.
categorical_dtypes = ['object']
df_cat = data.select_dtypes(include=categorical_dtypes)
print("Columns with categorical data:")
for categorical_name in df_cat.columns:
    print(categorical_name)

In [None]:
# Summary statistics of the loaded frame.
data.describe()

In [None]:
# Correlation between numerical columns.
# Select the numeric columns explicitly: since pandas 2.0, DataFrame.corr() no
# longer drops non-numeric columns silently and fails on string columns such
# as the categorical ones in this frame.
numeric_corr = data.select_dtypes(include = "number").corr()
plt.figure(figsize = (10,8))
sns.heatmap(numeric_corr, annot = True)
plt.title("Correlation between numerical columns")
plt.show()

There is a strong correlation between the present price and the selling price of a car, which is expected. There is also a good correlation between Year and Selling Price.

### Univariate Data Analysis

In [None]:
def plot_bar(column):
    """Draw a count plot for one column of the global ``data`` frame.

    Parameters
    ----------
    column : str
        Name of the column in ``data`` whose value frequencies are plotted.
    """
    plt.figure(figsize = (12,5))
    # Pass the series by keyword: seaborn 0.12+ treats the first positional
    # argument as ``data`` rather than ``x``, so a positional series is no
    # longer counted per category.
    sns.countplot(x = data[column])
    plt.title(column, fontsize = 20)
    plt.xticks(fontsize = 14)
    # The title already names the column, so the x-axis label is blanked.
    plt.xlabel('')
    plt.show()

In [None]:
# Count plot for each categorical column of interest.
# NOTE(review): no visible cell creates a 'company' column — confirm it is
# present in the CSV, otherwise this loop fails on its first iteration.
for col in ['company','Fuel_Type','Seller_Type','Transmission']:
    plot_bar(col)

In [None]:
def plot_hist(column):
    """Draw a density histogram with a KDE overlay for one column of ``data``.

    Parameters
    ----------
    column : str
        Name of the numeric column in the global ``data`` frame to plot.
    """
    plt.figure(figsize = (12,5))
    # sns.distplot is deprecated; histplot with kde=True and stat="density"
    # is the documented replacement and matches the histplot calls used
    # elsewhere in this notebook.
    sns.histplot(data[column], kde = True, stat = "density")
    plt.title(column, fontsize = 20)
    plt.show()

In [None]:
# Distribution plot for each numeric column of interest.
for col in ['Year','Selling_Price','Present_Price','Kms_Driven']:
    plot_hist(col)

### Basic Analysis

In [None]:
# Frequency of each value in the Owner column.
owner_counts = data["Owner"].value_counts()
print("Car sold by Owner number\n{}".format(owner_counts))

Most of the cars sold have only one previous owner.

In [None]:
# Count rows per seller type by summing a boolean mask.
dealer_sales = (data.Seller_Type == 'Dealer').sum()
individual_sales = (data.Seller_Type == 'Individual').sum()
print("Number of cars sold through dealers: {}".format(dealer_sales))
print("Number of cars sold without dealers: {}".format(individual_sales))

In [None]:
# Count the non-null car names within each transmission type.
manual_total = data.loc[data.Transmission == 'Manual', 'Car_Name'].count()
automatic_total = data.loc[data.Transmission == 'Automatic', 'Car_Name'].count()
print("Total number of cars with manual transmission: {}".format(manual_total))
print("Total number of cars with automatic transmission : {}".format(automatic_total))

In [None]:
# Number of listings per company, most frequent first.
company_counts = data["company"].value_counts()
print("Most Popular car Companys\n{}".format(company_counts))

In [None]:
# The 15 most frequent car names.
car_name_counts = data["Car_Name"].value_counts()
top_15_cars = car_name_counts.nlargest(15)
print("Most sold cars\n{}".format(top_15_cars))

### Bivariate Data Analysis

In [None]:
# Selling price distribution for each seller type.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=data, x="Selling_Price", y="Seller_Type", ax=ax)
ax.set_ylabel("Seller_Type", fontsize=15)
ax.set_xlabel("Selling_Price", fontsize=15)
plt.show()

Cars sold by dealers generally fetch a higher price than cars sold by individuals.

Cars are generally sold through a dealer.

In [None]:
# Selling price distribution for each fuel type.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data, x="Selling_Price", y="Fuel_Type", ax=ax)
ax.set_ylabel("Fuel Type", fontsize=15)
ax.tick_params(axis="y", labelsize=14)
ax.set_xlabel("Selling_Price", fontsize=15)
plt.show()

It is clear from the boxplot that diesel cars are generally sold at a higher price. The mean selling price of diesel cars is higher than that of petrol cars. The boxplot for CNG cars is almost a line, which shows that only a few CNG cars were sold.

Petrol cars are sold the most, but diesel cars sell at a higher price than the others.

In [None]:
# Selling price distribution for each transmission type.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data, x="Selling_Price", y="Transmission", ax=ax)
ax.set_ylabel("Transmission", fontsize=15)
ax.tick_params(axis="y", labelsize=14)
ax.set_xlabel("Selling_Price", fontsize=15)
plt.show()

Cars with automatic transmission have a wide range of selling prices, and their third quartile is quite large, which shows that many of the cars sold fall in this range. In comparison, there is only a small difference between the medians of manual and automatic cars, because some manual cars are sold at a very high price.

More than 250 of the cars sold are manual, while fewer than 50 are automatic.

In [None]:
# Number of listings per manufacturing year.
# Group by the column name (not a one-element list): with a list key, pandas 2.x
# yields 1-tuples such as (2015,) when iterating the groups, which would end up
# as the bar labels.
cars_per_year = data.groupby("Year")["Car_Name"].count()
keys = cars_per_year.index.tolist()
plt.figure(figsize = (14,8))
# x and y are passed by keyword; seaborn 0.12+ accepts only ``data`` positionally.
sns.barplot(x = keys, y = cars_per_year.values)
plt.xlabel("Year",fontsize = 15)
plt.ylabel("Number of car sold",fontsize = 15)
plt.show()

In [None]:
# Bar chart of the ten most frequent car names.
top_10_cars = data['Car_Name'].value_counts().nlargest(10)
top_10_cars.plot.bar(figsize = (14,8))

In [None]:
# Split the listings on the four brands below and compare selling-price
# histograms: all rows, rows without those brands, and rows with only them.
split_brands = ['bajaj','hero','yamaha','tvs']
# Build the brand mask once so both subsets are guaranteed to use the same list.
in_split_brands = data.company.isin(split_brands)
df = data.loc[~in_split_brands]
df1 = data.loc[in_split_brands]

fig, axes = plt.subplots(1,3, figsize = (18,7))
panels = [
    (data, "All listings"),
    (df, "Excluding bajaj / hero / yamaha / tvs"),
    (df1, "Only bajaj / hero / yamaha / tvs"),
]
for ax, (frame, title) in zip(axes, panels):
    sns.histplot(frame["Selling_Price"], ax = ax)
    # Title each panel so the three histograms can be told apart.
    ax.set_title(title)
plt.show()

In [None]:
# Overlay the selling-price histograms of the two subsets built above.
plt.figure(figsize = (10,7))
sns.histplot(df["Selling_Price"], color = 'blue',label = 'Cars')
sns.histplot(df1["Selling_Price"], color = 'red', label = 'Two wheelers')
# Without a legend call the label= values above are never drawn.
plt.legend()
plt.show()

### Preparing Data

In [None]:
# Choosing the features to train the models on.
feature_cols = ["Year","Kms_Driven","Fuel_Type","Seller_Type","Transmission","Owner"]
X = data.loc[:, feature_cols]
# Keep the target as a 1-D Series: a one-column DataFrame makes several
# scikit-learn estimators emit a DataConversionWarning ("a column-vector y was
# passed when a 1d array was expected").
y = data["Selling_Price"]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Encode the categorical feature columns as integer codes.
# OrdinalEncoder is scikit-learn's encoder for feature columns (LabelEncoder is
# documented as being for target values). It assigns the same sorted-category
# codes, and with handle_unknown='use_encoded_value' a category that appears
# only in the test split is mapped to -1 instead of raising a ValueError.
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
categorical_cols = ['Fuel_Type','Seller_Type','Transmission']
encoder = OrdinalEncoder(dtype = np.int64, handle_unknown = 'use_encoded_value', unknown_value = -1)
# Work on explicit copies so the column assignments below never write into a
# view of the original frame.
X_train = X_train.copy()
X_test = X_test.copy()
# Fit on the training split only, then reuse the learned categories for the test split.
X_train[categorical_cols] = encoder.fit_transform(X_train[categorical_cols])
X_test[categorical_cols] = encoder.transform(X_test[categorical_cols])

#### After encoding the categorical columns as integer labels, the data is fully numerical. This is required because Linear Regression and SVM models don't work on categorical data.

In [None]:
# Preview the first ten training rows after the encoding step.
X_train.head(10)

In [None]:
# Applying Standard Scaler to the continuous columns.
from sklearn.preprocessing import StandardScaler
numeric_cols = ['Year','Kms_Driven']
scaler = StandardScaler()
# Assign with df[cols] = ... so the integer columns are replaced by the float
# results. The previous .loc[:, cols] = ... form writes into the existing int64
# columns in place, which pandas 2.1+ deprecates as an incompatible-dtype set.
# The scaler is fitted on the training split only and reused for the test split.
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [None]:
# Cross Validation
from sklearn.model_selection import cross_val_score
def val_score(model, X, y):
    """Print 5-fold cross-validated RMSE figures for ``model``.

    Parameters
    ----------
    model : estimator
        Estimator passed unchanged to ``cross_val_score``.
    X, y
        Features and target passed unchanged to ``cross_val_score``.

    Prints the per-fold RMSE values, then the square root of the mean
    per-fold MSE. Returns nothing.
    """
    # The scorer returns negated MSE values, so flip the sign back.
    fold_mse = -1 * cross_val_score(model, X, y, cv = 5, scoring = 'neg_mean_squared_error')
    fold_rmse = np.sqrt(fold_mse)
    overall_rmse = np.sqrt(fold_mse.mean())
    print("RMSE  : {}".format(fold_rmse))
    print("Average error : {}".format(overall_rmse))

In [None]:
from sklearn.metrics import mean_squared_error
def plot_learning_curves(model, X, y, ylim = None, random_state = 42):
    """Plot train and validation RMSE as the training set grows one row at a time.

    Parameters
    ----------
    model : estimator
        Object with ``fit`` and ``predict``; it is refitted for every size.
    X, y
        Features and target; 20% of the rows are held out for validation.
    ylim : optional
        Passed to ``plt.ylim`` to fix the y-axis range.
    random_state : int, default 42
        Seed for the internal train/validation split, so repeated calls (and
        calls for different models) are evaluated on the same split.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    # Include len(X_train) itself so the last point uses the full training set.
    train_sizes = list(range(1, len(X_train) + 1))
    train_errors, val_errors = [], []
    for m in train_sizes:
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    plt.figure(figsize = (10,7))
    # Plot against the actual training sizes so the x-axis matches its label.
    plt.plot(train_sizes, np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(train_sizes, np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.xlabel("Training set size")
    plt.ylabel("RMSE")
    plt.ylim(ylim)
    plt.legend()

## Testing model performance with default hyperparameters

## 1.Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline: linear regression with default settings.
linear_reg = LinearRegression()
# Print cross-validated RMSE, then draw the learning curves on the training split.
val_score(linear_reg, X_train, y_train)
plot_learning_curves(linear_reg, X_train, y_train)

In [None]:
# Error of Linear Regression on train set and test set
linear_reg.fit(X_train, y_train)

# Predict on both splits, then score each prediction against its target (MSE).
prediction_train, prediction_test = (
    linear_reg.predict(features) for features in (X_train, X_test)
)
train_error, test_error = (
    mean_squared_error(predicted, actual)
    for predicted, actual in ((prediction_train, y_train), (prediction_test, y_test))
)

print("Error on training set", train_error)
print("Error on test set",test_error )

### Clearly, the Linear Regression model is underfitting, as the train error is greater than the validation error

## 2.SVM Linear

In [None]:
from sklearn.svm import LinearSVR
# Fix the seed so the cross-validation scores and learning curves are
# repeatable across runs (same seed as the train/test split).
linear_svr = LinearSVR(random_state = 42)
val_score(linear_svr, X_train, y_train)
plot_learning_curves(linear_svr, X_train, y_train)

## 3.SVM Kernel

In [None]:
from sklearn.svm import SVR
# Support vector regression with an RBF kernel, other settings left at defaults.
svr_reg = SVR(kernel = 'rbf')
# Print cross-validated RMSE, then draw the learning curves on the training split.
val_score(svr_reg, X_train, y_train)
plot_learning_curves(svr_reg, X_train, y_train)

### This model is also underfitting, as both the train error and the test error are high. Kernel SVMs work better when the training data has more features

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fix the seed so the fitted tree, and therefore the scores and curves below,
# are repeatable across runs (same seed as the train/test split).
tree_reg = DecisionTreeRegressor(random_state = 42)
val_score(tree_reg, X_train, y_train)
plot_learning_curves(tree_reg, X_train, y_train)

In [None]:
# Error of Decision Tree on train set and test set
tree_reg.fit(X_train, y_train)

# Predict on both splits, then score each prediction against its target (MSE).
prediction_train, prediction_test = (
    tree_reg.predict(features) for features in (X_train, X_test)
)
train_error, test_error = (
    mean_squared_error(predicted, actual)
    for predicted, actual in ((prediction_train, y_train), (prediction_test, y_test))
)

print("Error on training set", train_error)
print("Error on test set",test_error )

### The Decision Tree is too complex and is overfitting. It predicts nearly perfect values for the training data but performs very poorly on the test data. Using more data and hyperparameter tuning can improve it.

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fix the seed so the forest, and therefore the scores and curves below, are
# repeatable across runs (same seed as the train/test split).
rf_reg = RandomForestRegressor(random_state = 42)
val_score(rf_reg, X_train, y_train)
plot_learning_curves(rf_reg, X_train, y_train)

### The learning curve of the Random Forest model looks better. Adding more data can improve it, as its curves are still going down

In [None]:
# Error of Random Forest on train set and test set
rf_reg.fit(X_train, y_train)

# Predict on both splits, then score each prediction against its target (MSE).
prediction_train, prediction_test = (
    rf_reg.predict(features) for features in (X_train, X_test)
)
train_error, test_error = (
    mean_squared_error(predicted, actual)
    for predicted, actual in ((prediction_train, y_train), (prediction_test, y_test))
)

print("Error on training set", train_error)
print("Error on test set",test_error )

### Random Forest seems to be the best model according to the learning curves.
### So let's apply grid search to choose the best hyperparameters for it.

In [None]:
from sklearn.model_selection import GridSearchCV
# max_features cannot meaningfully exceed the number of feature columns, so
# drop candidates larger than that. Depending on the scikit-learn version,
# such values either fail to fit or are redundant.
n_features = X_train.shape[1]
max_features_grid = [k for k in [2, 4, 6, 8, 10] if k <= n_features]
param_grid = [
    # Default (bootstrapped) forests over a range of sizes.
    {'n_estimators': [3, 10, 30, 100, 300], 'max_features': max_features_grid},
    # A smaller sweep without bootstrapping.
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
# Seed the forest so the selected hyperparameters are repeatable across runs.
forest_reg = RandomForestRegressor(random_state = 42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=3,
                           scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [None]:
# Show the winning hyperparameter combination and the estimator built from it.
print("Best parameters",grid_search.best_params_)
print("Best Estimators",grid_search.best_estimator_)

In [None]:
# Error of the tuned Random Forest on train set and test set.
# Use the estimator selected by the grid search instead of hard-coding its
# hyperparameters, so this cell always matches the search result. GridSearchCV
# is created with its default refit behaviour above, so best_estimator_ has
# already been refitted on the whole of X_train.
forest_reg = grid_search.best_estimator_

prediction_train = forest_reg.predict(X_train)
train_error = mean_squared_error(y_train, prediction_train)

prediction_test = forest_reg.predict(X_test)
test_error = mean_squared_error(y_test, prediction_test)

print("Error on training set", train_error)
print("Error on test set",test_error )