In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
import matplotlib
np.__version__, pd.__version__, sns.__version__, matplotlib.__version__

**1. Load Data**

In [None]:
df = pd.read_csv('dataset/cars.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.columns

**2. Exploratory Data Analysis**

2.1 Sanitizing the data

First, unnecessary parts of the feature data, such as units, are removed.

In [None]:
# Here we define a function to return float values for features with pattern "floatvalue + unit"
# Example for feature km_driven, if data is "12345 km", then 12345.00 is returned

def getFloatValues(featureValues):
    """Extract the leading number from values shaped like "<number> <unit>".

    Example: for km_driven, "12345 km" becomes 12345.0.

    Parameters
    ----------
    featureValues : pandas.Series
        Raw feature values; entries may be strings, numbers or missing (NaN).

    Returns
    -------
    pandas.Series
        A new float Series with the same index as the input. Entries whose
        first space-separated token cannot be parsed as a float become 0.0.
        NaN entries stay NaN, because their string form 'nan' parses as a
        float NaN. The input Series is not modified.
    """
    def _leading_float(raw):
        # str() lets numeric and missing entries share the same parsing path
        try:
            return float(str(raw).split(' ')[0])
        except ValueError:
            return 0.0

    # Series.map keeps the original index. Writing results back by position
    # (featureValues[i] = ...) would add new labels whenever the index is not
    # the default 0..n-1 range, e.g. after rows have been filtered out.
    return featureValues.map(_leading_float).astype(float)

In [None]:
# For feature name, only the brand of the car is kept. The first word in the name is assumed to be the brand
df['name'] = df['name'].map(lambda x : x.split(' ')[0])

# For features mileage (kmpl), engine (CC) and max_power (bhp), the unit is removed
# and the values are converted into float values
for unit_column in ['mileage', 'engine', 'max_power']:
    df[unit_column] = getFloatValues(df[unit_column]).astype('float')

# Feature torque is dropped due to its insignificance to the car company
df = df.drop('torque', axis = 1)

# For feature fuel, all the rows with values LPG and CNG are removed.
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells are not chained assignments on a slice.
df = df[~df['fuel'].isin(['CNG', 'LPG'])].copy()

df.head()

In [None]:
df.shape

In [None]:
df.info()

2.2 Univariate analysis

Countplot

In [None]:
# Let's see how many individual and dealer sellers are there
sns.countplot(data = df, x = 'seller_type')

Distribution plot

In [None]:
# Distribution plot for selling prices
sns.displot(data = df, x = 'selling_price')

In [None]:
# Highest selling price among cars priced above 1,000,000.
# The earlier "replace everything <= 500000 by 0" step was redundant: every
# value it touched is removed by the > 1000000 filter anyway.
bigSellingPrices = df.loc[df['selling_price'] > 1000000, 'selling_price']
bigSellingPrices.max()

2.3 Multivariate Analysis

Multiple variable exploratory analysis

Boxplot

In [None]:
# Box plot for 'owner' and 'selling_price'
sns.boxplot(x = df["owner"], y = df["selling_price"]);
plt.ylabel("Selling Price")
plt.xlabel("Owner")

Scatterplot

In [None]:
# Scatter plot for mileage and selling price with respect to fuel type

sns.scatterplot(x = df['mileage'], y = df['selling_price'], hue =df['fuel'])

Correlation Matrix

In [None]:
# Correlation heatmap of the numeric features only.
# The categorical columns (name, fuel, seller_type, transmission, owner) still
# hold strings at this point, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, so the numeric columns are selected explicitly.

plt.figure(figsize = (15, 8))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap="coolwarm")

Currently, feature max_power and engine have shown strong correlation to selling price. However, the above graph does not include categorical features.

Label Encoding

Let's encode the labels of the categorical features.

In [None]:
# Importing the LabelEncoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

Label Encoding for seller type

In [None]:
df["seller_type"] = le.fit_transform(df["seller_type"])
df["seller_type"].unique()

In [None]:
le.classes_

In [None]:
le.transform(["Dealer", "Individual", "Trustmark Dealer"])

Label Encoding for fuel type

In [None]:
df['fuel'] = le.fit_transform(df['fuel'])
df['fuel'].unique()

In [None]:
le.classes_

In [None]:
le.transform(['Diesel', 'Petrol'])

Label Encoding for feature transmission

In [None]:
df['transmission'] = le.fit_transform(df['transmission'])
df['transmission'].unique()

In [None]:
le.classes_

In [None]:
le.transform(['Automatic', 'Manual'])

Label Encoding for feature owner

In [None]:
df['owner'].unique()

In [None]:
# Ordinal encoding for owner: a larger number means more previous owners;
# 5 marks test-drive cars, which are removed below.
mapping = {
    'First Owner': 1,
    'Second Owner': 2,
    'Third Owner': 3,
    'Fourth & Above Owner': 4,
    'Test Drive Car': 5
}

# An owner label missing from the mapping raises KeyError instead of silently becoming NaN
df['owner'] = df['owner'].map(lambda x : mapping[x])

# Removing the rows with Test Drive Car value.
# .copy() keeps the later column assignments from operating on a slice of the old frame.
df = df[~df['owner'].isin([5])].copy()

Label Encoding for feature name

In [None]:
# Lets look into how many unqiue brand names we have
df['name'].unique()

In [None]:
df['name'] = le.fit_transform(df['name'])
df['name'].unique()

In [None]:
le.classes_

In [None]:
# Show the integer code of every brand the encoder was fitted on.
# Using le.classes_ instead of a hand-typed brand list keeps this cell working
# when the set of brands in the data changes (a label the encoder has not
# seen makes transform() raise ValueError).
le.transform(le.classes_)

Now that all the categorical labels have been encoded into integer values, let's look at our current data.

In [None]:
df.head()

Now the correlation matrix will display the values for these converted features as well

In [None]:
plt.figure(figsize = (15, 8))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm")

The most correlated features are still found to be engine and max_power

**Predictive Power Score**

Let's check the predictive power scores of features. This graph plots the direct predictive power of a feature against another feature.

In [None]:
import ppscore as pps

dfcopy = df.copy()

matrix_df = pps.matrix(dfcopy)[['x', 'y', 'ppscore']].pivot(columns='x', index='y', values='ppscore')

#plot
plt.figure(figsize = (15,8))
sns.heatmap(matrix_df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)

**Feature Selection**

In [None]:
# Feature matrix: the two columns max_power and mileage.
# NOTE(review): the PPS discussion cited engine (0.45) and max_power (0.62) as the
# strongest features, yet the code below selects max_power and mileage —
# confirm which pair is intended.

X = df[['max_power', 'mileage']]

# y is the natural log of the selling price (the raw values are very large);
# predictions therefore have to be passed through np.exp to get prices back
y = np.log(df['selling_price'])

**Test Train Split**

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

**Preprocessing**

Null values

In [None]:
# Checking for null values

X_train[['max_power', 'mileage']].isna().sum()

In [None]:
X_test[['max_power', 'mileage']].isna().sum()

In [None]:
# Null counts of both targets. A cell only displays its last expression, so
# the two counts are returned together as a (train, test) tuple.
y_train.isna().sum(), y_test.isna().sum()

In [None]:
sns.displot(data=df, x='engine')

In [None]:
sns.displot(data=df, x = 'max_power')

In [None]:
sns.displot(data=df, x = 'mileage')

In [None]:
sns.displot(y_train)

In [None]:
# Fill missing values in the training set with the training medians.
# The frame is reassigned instead of calling fillna(inplace=True) on a selected
# column: that form is a chained assignment, which pandas warns about and which
# leaves the frame unchanged when Copy-on-Write is active.
X_train = X_train.fillna({
    'max_power': X_train['max_power'].median(),
    'mileage': X_train['mileage'].median(),
})

In [None]:
# Fill missing values in the test set with the medians of the TRAINING set:
# statistics used for preprocessing must come from the training data only, so
# the test set stays unseen. The frame is reassigned because fillna(inplace=True)
# on a selected column is a chained assignment that may not update the frame.
X_test = X_test.fillna({
    'max_power': X_train['max_power'].median(),
    'mileage': X_train['mileage'].median(),
})

In [None]:
# Checking for null values

X_train[['max_power', 'mileage']].isna().sum()

In [None]:
X_test[['max_power', 'mileage']].isna().sum()

**Checking Outliers**

In [None]:
# Dictionary of the columns to plot, mapped to their 1-based panel position
col_dict = {'max_power': 1, 'mileage': 2}

# Box plots to detect outliers in each variable.
# One row with one panel per column; the previous 5x4 grid reserved twenty
# slots for two plots and rendered them very small.
fig, axes = plt.subplots(1, len(col_dict), figsize=(5 * len(col_dict), 4), squeeze=False)

for variable, i in col_dict.items():
    ax = axes[0, i - 1]
    ax.boxplot(X_train[variable])
    ax.set_title(variable)

fig.tight_layout()
plt.show()

In [None]:
def outlier_count(col, data = None):
    """Count the IQR outliers of one column and print a short report.

    A value is an outlier when it lies below q25 - 1.5*IQR or above
    q75 + 1.5*IQR, where q25/q75 are the 25th/75th percentiles of the column.

    Parameters
    ----------
    col : str
        Name of the column to inspect.
    data : pandas.DataFrame, optional
        Frame holding the column. When omitted, the global X_train is looked
        up at call time, so the function never works on a stale frame that
        was captured when the function was defined.

    Returns
    -------
    tuple
        (number of outliers, percentage of outliers rounded to 2 decimals).
        The report is printed only when at least one outlier exists.
    """
    if data is None:
        data = X_train

    values = data[col]
    total = len(values)

    # nothing to measure; also avoids dividing by zero below
    if total == 0:
        return 0, 0.0

    # 25% and 75% quantiles
    q75, q25 = np.percentile(values, [75, 25])

    # interquartile range
    iqr = q75 - q25

    # limits outside of which a value counts as an outlier
    min_val = q25 - (iqr*1.5)
    max_val = q75 + (iqr*1.5)

    n_outliers = int(((values > max_val) | (values < min_val)).sum())
    outlier_percent = round(n_outliers/total*100, 2)

    if n_outliers > 0:
        print("\n"+15*'-' + col + 15*'-'+"\n")
        print('Number of outliers: {}'.format(n_outliers))
        print('Percent of data that is outlier: {}%'.format(outlier_percent))

    return n_outliers, outlier_percent

In [None]:
for col in X_train.columns:
    outlier_count(col)

**Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

# feature scaling helps improve reach convergence faster
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

In [None]:
# Let's check shapes of all X_train, X_test, y_train, y_test
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of y_test: ", y_test.shape)

**Modeling**

In [None]:
from sklearn.linear_model import LinearRegression  #we are using regression models
from sklearn.metrics import mean_squared_error, r2_score

lr = LinearRegression()
lr.fit(X_train, y_train)
yhat = lr.predict(X_test)

print("MSE: ", mean_squared_error(y_test, yhat))
print("r2: ", r2_score(y_test, yhat))

Cross validation + Grid Search

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Libraries for model evaluation

# models that we will be using, put them in a list
algorithms = [
    LinearRegression(),
    SVR(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state = 0),
    RandomForestRegressor(n_estimators = 200, random_state = 42),
    XGBRegressor(n_estimators = 200, random_state = 42)
]

# The names of the models
algorithm_names = [
    "Linear Regression",
    "SVR",
    "KNeighbors Regressor",
    "Decision-Tree Regressor",
    "Random-Forest Regressor",
    "XGBregressor"
]

In [None]:
y_train.isna().sum()

In [None]:
from sklearn.model_selection import KFold, cross_val_score

#lists for keeping mse
train_mse = []
test_mse = []

# 10 shuffled folds. random_state pins the shuffle so that every model is
# scored on the same folds and the numbers are reproducible across runs
# (the grid search further down reuses this splitter).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Scores are NEGATIVE mean squared errors: values closer to 0 are better
for name, model in zip(algorithm_names, algorithms):
    scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
    print(f"{name} - Score: {scores}; Mean: {scores.mean()}")

In [None]:
# Tune an XGBRegressor with a grid search over the parameter grid below.
# NOTE(review): this comment used to say the Random Forest Regressor had the best
# cross-validation score, but the estimator tuned here is XGBRegressor —
# confirm which model the cross-validation results actually favour.

from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [5, 10, None],
              'n_estimators': [100, 200, 300, 400, 500],
              'learning_rate': [0.1, 0.2]
}

xgb = XGBRegressor(random_state = 42)

# Candidates are compared by negative MSE on the kfold splitter defined earlier
grid = GridSearchCV(estimator = xgb,
                    param_grid = param_grid,
                    cv = kfold,
                    n_jobs = -1,
                    return_train_score=True,
                    refit=True,
                    scoring='neg_mean_squared_error')

# Fit your grid_search
grid.fit(X_train, y_train);  #fit means start looping all the possible parameters

In [None]:
# # Here we find the Random Foest Regressor has lowest score so we find the best version of the model

# from sklearn.model_selection import GridSearchCV

# param_grid = {'bootstrap': [True], 'max_depth': [5, 10, None],
#               'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15]}

# rf = RandomForestRegressor(random_state = 1)

# grid = GridSearchCV(estimator = rf,
#                     param_grid = param_grid,
#                     cv = kfold,
#                     n_jobs = -1,
#                     return_train_score=True,
#                     refit=True,
#                     scoring='neg_mean_squared_error')

# # Fit your grid_search
# grid.fit(X_train, y_train);  #fit means start looping all the possible parameters

In [None]:
grid.best_params_

In [None]:
# The grid was scored with 'neg_mean_squared_error', so best_score_ is the
# NEGATED mean squared error; flip the sign to get the actual (positive) MSE.
best_mse = -grid.best_score_
best_mse

**Testing**

In [None]:
yhat = grid.predict(X_test)

mean_squared_error(y_test, yhat)

In [None]:
yhat_exp = np.exp(yhat)
y_test_exp = np.exp(y_test)

mean_squared_error(y_test_exp, yhat_exp)

**Inference**

In [None]:
import pickle

filename = 'app/model/selling-price.model'

# NOTE: only the fitted grid search is stored. The StandardScaler that was
# applied to the training features is not part of this file, so whoever loads
# the model has to scale the inputs the same way before predicting.
# The with-block closes the file handle even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(grid, model_file)

In [None]:
# pickle can execute arbitrary code while loading: only load files you wrote yourself.
# The with-block makes sure the file handle is closed after reading.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Show one sample row (the second row by position).
# .loc(1) *calls* the indexer and only returns an indexer object; square
# brackets are needed to select data. .iloc is positional, so it does not
# depend on which index labels survived the earlier row filtering.
df[['max_power', 'mileage', 'selling_price']].iloc[1]

In [None]:
# The model was trained on standardized features, so raw values must go through
# the same fitted scaler before prediction. The column names match the frame
# the scaler was fitted on. The result is on the log scale (see np.exp below).
sample = pd.DataFrame([[74, 23]], columns=['max_power', 'mileage'])
predicted_selling_price = loaded_model.predict(scaler.transform(sample))

In [None]:
predicted_selling_price

In [None]:
np.exp(predicted_selling_price)