# Begin copying

This first segment belongs to [Aakrit Singhal](https://www.kaggle.com/aakritsinghal). Thank you for the walk-through!

In [None]:
# Packages
import pandas as pd
import numpy as np
import os

# Read the car listing CSV (relative path under ../input) into a DataFrame
# that every later cell refers to as `data`.
data_path = '../input/vehicle-dataset-from-cardekho/car data.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Preview the first five rows of the raw table.
data.head(n=5)

In [None]:
# Peek at the Fuel_Type column on its own (kept as a one-column DataFrame).
data.loc[:, ['Fuel_Type']].head()

In [None]:
# Peek at the Car_Name column on its own (kept as a one-column DataFrame).
data.loc[:, ['Car_Name']].head()

In [None]:
# Number of rows (listings) in the dataset.
print(data.shape[0])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Selling price against model year, one point per listing.
scatplot = sns.scatterplot(data=data, x='Year', y='Selling_Price')

In [None]:
# Count the non-null entries of every column within each fuel type.
data.groupby(by='Fuel_Type').count()

In [None]:
# Distribution of selling price within each fuel type.
catplot = sns.swarmplot(data=data, x='Fuel_Type', y='Selling_Price')

In [None]:
# Selling price against kilometres driven; assigning to `_` hides the Axes repr.
_ = sns.scatterplot(data=data, x='Kms_Driven', y='Selling_Price')

In [None]:
# Selling price split by seller type ...
_ = sns.swarmplot(data=data, x='Seller_Type', y='Selling_Price')

# ... and, on a fresh figure so the two plots do not overlap, by transmission.
plt.figure()
_ = sns.swarmplot(data=data, x='Transmission', y='Selling_Price')

In [None]:
from sklearn import linear_model

# Single-feature design matrix: Year as a column vector of shape (n_rows, 1),
# which is the 2-D layout scikit-learn estimators expect for X.
x = data.Year.values.reshape(-1, 1)

# Regression target.
y = data.Selling_Price.values

# Ordinary least squares with an intercept term: Selling_Price ~ m * Year + b.
lm = linear_model.LinearRegression(fit_intercept=True)
lm.fit(x, y)

In [None]:
# In-sample predictions from the single-feature model.
y_pred = lm.predict(x)

# Fitted line in red, drawn together with the observed points.
plt.plot(x, y_pred, c='red')
plt.scatter(x=x, y=y)

plt.xlabel('Year')
plt.ylabel('Selling Price')
plt.show()

In [None]:
# Report the fitted slope and intercept of the Year-only model.
# lm.coef_ is a length-1 array (one feature), so index it explicitly instead of
# relying on implicit array-to-scalar conversion inside the % format.
print('Our m is %0.2f lakhs/year'%lm.coef_[0])
# The intercept is a price, not a rate, so its unit is lakhs (not lakhs/year).
print('Our b is %0.2f lakhs'%lm.intercept_)

In [None]:
# Manually evaluate the fitted line for a car of a given age.
m = lm.coef_
b = lm.intercept_
age = 5

# lm was fitted with calendar Year as its feature, so the age has to be turned
# into a model year before it is plugged into m * x + b; feeding the raw age
# would evaluate the line thousands of years outside the fitted range.
# NOTE(review): age is measured back from the newest model year present in
# the data -- confirm this is the intended reference year.
reference_year = data.Year.max()
year = reference_year - age

selling_price = m * year + b
print(selling_price)

In [None]:
# Work on a copy so the raw `data` frame stays untouched, then encode the
# Transmission labels as a 0/1 indicator column (Manual -> 1, Automatic -> 0).
car_data = data.copy()
car_data['TransmissionNumber'] = car_data['Transmission'].replace(
    {'Manual': 1, 'Automatic': 0}
)

In [None]:
# Multiple linear regression on year, transmission indicator and kilometres driven.
x2 = car_data[['Year', 'TransmissionNumber', 'Kms_Driven']]

# The `normalize` argument is no longer accepted by recent scikit-learn
# releases. Ordinary least squares predictions do not depend on column
# rescaling, so dropping it should leave the score unchanged up to
# floating-point error.
lm2 = linear_model.LinearRegression(fit_intercept = True)

lm2.fit(x2, y)

# Predict on the same three-column matrix the model was fitted on; the
# single-column `x` from the Year-only model has the wrong number of features.
y_pred2 = lm2.predict(x2)

In [None]:
# Score of the Year-only model on the data it was fitted on
# (LinearRegression.score returns R^2).
simple_score = lm.score(x[:, [0]], y)
print('Our linear model score was %0.4f' % simple_score)

In [None]:
# Score of the three-feature model on the data it was fitted on.
multiple_score = lm2.score(x2, y)
print('Our multiple linear model score was %0.4f' % multiple_score)

In [None]:
# Categorical to numeric: encode Seller_Type as a 0/1 indicator
# (Dealer -> 1, Individual -> 0).
car_data['Seller_TypeNumber'] = car_data.Seller_Type.replace({'Dealer':1,'Individual':0})

# Four-feature design matrix as a plain NumPy array.
x3 = car_data[['Year', 'TransmissionNumber','Seller_TypeNumber','Kms_Driven']].values

# The `normalize` argument is no longer accepted by recent scikit-learn
# releases. Ordinary least squares predictions do not depend on column
# rescaling, so dropping it should leave the score unchanged up to
# floating-point error.
lm3 = linear_model.LinearRegression(fit_intercept = True)

lm3.fit(x3,y)

# Scored on the same rows the model was fitted on (in-sample).
print('Our multiple linear score was %0.4f'%lm3.score(x3,y))


# END Copying - Begin new material

I wanted to better understand how to take each of the categorical variables into consideration for an improved linear model.

I noticed that each of the categorical variables was nominal, as there was no indication of order. I learned in the Kaggle micro-course, [Intermediate Machine Learning](https://www.kaggle.com/learn/intermediate-machine-learning), that there are several ways to handle categoricals based on the type of information contained in them. I chose a OneHotEncoder for all of them since none, in my opinion, were ordinal. It would make sense to use a LabelEncoder if there were a variable that was ordinal.

In [None]:
# Understand each Categorical Variable
# `s` is a boolean mask over the column names: True where the dtype is object.
s = data.dtypes.eq('object')
cat_cols = s.index[s].tolist()

# Print the describe() summary of every categorical column, each followed
# by blank lines to separate the outputs.
for cat_col in cat_cols:
    print(data[cat_col].describe(), '\n\n')

In [None]:
# We introduce One-Hot encoding for Categorical variables: Car_Name, Fuel_Type, Seller_Type, and Transmission
# Instead of converting each column into an Ordinal column, we convert them to Nominal. There may be an argument that
# Car_Name could be turned into Ordinal since Ritz > Mitsubishi, but I lack the knowledge to make the necessary calls.

# Load in sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Apply one-hot encoder to each column with categorical data.
# The encoder is left at its default sparse output and densified with
# .toarray(); this avoids the `sparse=` keyword, which was renamed in newer
# scikit-learn releases and later removed.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_matrix = OH_encoder.fit_transform(data[cat_cols]).toarray()

# Name every indicator column "<source column>_<category>". The encoder emits
# its output columns feature by feature, in the order of categories_, so the
# names line up with the matrix columns. String names also keep the final
# frame's column labels all-string; recent scikit-learn refuses to fit on a
# DataFrame whose column names mix str and int.
OH_col_names = [
    '%s_%s' % (source_col, category)
    for source_col, categories in zip(cat_cols, OH_encoder.categories_)
    for category in categories
]

# Build the indicator frame on the original index so the concat below aligns row by row.
OH_cols_data = pd.DataFrame(OH_matrix, columns=OH_col_names, index=data.index)

# Remove categorical columns (will replace with one-hot encoding)
num_X_data = data.drop(cat_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_data = pd.concat([num_X_data, OH_cols_data], axis=1)

# Build model: separate target from predictors
y = OH_data.Selling_Price
x4 = OH_data.drop('Selling_Price', axis=1)

# The `normalize` argument is no longer accepted by recent scikit-learn
# releases. Ordinary least squares predictions do not depend on column
# rescaling, so dropping it should leave the score unchanged up to
# floating-point error.
lm4 = linear_model.LinearRegression(fit_intercept = True)

lm4.fit(x4,y)

# NOTE(review): this score is computed on the same rows the model was fitted
# on, and Car_Name contributes one indicator column per distinct name, so the
# value is likely optimistic. train_test_split is imported above but not used
# here -- consider scoring on a held-out split.
print('Our multiple linear model with OH Encoding score was %0.4f'%lm4.score(x4,y))

In [None]:
# Check for patterns in data that were not captured with linear model
y_pred4 = lm4.predict(x4)

# Residual = observed - fitted, one value per row.
residuals = y - y_pred4

# Pass x and y by keyword (recent seaborn releases no longer accept them
# positionally) and pass the residuals as a 1-D sequence rather than wrapped
# in a one-element list, so they pair up with the fitted values row by row.
sns.regplot(x=y_pred4, y=residuals)
plt.xlabel('Fitted')
plt.ylabel('Residuals')
plt.show()

This plot tells me that the model predicts the price of the car well when the Selling_Price is between 0 and 12 lakhs. Beyond that, the model becomes unreliable. This is curious.

In [None]:
# Put the one-hot model's predictions next to the original columns, then keep
# only the listings whose actual Selling_Price is above 10 for inspection.
data_hp_pred = pd.DataFrame({"Predictions": y_pred4})
data_hp_join = pd.concat([data, data_hp_pred], axis='columns')

is_high_price = data_hp_join['Selling_Price'] > 10
data_hp = data_hp_join[is_high_price]
print(data_hp)