# Car Price Prediction - A Training Exercise

## Can I predict car prices? What features drive the price of the car?

In [None]:
# Import needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Load the car price dataset from the Kaggle input directory into a DataFrame
csv_path = r'../input/car-data/CarPrice_Assignment.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first 10 rows of the dataset
df.iloc[:10]

In [None]:
# Describe the dataframe (summary statistics of its columns)
df.describe()

In [None]:
# Print every feature name next to its data type
for column_name, column_dtype in df.dtypes.items():
    print(column_name, column_dtype)

In [None]:
# Count the missing values in each column
df.isna().sum()

## Numerical Data Analysis

In [None]:
# Rank the numerical features by the absolute value of their correlation with price,
# strongest first.
# Only the numeric columns are passed to corr(): the frame also holds text columns,
# which recent pandas versions refuse to correlate instead of silently skipping.
price_corr = df.select_dtypes(['number']).corr()['price'].abs()
# Remove price's trivial self-correlation by label rather than by position
price_corr.drop('price').sort_values(ascending = False)

In [None]:
# Visualize the correlation matrix of numerical features.
# Restrict to numeric columns so corr() does not fail on the text columns.
cor = df.select_dtypes(['number']).corr()
# Create a mask to hide the upper triangle of the plot (it mirrors the lower one)
mask = np.triu(np.ones_like(cor, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(10, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(240, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio.
# The colour scale spans the full correlation range [-1, 1]; a tighter limit would
# saturate and make strong and moderate correlations indistinguishable.
sns.heatmap(cor, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title('Correlation matrix of numerical features');

### Hypothesis: 
#### Car price is positively correlated with: enginesize, curbweight, horsepower, carwidth
#### Car price is negatively correlated with: citympg, highwaympg (The price increases as the feature value decreases and vice versa)

In [None]:
# Visualize the hypothesis: price against each of the six selected numerical features.
# The theme is set before the figure is created so that it applies to these axes.
sns.set_theme(style="whitegrid")
fig, ((ax1,ax2,ax3),(ax4,ax5,ax6)) = plt.subplots(2,3, sharey = True, figsize = (10,10))
fig.suptitle('The variation of car price vs selected features', fontsize=16)
sns.despine(fig)  # remove the right and upper spines of the figure

ax1.plot(df.enginesize, df.price, 'o', color = 'red', alpha = 0.5)
ax1.set_title('Engine Size')

ax2.plot(df.curbweight, df.price,'+', color = 'b')
ax2.set_title('Curb Weight')

# x and y are passed by keyword: recent seaborn versions no longer accept them positionally
sns.scatterplot(x = 'horsepower', y = 'price', data = df, ax = ax3)
# seaborn labels the axes automatically; clear them to match the other panels
ax3.set(xlabel = None, ylabel = None)
ax3.set_title('Horsepower')

ax4.plot(df.carwidth, df.price, 'v', color = 'yellow')
ax4.set_title('Car Width')

ax5.plot(df.citympg, df.price, '<', color = 'green')
ax5.set_title('City mpg')

ax6.plot(df.highwaympg, df.price, 'x', color = 'orange')
ax6.set_title('Highway mpg')

# The y axis is shared, so label it once per row on the left-most panel
for left_ax in (ax1, ax4):
    left_ax.set_ylabel('Price')

## Categorical Data Analysis

In [None]:
# Collect the object-dtype (categorical) columns into their own dataframe
cat_df = df.select_dtypes(include = ['object'])
# Show which features were picked up
cat_df.columns

In [None]:
# One-hot encode the categories: each category value becomes its own binary column,
# giving the categorical data a numerical form
cat_df = pd.get_dummies(data = cat_df)
cat_df.iloc[:5]

In [None]:
# Attach the target column (price) to the encoded categorical frame
cat_df = cat_df.assign(price = df['price'])

In [None]:
# List the 20 categories that have the highest (most positive) correlation with price.
# price's own entry (correlation 1.0) is dropped first so that all 20 rows are categories.
cat_df.corr()['price'].drop('price').sort_values(ascending = False).head(20)

### Hypothesis: 
#### drivewheel, fuelsystem, cylindernumber are correlated with price
#### The brand of the car is not included in these top correlated features. Can you predict the price without knowing the brand?

In [None]:
# Compare the mean car price across the values of each selected categorical feature.
fig, (ax1,ax2,ax3) = plt.subplots(1,3, sharey = True, figsize = (9,4))
fig.suptitle('The variation of car price vs selected features', fontsize=16)
sns.despine(fig)

# (axis, column, panel title, bar colour, x tick label rotation)
panels = [
    (ax1, 'drivewheel', 'Drive Wheel', None, 0),
    (ax2, 'cylindernumber', 'Cylinders', 'red', 90),
    (ax3, 'fuelsystem', 'Fuel System', 'green', 90),
]

for ax, column, title, color, rotation in panels:
    # Aggregate first: drawing one bar per row stacks the bars of a category on top
    # of each other, so only the most expensive car of each category would be visible.
    mean_price = df.groupby(column)['price'].mean().sort_values(ascending = False)
    ax.bar(mean_price.index, mean_price.values, color = color)
    # Rotate only the x tick labels; the shared y ticks stay horizontal
    ax.tick_params(axis = 'x', rotation = rotation)
    ax.set_title(title)

# The y axis is shared, so a single label on the left-most panel is enough
ax1.set_ylabel('Mean price')


## Normalizing and Splitting the Data with the Selected Features Only

In [None]:
# Combine the numeric columns with the one-hot encoded categorical columns
# (the original 'object' columns are left behind).
# The two frames are aligned row by row on their shared index. Joining on 'price'
# would be wrong: several cars share the same price, so every such row would be
# matched with every other one, duplicating rows and mixing up their categories.
numeric_df = df.select_dtypes(['number'])
# price is already part of the numeric frame; do not bring in a second copy
dummies_df = cat_df.drop(columns = ['price'], errors = 'ignore')
encoded_df = pd.concat([numeric_df, dummies_df], axis = 1)
# drop car_ID to avoid data leakage
encoded_df = encoded_df.drop(columns = ['car_ID'])
# Sanity check: combining the frames must not add or remove any cars
assert len(encoded_df) == len(df)
encoded_df.head()

In [None]:
# Pick the features named in the two hypotheses: numerical ones first,
# followed by the one-hot encoded categorical ones
numeric_features = ['curbweight', 'enginesize', 'horsepower', 'carwidth', 'citympg', 'highwaympg']
categorical_features = ['drivewheel_rwd', 'fuelsystem_mpfi', 'cylindernumber_eight', 'cylindernumber_six']
features = numeric_features + categorical_features

# Feature matrix and target vector
y = encoded_df['price']
X = encoded_df[features]

In [None]:
# Hold out part of the data for testing; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split

# test_size = 0.25 is scikit-learn's default, spelled out here for the reader
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = 0,
)

In [None]:
# Rescale the features with a single scaler that is fitted on the training set only
# and then applied to both the training and the test set
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

## Model Selection

In [None]:
# Cross-validate a linear regression model on the training data
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

linreg = LinearRegression()
# 5-fold cross-validation: five train/test pairs, one score per fold
cv_scores = cross_val_score(
    estimator = linreg,
    X = X_train_transformed,
    y = y_train,
    cv = 5,
)

In [None]:
# Average cross-validation score of the linear regression
cv_scores.mean()

In [None]:
# Check the cross-validated scores of RandomForestRegressor for max_features from 2 to 5
from sklearn.ensemble import RandomForestRegressor

# Each entry is (max_features, mean 5-fold cross-validation score)
score_list = []
for i in range(2,6):
    forest = RandomForestRegressor(max_features = i, random_state = 0)
    # A separate name keeps the linear regression's cv_scores intact, so the cell
    # that averages them still reports the linear model when it is re-run.
    forest_cv_scores = cross_val_score(forest, X_train_transformed, y_train, cv = 5)
    score_list.append((i, np.mean(forest_cv_scores)))

score_list

## Build and Test the Model

In [None]:
# Build the final random forest regressor on the scaled training data.
# random_state is fixed (same seed as in the max_features comparison) so that the
# reported scores can be reproduced on every run.
clf = RandomForestRegressor(max_features = 4, random_state = 0).fit(X_train_transformed, y_train)

# get the scores (for a regressor, score() is the coefficient of determination R^2)
train_score = clf.score(X_train_transformed, y_train)
test_score = clf.score(X_test_transformed, y_test)

# print scores
print(f'Train score: {train_score}')
print(f'Test score : {test_score}')

## With a test score (R²) of 0.921, the model could predict the price of the car without considering its brand. Does that mean you can predict the price in real time the same way?

### You are welcome to comment on this.