In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

  import pandas.util.testing as tm


# Getting the Data

In [32]:
cd "/content/drive/My Drive/Colab Notebooks/ML_Projects/Car Sales"

/content/drive/My Drive/Colab Notebooks/ML_Projects/Car Sales


In [41]:
car_sales = pd.read_csv('data/car-sales-extended-missing-data.csv')
car_sales.head(5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [42]:
car_sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [43]:
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


#(a) Filling missing data with pandas

Remove rows with missing price value

In [46]:
car_sales.dropna(subset=['Price'],inplace=True)

In [47]:
car_sales_pd = car_sales

In [48]:
#Fill "Make" Column
car_sales_pd["Make"].fillna("missing",inplace=True)
#Fill "Colour" Column
car_sales_pd["Colour"].fillna("missing",inplace=True)
#Fill "Odometer (KM)" Column
car_sales_pd["Odometer (KM)"].fillna(car_sales_pd["Odometer (KM)"].mean(),inplace=True)
#Fill "Doors" Column
car_sales_pd["Doors"].fillna(4,inplace=True)


In [None]:
X_filled = car_sales.drop('Price',axis=1)
y = car_sales['Price']

#(b) Filling missing data with Sklearn - (alternative to a)

In [82]:
car_sales.dropna(subset=['Price'],inplace=True)

In [83]:
X = car_sales.drop('Price',axis=1)
y = car_sales['Price']

In [84]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

#Fill Categorical Values with 'missing' and numerical with mean
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy="constant",fill_value="missing")
door_imputer = SimpleImputer(strategy="constant",fill_value=4)

cat_features = ['Make', 'Colour']
num_features = ['Odometer (KM)']
door_features = ['Doors']


In [85]:
imputer = ColumnTransformer([('cat_imputer',cat_imputer,cat_features),
                   ('door_imputer',door_imputer,door_features),
                   ('num_imputer',num_imputer,num_features)],remainder='passthrough')
X_filled = imputer.fit_transform(X)

In [89]:
X_filled = pd.DataFrame(X_filled,columns=['Make', 'Colour', 'Odometer (KM)', 'Doors'])

#Encoding features

In [90]:
#categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                  one_hot,
                                  categorical_features)],
                                remainder = "passthrough")
transformed_X = transformer.fit_transform(X_filled)
transformed_X

<950x913 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [99]:
from sklearn.model_selection import train_test_split

In [100]:
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2, random_state=101)

#Choosing the Right Model and Hyperparameters

In [101]:
from sklearn.ensemble import RandomForestRegressor

In [102]:
clf = RandomForestRegressor()
#keeping the default hyperparameters

In [95]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

#Fit the model to data

In [27]:
clf.fit(X_train,y_train);

In [28]:
y_preds = clf.predict(X_test)

#Evaluating the Model

In [29]:
clf.score(X_train,y_train)

0.8711081959218923

In [30]:
clf.score(X_test,y_test)

0.32632136129855527

#Improve a model

In [None]:
#trying different amount for n_estimators

In [None]:
for i in range (10,100,10):
  print(f"trying model with {i} estimator...")
  clf = RandomForestClassifier(n_estimators=i,random_state=30).fit(X_train,y_train)
  print(f"Model accuracy on test set {clf.score(X_test,y_test)*100:.2f}%")
  print(" ")

trying model with 10 estimator...
Model accuracy on test set 75.41%
 
trying model with 20 estimator...
Model accuracy on test set 86.89%
 
trying model with 30 estimator...
Model accuracy on test set 86.89%
 
trying model with 40 estimator...
Model accuracy on test set 83.61%
 
trying model with 50 estimator...
Model accuracy on test set 86.89%
 
trying model with 60 estimator...
Model accuracy on test set 88.52%
 
trying model with 70 estimator...
Model accuracy on test set 86.89%
 
trying model with 80 estimator...
Model accuracy on test set 86.89%
 
trying model with 90 estimator...
Model accuracy on test set 86.89%
 


In [None]:
#best results with 60 estimators
clf = RandomForestClassifier(n_estimators=60,random_state=30).fit(X_train,y_train)
print(f"Model accuracy on test set {clf.score(X_test,y_test)*100:.2f}%")

Model accuracy on test set 88.52%


#Save Model

In [None]:
import pickle

In [None]:
pickle.dump(clf,open("random_forest_model_1.pk1","wb"))

In [None]:
loaded_model = pickle.load(open("random_forest_model_1.pk1","rb"))

In [None]:
print(f"Validation score of loaded model {loaded_model.score(X_test,y_test)*100:.2f}%")

Validation score of loaded model 88.52%
