**Machine learning project**

Predicting MSRP (manufacturer's suggested retail price)
* Read the cars CSV file and explore its columns
* Select features (explore mutual information and feature engineering to select and create features)
* Select target
* Preprocess data:
1. Clean missing values in numeric features
2. Clean missing values in categorical features
3. Encode categorical features
* Split data into training and validation sets (single split or cross-validation?)
* Choose model (regression or classification)
* Model validation: create a function to test performance of the model
* Optimize model parameters (best leaf nodes, n estimators, patience, learning rate...) 
* Build final model train it on all data

In [1]:
import pandas as pd   
import numpy as np
# Mount Google Drive at /content/gdrive (requires the Colab runtime)
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Location of the cars dataset on the mounted Google Drive.
cars_csv_path = '/content/gdrive/MyDrive/ML/cars.csv'
df = pd.read_csv(cars_csv_path)

In [7]:
# Count missing values per column. This cell's execution count (7) is later
# than the cleaning cells further down, so the saved output reflects the
# frame after cleaning, not the raw CSV.
df.isna().sum()

Make                 0
Model                0
Year                 0
Engine Fuel Type     0
Engine HP            0
Engine Cylinders     0
Transmission Type    0
Driven_Wheels        0
Number of Doors      0
Vehicle Size         0
Vehicle Style        0
highway MPG          0
city mpg             0
Popularity           0
MSRP                 0
dtype: int64

In [4]:
# Dimensions of the frame as (rows, columns)
df.shape

(11914, 16)

In [5]:
# Fill gaps by assigning the result back to the column. The previous
# `df[col].fillna(..., inplace=True)` form mutates a temporary Series, which
# warns in pandas 2.x and leaves `df` unchanged under copy-on-write.
df["Engine Fuel Type"] = df["Engine Fuel Type"].fillna("regular unleaded")

# errors='ignore' keeps this cell re-runnable after the column is already gone.
df = df.drop(columns=['Market Category'], errors='ignore')

df["Number of Doors"] = df["Number of Doors"].fillna(4)

# Missing cylinder counts: 0 for electric vehicles, 4 for everything else.
m = df['Engine Cylinders'].isna()
df.loc[m, 'Engine Cylinders'] = np.where(df.loc[m, 'Engine Fuel Type'].eq('electric'), 0, 4)


In [6]:
# Linearly interpolate the remaining gaps, restricted to numeric columns.
# Calling DataFrame.interpolate on a frame that still holds object (string)
# columns is deprecated in pandas 2.x; those columns cannot be interpolated
# anyway, so they are left untouched here.
columns_to_interpolate = df.select_dtypes(include='number').columns
df[columns_to_interpolate] = df[columns_to_interpolate].interpolate(
    method='linear', limit_direction='forward')

In [8]:
from sklearn.model_selection import train_test_split

# Target is the MSRP column; every other column is a candidate feature.
y = df.MSRP
X = df.drop(['MSRP'], axis=1)

# Single 80/20 hold-out split with a fixed seed for reproducibility.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

# Text columns that take more than one distinct value in the training split.
# (The original `1and` fused a number and a keyword, which newer Python
# versions flag as invalid syntax.)
categorical_cols = [cname for cname in X_train_full.columns
                    if X_train_full[cname].nunique() > 1
                    and X_train_full[cname].dtype == "object"]

# Integer and float columns.
numerical_cols = [cname for cname in X_train_full.columns
                  if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep only the selected columns, categorical first, then numeric.
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
categorical_cols

['Make',
 'Model',
 'Engine Fuel Type',
 'Transmission Type',
 'Driven_Wheels',
 'Vehicle Size',
 'Vehicle Style']

No NaN values

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numeric features: replace missing entries with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [10]:
# Display the training features (selected categorical columns followed by the numeric ones)
X_train

Unnamed: 0,Make,Model,Engine Fuel Type,Transmission Type,Driven_Wheels,Vehicle Size,Vehicle Style,Year,Engine HP,Engine Cylinders,Number of Doors,highway MPG,city mpg,Popularity
9030,GMC,Savana,flex-fuel (unleaded/E85),AUTOMATIC,rear wheel drive,Large,Passenger Van,2016,285.0,8.0,3.0,16,11,549
8817,GMC,S-15,regular unleaded,MANUAL,four wheel drive,Compact,Regular Cab Pickup,1990,160.0,6.0,2.0,19,15,549
7759,Audi,Q3,premium unleaded (recommended),AUTOMATIC,all wheel drive,Compact,4dr SUV,2017,200.0,4.0,4.0,28,20,3105
4896,Scion,FR-S,premium unleaded (required),AUTOMATIC,rear wheel drive,Compact,Coupe,2014,200.0,4.0,2.0,34,25,105
2151,Dodge,Caliber,regular unleaded,MANUAL,front wheel drive,Compact,Wagon,2010,172.0,4.0,4.0,29,23,1851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4859,Kia,Forte,regular unleaded,AUTOMATIC,front wheel drive,Compact,Sedan,2015,173.0,4.0,4.0,36,25,1720
3264,Cadillac,CTS,premium unleaded (required),AUTOMATIC,rear wheel drive,Large,Sedan,2016,420.0,6.0,4.0,24,16,1624
9845,Kia,Sorento,regular unleaded,AUTOMATIC,all wheel drive,Midsize,4dr SUV,2017,290.0,6.0,4.0,25,18,1720
10799,Mazda,Tribute Hybrid,regular unleaded,AUTOMATIC,all wheel drive,Compact,4dr SUV,2009,177.0,4.0,4.0,27,29,586


In [11]:
# NOTE(review): this cell re-creates the same preprocessing objects as the
# earlier preprocessing cell (same imputers, encoder and column lists). It only
# rebinds the same names and is redundant; consider removing it.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numeric columns: mean imputation
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: most-frequent imputation followed by one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each transformer to its own list of columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees; the fixed seed makes repeated runs comparable.
model = RandomForestRegressor(
    random_state=0,
    n_estimators=100,
)

In [13]:
from sklearn.metrics import mean_absolute_error

# Chain preprocessing and the regressor so both are fitted on the training split only.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
my_pipeline = Pipeline(steps=pipeline_steps)
my_pipeline.fit(X_train, y_train)

# Score on the hold-out split with mean absolute error.
preds = my_pipeline.predict(X_valid)
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

MAE: 3244.5917188833414


In [None]:
# Mean of the target (MSRP); presumably shown to put the MAE printed above in context
y.mean()

40594.737032063116

In [15]:
from sklearn.base import clone

# Train the final model on all rows using an unfitted copy of the pipeline.
# Refitting `my_pipeline` itself would silently replace the validated model
# with one that has seen the validation rows, so re-running the validation
# cell afterwards would report a misleadingly low error.
final_model = clone(my_pipeline).fit(X, y)

In [17]:
# pickle is used in the next cell to serialize the fitted pipeline.
# (The original line had a file path fused onto the import, which is a syntax error.)
import pickle

In [18]:
import pickle

# Persist the fitted pipeline to the working directory.
filename = 'finalized_model.sav'
# The context manager flushes and closes the file handle even if dump() raises;
# the previous bare open() left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(final_model, model_file)