In [1]:
import pandas as pd
import numpy as np

In [2]:
columns = [
    "mpg",
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model_year",
    "origin",
    "car_name",]

In [3]:
df = pd.read_csv('./data/auto-mpg.data', names=columns)
df[30:40]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
30,28.0,4,140.0,90.00,2264.0,15.5,71.0,1,"""chevrolet vega 2300"""
31,25.0,4,113.0,95.00,2228.0,14.0,71.0,3,"""toyota corona"""
32,25.0,4,98.0,?,,2046.0,19.0,71,1
33,19.0,6,232.0,100.0,2634.0,13.0,71.0,1,"""amc gremlin"""
34,16.0,6,225.0,105.0,3439.0,15.5,71.0,1,"""plymouth satellite custom"""
35,17.0,6,250.0,100.0,3329.0,15.5,71.0,1,"""chevrolet chevelle malibu"""
36,19.0,6,250.0,88.00,3302.0,15.5,71.0,1,"""ford torino 500"""
37,18.0,6,232.0,100.0,3288.0,15.5,71.0,1,"""amc matador"""
38,14.0,8,350.0,165.0,4209.0,12.0,71.0,1,"""chevrolet impala"""
39,14.0,8,400.0,175.0,4464.0,11.5,71.0,1,"""pontiac catalina brougham"""


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
mpg             398 non-null float64
cylinders       398 non-null int64
displacement    398 non-null float64
horsepower      398 non-null object
weight          398 non-null object
acceleration    398 non-null float64
model_year      398 non-null float64
origin          398 non-null int64
car_name        398 non-null object
dtypes: float64(4), int64(2), object(3)
memory usage: 28.1+ KB


In [5]:
def clean_data(df):
    # create list of columns to convert to numeric
    columns_to_convert = ['horsepower', 'weight']
    clean_df = df
    # iterate over the columns in our custom list
    for column in columns_to_convert:
        # set dataframe to itself, with the converted values
        clean_df = clean_df[pd.to_numeric(clean_df[column], errors='coerce').notnull()] # non-numeric will be set to NaN, then removed
    return(clean_df)
    

In [6]:
df.shape

(398, 9)

In [7]:
clean_df = clean_data(df)

In [8]:
clean_df.shape

(392, 9)

In [9]:
clean_df[30:40]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
30,28.0,4,140.0,90.0,2264.0,15.5,71.0,1,"""chevrolet vega 2300"""
31,25.0,4,113.0,95.0,2228.0,14.0,71.0,3,"""toyota corona"""
33,19.0,6,232.0,100.0,2634.0,13.0,71.0,1,"""amc gremlin"""
34,16.0,6,225.0,105.0,3439.0,15.5,71.0,1,"""plymouth satellite custom"""
35,17.0,6,250.0,100.0,3329.0,15.5,71.0,1,"""chevrolet chevelle malibu"""
36,19.0,6,250.0,88.0,3302.0,15.5,71.0,1,"""ford torino 500"""
37,18.0,6,232.0,100.0,3288.0,15.5,71.0,1,"""amc matador"""
38,14.0,8,350.0,165.0,4209.0,12.0,71.0,1,"""chevrolet impala"""
39,14.0,8,400.0,175.0,4464.0,11.5,71.0,1,"""pontiac catalina brougham"""
40,14.0,8,351.0,153.0,4154.0,13.5,71.0,1,"""ford galaxie 500"""


In [10]:
print(clean_df['mpg'].nunique())
print(clean_df['mpg'].min())
print(clean_df['mpg'].max())

127
9.0
46.6


In [11]:
bins = [0,15,23,30,35,50]
bin_names = ["0-15", "15-23", "23-30", "30-35", "35+"]
clean_df["mpg_class"] = pd.cut(clean_df["mpg"], bins=bins, labels=bin_names)

In [12]:
# verify bins
clean_df[["mpg","mpg_class"]].head(10)

Unnamed: 0,mpg,mpg_class
0,18.0,15-23
1,15.0,0-15
2,18.0,15-23
3,16.0,15-23
4,17.0,15-23
5,15.0,0-15
6,14.0,0-15
7,14.0,0-15
8,14.0,0-15
9,15.0,0-15


In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
le = LabelEncoder().fit(clean_df["mpg_class"])

In [15]:
clean_df["target"] = le.transform(clean_df["mpg_class"])

In [16]:
clean_df.sample(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name,mpg_class,target
308,33.5,4,151.0,90.0,2556.0,13.2,79.0,1,"""pontiac phoenix""",30-35,3
31,25.0,4,113.0,95.0,2228.0,14.0,71.0,3,"""toyota corona""",23-30,2
8,14.0,8,455.0,225.0,4425.0,10.0,70.0,1,"""pontiac catalina""",0-15,0
176,19.0,6,232.0,90.0,3211.0,17.0,75.0,1,"""amc pacer""",15-23,1
27,11.0,8,318.0,210.0,4382.0,13.5,70.0,1,"""dodge d200""",0-15,0
311,32.1,4,98.0,70.0,2120.0,15.5,80.0,1,"""chevrolet chevette""",30-35,3
2,18.0,8,318.0,150.0,3436.0,11.0,70.0,1,"""plymouth satellite""",15-23,1
190,14.5,8,351.0,152.0,4215.0,12.8,76.0,1,"""ford gran torino""",0-15,0
249,19.9,8,260.0,110.0,3365.0,15.5,78.0,1,"""oldsmobile cutlass salon brougham""",15-23,1
20,25.0,4,110.0,87.0,2672.0,17.5,70.0,2,"""peugeot 504""",23-30,2


In [17]:
clean_df.corr()

Unnamed: 0,mpg,cylinders,displacement,acceleration,model_year,origin,target
mpg,1.0,-0.777618,-0.805127,0.423329,0.580541,0.565209,0.96524
cylinders,-0.777618,1.0,0.950823,-0.504683,-0.345647,-0.568932,-0.73825
displacement,-0.805127,0.950823,1.0,-0.5438,-0.369855,-0.614535,-0.772535
acceleration,0.423329,-0.504683,-0.5438,1.0,0.290316,0.212746,0.400492
model_year,0.580541,-0.345647,-0.369855,0.290316,1.0,0.181528,0.594228
origin,0.565209,-0.568932,-0.614535,0.212746,0.181528,1.0,0.561862
target,0.96524,-0.73825,-0.772535,0.400492,0.594228,0.561862,1.0


In [18]:
X = clean_df.drop(columns=["mpg", "mpg_class", "target", "car_name"])
X.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504.0,12.0,70.0,1
1,8,350.0,165.0,3693.0,11.5,70.0,1
2,8,318.0,150.0,3436.0,11.0,70.0,1
3,8,304.0,150.0,3433.0,12.0,70.0,1
4,8,302.0,140.0,3449.0,10.5,70.0,1


In [19]:
y = clean_df["target"]
y.head()

0    1
1    0
2    1
3    1
4    1
Name: target, dtype: int32

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
clf = RandomForestClassifier(n_estimators=200, random_state=0)

In [24]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [25]:
clf.score(X_test, y_test)

0.7244897959183674

In [26]:
import pickle

In [27]:
filename="./model/mpg_model"
pickle.dump(clf, open(filename, 'wb'))

In [None]:
np.save('./model/classes.npy', le.classes_)