In [1]:
%matplotlib inline
from matplotlib import pyplot as plt

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype, is_bool_dtype

In [2]:
data = pd.read_csv('imports-data.txt', names = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors',
          'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width',
          'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system',
          'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
          'price'])

In [5]:
data = data.replace('?',np.nan)

In [7]:
data['price'] = data['price'].astype('float64')

# Categories

In [8]:
for n,c in data.items():
    if is_string_dtype(c): data[n] = c.astype('category').cat.as_ordered()
    elif is_bool_dtype(c): data[n] = c.astype('category').cat.as_ordered()

In [9]:
for n,c in data.items():
    if is_categorical_dtype(c): data[n] = data[n].cat.codes


# Missing Values

In [14]:
#fill missing values of categoricals with median
for n,c in data.items():
    if is_numeric_dtype(c):
        data[n] = c.fillna(c.median())

# Split dependent variable into a separate variable

In [15]:
y = data['price'].values
df = data.drop('price',axis=1)

# Create train and validation set

In [16]:
#split randomly
X_train, X_valid, y_train, y_valid = train_test_split(df,y,test_size = .2)

In [17]:
m = RandomForestRegressor()
m.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [18]:
m.score(X_valid,y_valid)

0.56686141372776422

In [19]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())
rmse(m.predict(X_valid),y_valid)

4357.3385028650382

# Random Search

In [22]:
#max_features, min_sample_leaf
random_mf = np.random.uniform(0,1,size = 100)
random_ml = np.random.randint(1,100, size = 100)

rmse_list = []
for i in range(100):
    m = RandomForestRegressor(n_estimators = 40, min_samples_leaf=random_ml[i], max_features=random_mf[i])
    m.fit(X_train, y_train)
    rmse_list.append(rmse(m.predict(X_valid),y_valid))

In [24]:
np.argmin(rmse_list)

87

In [26]:
m = RandomForestRegressor(n_estimators = 40, min_samples_leaf=random_ml[np.argmin(rmse_list)], max_features=random_mf[np.argmin(rmse_list)])
m.fit(X_train, y_train)
rmse(m.predict(X_valid),y_valid)

3657.0410332698107

# Feature Importance

In [27]:
fi = pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}
                       ).sort_values('imp', ascending=False)

In [29]:
fi

Unnamed: 0,cols,imp
16,engine-size,0.235097
13,curb-weight,0.191518
23,city-mpg,0.124908
24,highway-mpg,0.097268
11,width,0.093019
10,length,0.057196
7,drive-wheels,0.044605
9,wheel-base,0.038569
21,horsepower,0.025229
17,fuel-system,0.025015
