In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import eli5
from eli5.sklearn import PermutationImportance

In [2]:
# Load the raw training table from the working directory.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Show the column labels of the loaded frame to see which fields are available.
data.columns

Index(['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count'],
      dtype='object')

In [4]:
# Restrict the data to trips whose pickup and drop-off coordinates both fall
# strictly inside the 40.7..40.8 latitude / -74..-73.9 longitude box, and
# whose fare is positive. Adjacent string literals are joined by the parser,
# so the expression passed to `query` is a single string.
data = data.query(
    'pickup_latitude > 40.7 and pickup_latitude < 40.8 and '
    'dropoff_latitude > 40.7 and dropoff_latitude < 40.8 and '
    'pickup_longitude > -74 and pickup_longitude < -73.9 and '
    'dropoff_longitude > -74 and dropoff_longitude < -73.9 and '
    'fare_amount > 0'
)

In [5]:
# Prediction target: the fare column of the filtered frame.
y = data['fare_amount']

# Baseline predictors: only the four raw pickup / drop-off coordinates.
base_features = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]
x = data.loc[:, base_features]

# Hold out a validation split. No test_size is passed, so the library's
# default proportion applies; the fixed random_state makes the split repeatable.
train_x, val_x, train_y, val_y = train_test_split(x, y, random_state=123)

# Fit the baseline forest on the training portion only.
first_model = RandomForestRegressor(
    n_estimators=30,
    random_state=123,
).fit(train_x, train_y)

In [6]:
# Permutation importance for the baseline model, computed on the held-out
# validation split with a fixed seed.
perm = PermutationImportance(
    first_model,
    random_state=123,
).fit(val_x, val_y)

In [7]:
# Render the importance table, labelling each row with the matching
# validation-frame column name.
eli5.show_weights(perm, feature_names=list(val_x.columns))

Weight,Feature
0.8064  ± 0.0242,dropoff_latitude
0.7942  ± 0.0430,pickup_latitude
0.5396  ± 0.0418,pickup_longitude
0.4747  ± 0.0206,dropoff_longitude


In [8]:
# Engineer trip-geometry features from the raw coordinates.
#
# `assign` builds a new frame instead of writing columns into the filtered
# frame in place. That keeps the cell idempotent on re-run and avoids the
# chained-assignment ambiguity pandas can warn about when columns are set on a
# frame that was produced by filtering another one. Row order, index and the
# order of the appended columns are the same as with direct column assignment.
data = data.assign(
    # Absolute coordinate deltas between drop-off and pickup.
    longitude_diff=lambda d: (d.dropoff_longitude - d.pickup_longitude).abs(),
    latitude_diff=lambda d: (d.dropoff_latitude - d.pickup_latitude).abs(),
    # NOTE: no square root is taken, so this is the *squared* straight-line
    # separation of the two points. The column keeps the name 'distance'
    # because the feature list below refers to it by that name.
    distance=lambda d: d.longitude_diff ** 2 + d.latitude_diff ** 2,
)

# Feature set for the second model: the raw coordinates, the passenger count,
# and the three engineered columns created above.
more_features = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'longitude_diff',
    'latitude_diff',
    'distance',
]

In [9]:
# Design matrix for the extended feature set.
x2 = data.loc[:, more_features]

# Split again with the same seed. Note that train_y / val_y are rebound here;
# presumably this reproduces the earlier target partition because y and
# random_state are unchanged -- verify if the earlier split is still needed.
train_x2, val_x2, train_y, val_y = train_test_split(x2, y, random_state=123)

# Same forest configuration as the baseline, trained on the richer features.
second_model = RandomForestRegressor(
    n_estimators=30,
    random_state=123,
).fit(train_x2, train_y)

# Permutation importance of the second model on its validation split.
perm2 = PermutationImportance(
    second_model,
    random_state=123,
).fit(val_x2, val_y)

In [10]:
# Render the importance table for the extended model, labelling rows with the
# validation-frame column names.
eli5.show_weights(perm2, feature_names=list(val_x2.columns))

Weight,Feature
0.9047  ± 0.0216,distance
0.2038  ± 0.0068,latitude_diff
0.0838  ± 0.0038,longitude_diff
0.0286  ± 0.0107,dropoff_longitude
0.0274  ± 0.0189,dropoff_latitude
0.0265  ± 0.0102,pickup_latitude
0.0251  ± 0.0126,pickup_longitude
0.0015  ± 0.0068,passenger_count
