In [1]:
%reload_ext autoreload
%autoreload
%matplotlib inline

In [2]:
from fastai.tabular import *

In [3]:
path = Path('')

In [4]:
path.ls()

[PosixPath('Population by County - Copy of Sheet1.csv'),
 PosixPath('__pycache__'),
 PosixPath('Michigan Data - registrations.csv'),
 PosixPath('test.csv'),
 PosixPath('.ipynb_checkpoints'),
 PosixPath('Population by County - Copy of dataset.csv'),
 PosixPath('michigan.csv'),
 PosixPath('models'),
 PosixPath('Michigan Voters.ipynb'),
 PosixPath('ranger.py')]

### Random Forest Regression

https://towardsdatascience.com/random-forest-in-python-24d0893d51c0

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

In [6]:
dfr = pd.read_csv(path/'Population by County - Copy of dataset.csv')

In [7]:
dfr.head()

Unnamed: 0,countyid,pop_minus2,popminu3,popminus4,popminus5,popminus6,popminus7,registrationsminus1,votes_total
0,1001,51328,49676,48366,46800,45909,44889,29755.0,23641
1,1003,168121,162183,156266,151509,147957,144875,98382.0,81413
2,1005,27861,28027,28287,28594,28653,28863,16391.0,11630
3,1007,22099,22042,21721,21399,21199,21028,12024.0,8644
4,1009,55485,54624,54124,53457,52551,51845,28920.0,24267


In [8]:
labels = np.array(dfr['votes_total'])

In [9]:
features= dfr.drop(['votes_total'], axis = 1)

In [10]:
features.head()

Unnamed: 0,countyid,pop_minus2,popminu3,popminus4,popminus5,popminus6,popminus7,registrationsminus1
0,1001,51328,49676,48366,46800,45909,44889,29755.0
1,1003,168121,162183,156266,151509,147957,144875,98382.0
2,1005,27861,28027,28287,28594,28653,28863,16391.0
3,1007,22099,22042,21721,21399,21199,21028,12024.0
4,1009,55485,54624,54124,53457,52551,51845,28920.0


In [11]:
feature_list = list(features.columns)

In [12]:
features = np.array(features)

In [13]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [14]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (7002, 8)
Training Labels Shape: (7002,)
Testing Features Shape: (2334, 8)
Testing Labels Shape: (2334,)


{'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 110, 'bootstrap': True}

In [15]:
rf = RandomForestRegressor(max_depth=110,n_estimators = 1000, max_features='sqrt', random_state = 42)
rf.fit(train_features, train_labels);
predictions = rf.predict(test_features)
errors = abs(predictions - test_labels)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 4741.09 degrees.
Accuracy: 87.74 %.


In [16]:
test_features;

In [17]:
dft = pd.read_csv(path/'test.csv')

In [19]:
dft.shape

(3112, 8)

In [20]:
predictions = pd.DataFrame(rf.predict(dft))

In [22]:
predictions.to_csv('testresults.csv')

### Interpretation

In [27]:
fi = rf.feature_importances_

In [36]:
fi

array([0.003794, 0.169149, 0.162237, 0.161157, 0.176496, 0.143575, 0.14165 , 0.041942])

In [39]:
train_labels

array([ 16382,  40016,   5299,   6368, ...,  74029, 168709,   7191,  10579])

In [41]:
pd.DataFrame({'Variable':feature_list,
              'Importance':rf.feature_importances_}).sort_values('Importance', ascending=False)

Unnamed: 0,Variable,Importance
4,popminus5,0.176496
1,pop_minus2,0.169149
2,popminu3,0.162237
3,popminus4,0.161157
5,popminus6,0.143575
6,popminus7,0.14165
7,registrationsminus1,0.041942
0,countyid,0.003794


### Fine-Tuning the Model

https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [31]:
from pprint import pprint
pprint(rf.get_params())

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': 110,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [32]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [33]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_features, train_labels)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 20.9min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [50]:
rf_random.best_params_

NameError: name 'rf_random' is not defined

In [None]:
predict(trained_model, newdata)