# Random Forest Regressor
### Importing data:

In [1]:
import pandas as pd
from numpy import nan
import numpy as np
import scipy.stats as stats
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the inspection table that every later cell works on.
# NOTE(review): unpickling executes code embedded in the file, so only load
# this pickle if it was produced by a trusted step of this project.
violations = pd.read_pickle('data/violations.pkl')

In [3]:
# Lag features: for every inspection, copy the '*', '**' and '***' values of
# the same restaurant's most recent inspection dated at least one day earlier
# into last1 / last2 / last3.  Rows without such an earlier inspection stay NaN.
#
# The values are collected in a plain array and written back as whole columns.
# Assigning through `violations['last1'].iloc[row] = ...` goes via a temporary
# Series, which pandas does not guarantee to propagate to the frame (and never
# does under copy-on-write); it also required silencing the chained-assignment
# warning for the whole session.
source_cols = ['*', '**', '***']
lag_cols = ['last1', 'last2', 'last3']
one_day = timedelta(days = 1)

lagged = np.full((len(violations), len(lag_cols)), nan)

# GroupBy.indices maps each restaurant_id to the integer positions of its rows,
# so every lookup scans one restaurant's inspections instead of the full table.
for positions in violations.groupby('restaurant_id').indices.values():
    visits = violations.iloc[positions]
    for pos, when in zip(positions, visits['date']):
        earlier = visits[visits['date'] <= when - one_day]
        if len(earlier) > 0:
            latest = earlier.sort_values('date', ascending = False).iloc[0]
            lagged[pos] = latest[source_cols].values.astype(float)

for j, col in enumerate(lag_cols):
    violations[col] = lagged[:, j]

In [4]:
# Shuffle the rows before building the design matrix.  The fixed seed makes the
# row order (and therefore the cross-validation folds taken from it) the same
# after every Restart & Run All.
violations = violations.sample(frac = 1, random_state = 0)

# Columns that are not used as predictors: the date, the restaurant id, the
# three target columns and name / reviews / rating.
non_feature_cols = [
    'date', 'restaurant_id', '*', '**', '***', 'name', 'reviews', 'rating'
]
# axis is passed by keyword: the positional form drop(labels, 1) is no longer
# accepted since pandas 2.0.  Missing values (e.g. the lag columns of a
# restaurant's first inspection) are replaced by 0.
X = violations.drop(non_feature_cols, axis = 1).fillna(0)
# The three target columns are predicted jointly.
y = violations[['*', '**', '***']]

### Cross-validated hyperparameter tuning:

In [5]:
# train_test_split and GridSearchCV live in sklearn.model_selection; the former
# sklearn.cross_validation and sklearn.grid_search modules were removed in
# scikit-learn 0.20.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

def WRMSLE(clf, X, y):
    """Negated weighted root-mean-squared-log-error, usable as a scorer.

    Each row's three targets are collapsed into a single weighted total
    (weights 1, 2 and 5, in column order).  The model's predictions are
    rounded to whole numbers and collapsed the same way, and the root mean
    squared difference of ``log(total + 1)`` between the two is computed.

    Parameters
    ----------
    clf : fitted estimator whose ``predict(X)`` returns an (n_samples, 3) array.
    X : feature matrix, passed straight to ``clf.predict``.
    y : true targets of shape (n_samples, 3); a DataFrame or any array-like.

    Returns
    -------
    float
        The error multiplied by -1, so that larger values are better, as
        expected from a callable passed to ``GridSearchCV(scoring=...)``.
    """
    weights = np.array([1, 2, 5])
    # np.asarray accepts DataFrames and plain arrays alike.
    actual = np.asarray(y).dot(weights)
    predicted = np.round(clf.predict(X)).dot(weights)
    log_error = np.log1p(actual) - np.log1p(predicted)
    # Vectorised mean instead of the builtin sum(), which loops in Python.
    return -np.sqrt(np.mean(log_error ** 2))

# Chronological hold-out: rows dated before the cutoff form the training set,
# rows dated on or after it form the test set.
cutoff = '2015-06-17'
before_cutoff = violations.date < cutoff
from_cutoff = violations.date >= cutoff

Xtrain, ytrain = X[before_cutoff], y[before_cutoff]
Xtest, ytest = X[from_cutoff], y[from_cutoff]

# Exhaustive search over the forest's size/regularisation parameters, scored
# with the custom WRMSLE scorer on GridSearchCV's internal folds of the
# training rows.
gs = GridSearchCV(
        estimator = RandomForestRegressor(
            n_estimators = 300,
            # Fixed seed: an unseeded forest differs on every run, so the
            # selected parameters below would not be reproducible.
            random_state = 0
        ),
        param_grid = {
            'max_features' : np.arange(5, 51, 5),
            'min_samples_leaf' : np.arange(1, 23, 3),
            'min_samples_split' : np.arange(4, 12, 1),
        },
        n_jobs = -1,
        scoring = WRMSLE
)
gs.fit(Xtrain, ytrain)
# Keep the refitted best model for the evaluation cells and show its parameters.
clf = gs.best_estimator_
gs.best_params_

{'max_features': 35, 'min_samples_leaf': 4, 'min_samples_split': 8}

### Evaluation:
* Scores on training and test sets (R² from `clf.score`, and the weighted RMSLE on the test set)
* ROC curve and its AUC
* Precision-recall curve and its AUC
* Learning curve

In [6]:
# Default estimator score on the training rows vs. the held-out rows.
# NOTE(review): clf is the tuned RandomForestRegressor, so score() should be
# the R^2 coefficient rather than an accuracy — confirm in the scikit-learn docs.
clf.score(Xtrain, ytrain), clf.score(Xtest, ytest)

(0.79248752179609749, 0.26210021748245071)

In [7]:
# Custom scorer on the held-out rows.  WRMSLE returns the error negated, so
# values closer to 0 are better.
WRMSLE(clf, Xtest, ytest)

-0.87710300088409288

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
plt.rcParams['figure.figsize'] = (4, 4)
# NOTE(review): this cell has no recorded execution and looks carried over from
# a classification notebook.  clf is the best estimator of a grid search over
# RandomForestRegressor, and scikit-learn regressors are not expected to offer
# predict_proba; ytest also holds three target columns, whereas roc_curve is
# documented for a single binary label vector.  Confirm, then either derive a
# binary label and a score for it, or drop the cell.
xroc, yroc, _ = roc_curve(ytest, clf.predict_proba(Xtest)[:,1])
plt.plot(xroc, yroc)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()
# Area under the curve plotted above, shown as the cell's output.
auc(xroc,yroc)

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
# NOTE(review): this cell has no recorded execution.  It calls predict_proba
# on clf, which is the tuned RandomForestRegressor, and passes the
# three-column ytest where a single binary label vector is presumably
# expected — confirm, then adapt or remove.
# The first two values are plotted as x = precision and y = recall, matching
# the axis labels below.
xprc, yprc, _ = precision_recall_curve(ytest, clf.predict_proba(Xtest)[:,1])
plt.plot(xprc, yprc)
plt.xlabel('Precision')
plt.ylabel('Recall')
plt.show()
# Average precision for the same labels and scores, shown as the cell's output.
average_precision_score(ytest, clf.predict_proba(Xtest)[:,1])

In [None]:
# learning_curve lives in sklearn.model_selection; the sklearn.learning_curve
# module was removed in scikit-learn 0.20.
from sklearn.model_selection import learning_curve

# Refit the tuned model on growing portions of the data.  The sizes are given
# as fractions of the largest available training fold rather than absolute row
# counts, so the cell does not depend on the exact number of rows (absolute
# sizes beyond the fold size make learning_curve raise).
train_size, train_score, test_score = learning_curve(
    clf,
    X,
    y,
    n_jobs = -1,
    train_sizes = np.linspace(0.1, 1.0, 10)
)

# Average each curve over the cross-validation folds (axis 1).
plt.plot(train_size, train_score.mean(axis = 1), label='training')
plt.plot(train_size, test_score.mean(axis = 1), label='testing', c='red')
plt.legend(loc = 0)
# Only the upper limit is fixed: the held-out score reported earlier in this
# notebook is about 0.26, so a lower limit of 0.5 would cut the testing curve
# out of the figure.
plt.ylim(top = 1)
plt.xlabel('Number of training examples')
plt.ylabel('Scores')
plt.show()

### Feature importances, and a new model using only the 10 most important features:

In [None]:
# One row per column of X, paired with the importance the tuned forest assigned
# to it; displayed from most to least important.
featurelist = pd.DataFrame(
    {'name': X.columns, 'importance': clf.feature_importances_},
    columns = ['name', 'importance']
)
featurelist.sort_values(by = 'importance', ascending = False)

In [None]:
# Names of the ten features the tuned forest ranked highest.
top_features = featurelist.sort_values('importance', ascending = False).head(10).name

# Random 80/20 split on the reduced feature set.
# stratify=y was dropped: stratification is meant for class labels, and with a
# three-column numeric target any target combination that occurs only once
# makes the split fail.  The seed keeps the split reproducible.
# NOTE(review): this is a random split, unlike the date-based hold-out used for
# the full model, so the two models' test scores are not directly comparable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X[top_features],
    y, train_size=.8, random_state = 0
)

# Same kind of model and scorer as the full-feature search.  The previous
# RandomForestClassifier was never imported in this notebook (NameError) and
# does not fit the numeric targets that the rest of the notebook treats as a
# regression problem.
gs = GridSearchCV(
        estimator = RandomForestRegressor(n_estimators = 200, random_state = 0),
        param_grid = {
            'min_samples_leaf' : np.arange(1, 12, 1),
            'max_features' : np.arange(1, 11, 1)
        },
        n_jobs = -1,
        scoring = WRMSLE
)
gs.fit(Xtrain, ytrain)
# From here on clf refers to the reduced, ten-feature model.
clf = gs.best_estimator_
gs.best_params_

In [None]:
# Score of the reduced (top-10-feature) model on the train and test parts of
# its own split; clf, Xtrain and Xtest were all reassigned in the cell above.
clf.score(Xtrain, ytrain), clf.score(Xtest, ytest)