Evaluating synthetic data on a machine-learning task with random forests. Author: Sage Hahn

In [1]:
import numpy as np
from config import config
from utils import load_data, flip, convert
from sklearn.model_selection import train_test_split
from structure_learn import split_data
from scipy.stats import randint as sp_randint

import sklearn, pickle
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from time import time

In [2]:
# Load the dataset together with its column names and encoders.
data, names, encoders = load_data()

# train_test_split partitions along the first axis, so the first two axes are
# swapped before the split and swapped back on each resulting part.
data = np.swapaxes(data, 0, 1)
train, test = (
    np.swapaxes(part, 0, 1)
    for part in train_test_split(
        data,
        test_size=config['test_size'],
        random_state=config['ran_state'],
    )
)

# Sanity check: report the shape of each split.
print(train.shape)
print(test.shape)

(27, 244106)
(27, 61027)


In [3]:
# Replace `train` with the array stored in generated.pkl, swapping its first
# two axes so that axis 0 is what the evaluation loop iterates over.
# NOTE(review): pickle.load can execute arbitrary code -- only open files you trust.
with open('generated.pkl', 'rb') as pkl_file:
    train = np.swapaxes(pickle.load(pkl_file), 0, 1)

In [4]:
# Accumulator for the evaluation loop: column name -> test-set score.
result_dict = dict()

In [5]:
# For every column: treat it as the target, tune a random forest on `train`
# with a randomized hyper-parameter search, score the best estimator on the
# held-out `test` split, and record the score in `result_dict`.

# Seed the forest and the search with the same seed used for the train/test
# split so that re-running this cell reproduces the same scores.
clf1 = RandomForestClassifier(n_estimators=50, random_state=config['ran_state'])

# Candidate values sampled by the search (n_estimators here overrides the
# constructor value above).
# NOTE(review): every entry lists only two candidates, and `sp_randint` is
# imported at the top of the notebook but never used -- confirm whether integer
# ranges (e.g. sp_randint(1, 11)) were intended instead of two-element lists.
param_dist = {"n_estimators": [5, 30],
              "max_depth": [3, None],
              "max_features": [1, 11],
              "min_samples_split": [2, 11],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 30
random_search = RandomizedSearchCV(clf1, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=3, n_jobs=-1,
                                   random_state=config['ran_state'])

# The loop indexes `test` with indices taken from `train`, so both must have
# the same number of columns; fail early with a clear message otherwise.
assert len(train) == len(test), (
    "train has %d columns but test has %d" % (len(train), len(test)))

for ind in range(len(train)):
    col_name = names[ind]

    # split_data separates column `ind` (the target) from the remaining columns.
    x_train, y_train = split_data(train, ind)
    x_test, y_test = split_data(test, ind)

    print('predicting', col_name)

    start = time()
    random_search.fit(x_train, y_train)

    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))

    # best_estimator_ is refit on all of x_train; score it on the held-out data.
    best = random_search.best_estimator_
    test_score1 = best.score(x_test, y_test)

    print(col_name, ':', test_score1)

    result_dict[col_name] = test_score1
    

predicting ALS Unit
RandomizedSearchCV took 23.40 seconds for 30 candidates parameter settings.
ALS Unit : 0.9053697543710161
predicting Final Priority
RandomizedSearchCV took 9.24 seconds for 30 candidates parameter settings.
Final Priority : 0.9995411866878595
predicting Call Type Group




RandomizedSearchCV took 23.22 seconds for 30 candidates parameter settings.
Call Type Group : 0.8688613236764056
predicting Original Priority




RandomizedSearchCV took 24.80 seconds for 30 candidates parameter settings.
Original Priority : 0.924033624461304
predicting Priority




RandomizedSearchCV took 19.88 seconds for 30 candidates parameter settings.
Priority : 0.98138528847887
predicting City
RandomizedSearchCV took 21.81 seconds for 30 candidates parameter settings.
City : 0.9858095597030823
predicting Unit Type




RandomizedSearchCV took 22.51 seconds for 30 candidates parameter settings.
Unit Type : 0.8476084356104675
predicting Fire Prevention District




RandomizedSearchCV took 24.75 seconds for 30 candidates parameter settings.
Fire Prevention District : 0.9247710029986728
predicting Battalion




RandomizedSearchCV took 25.54 seconds for 30 candidates parameter settings.
Battalion : 0.9135464630409491
predicting Supervisor District




RandomizedSearchCV took 24.22 seconds for 30 candidates parameter settings.
Supervisor District : 0.858128369410261
predicting Call Final Disposition




RandomizedSearchCV took 39.05 seconds for 30 candidates parameter settings.
Call Final Disposition : 0.7416225605060055
predicting Zipcode of Incident




RandomizedSearchCV took 27.44 seconds for 30 candidates parameter settings.
Zipcode of Incident : 0.8415127730348863
predicting Call Type




RandomizedSearchCV took 26.25 seconds for 30 candidates parameter settings.
Call Type : 0.8295180821603553
predicting Neighborhooods - Analysis Boundaries




RandomizedSearchCV took 28.00 seconds for 30 candidates parameter settings.
Neighborhooods - Analysis Boundaries : 0.7021154570927622
predicting Station Area




RandomizedSearchCV took 30.45 seconds for 30 candidates parameter settings.
Station Area : 0.5931472954593868
predicting Watch Date




RandomizedSearchCV took 22.66 seconds for 30 candidates parameter settings.
Watch Date : 0.9858914906516788
predicting Received DtTm




RandomizedSearchCV took 16.05 seconds for 30 candidates parameter settings.
Received DtTm : 1.0
predicting Entry DtTm




RandomizedSearchCV took 15.92 seconds for 30 candidates parameter settings.
Entry DtTm : 0.9999836138102807
predicting Dispatch DtTm
RandomizedSearchCV took 18.19 seconds for 30 candidates parameter settings.
Dispatch DtTm : 0.9999836138102807
predicting Location - Lng




RandomizedSearchCV took 31.82 seconds for 30 candidates parameter settings.
Location - Lng : 0.4619758467563537
predicting Number of Alarms




RandomizedSearchCV took 13.79 seconds for 30 candidates parameter settings.
Number of Alarms : 0.9984105395972275
predicting Unit sequence in call dispatch




RandomizedSearchCV took 17.51 seconds for 30 candidates parameter settings.
Unit sequence in call dispatch : 0.956314418208334
predicting Location - Lat




RandomizedSearchCV took 23.39 seconds for 30 candidates parameter settings.
Location - Lat : 0.5108722368787586
predicting Call Date




RandomizedSearchCV took 19.56 seconds for 30 candidates parameter settings.
Call Date : 0.9814508332377472
predicting Unit ID




RandomizedSearchCV took 34.37 seconds for 30 candidates parameter settings.
Unit ID : 0.3309518737607944
predicting Box




RandomizedSearchCV took 40.76 seconds for 30 candidates parameter settings.
Box : 0.16048634211086896
predicting Address




RandomizedSearchCV took 36.15 seconds for 30 candidates parameter settings.
Address : 0.40747537974994674


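I will admit that I saved and processed the results from these runs in a terrible way. Note that the cells below were executed out of order (see their execution counts), and each one shows the `result_dict` from a different run.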

In [6]:
# Persist this run's scores. np.save appends ".npy" to the name, and since the
# payload is a dict it is stored as a pickled object array; read it back with
# np.load('r5.npy', allow_pickle=True).item().
np.save(file='r5', arr=result_dict)

# Show the scores as the cell output.
result_dict

{'ALS Unit': 0.9053697543710161,
 'Address': 0.40747537974994674,
 'Battalion': 0.9135464630409491,
 'Box': 0.16048634211086896,
 'Call Date': 0.9814508332377472,
 'Call Final Disposition': 0.7416225605060055,
 'Call Type': 0.8295180821603553,
 'Call Type Group': 0.8688613236764056,
 'City': 0.9858095597030823,
 'Dispatch DtTm': 0.9999836138102807,
 'Entry DtTm': 0.9999836138102807,
 'Final Priority': 0.9995411866878595,
 'Fire Prevention District': 0.9247710029986728,
 'Location - Lat': 0.5108722368787586,
 'Location - Lng': 0.4619758467563537,
 'Neighborhooods - Analysis Boundaries': 0.7021154570927622,
 'Number of Alarms': 0.9984105395972275,
 'Original Priority': 0.924033624461304,
 'Priority': 0.98138528847887,
 'Received DtTm': 1.0,
 'Station Area': 0.5931472954593868,
 'Supervisor District': 0.858128369410261,
 'Unit ID': 0.3309518737607944,
 'Unit Type': 0.8476084356104675,
 'Unit sequence in call dispatch': 0.956314418208334,
 'Watch Date': 0.9858914906516788,
 'Zipcode of Incident': 0.8415127730348863}

In [7]:
# Display the scores held in `result_dict`; the save call is left disabled.
# NOTE(review): the 'r3' label suggests this output belongs to a separate run
# of the evaluation loop -- confirm which run and input produced it.
#np.save('r3', result_dict)
result_dict

{'ALS Unit': 0.9240500106510233,
 'Address': 0.41170301669752735,
 'Battalion': 0.9078440690186311,
 'Box': 0.16446818621265996,
 'Call Date': 0.9814836056171858,
 'Call Final Disposition': 0.7204188310092254,
 'Call Type': 0.8286823864846707,
 'Call Type Group': 0.8436593638881151,
 'City': 0.9843511888180642,
 'Dispatch DtTm': 0.9999180690514035,
 'Entry DtTm': 0.9759614596817802,
 'Final Priority': 0.9999672276205613,
 'Fire Prevention District': 0.9206252969996886,
 'Location - Lat': 0.4862274075409245,
 'Location - Lng': 0.4198469529880217,
 'Neighborhooods - Analysis Boundaries': 0.7377062611630918,
 'Number of Alarms': 0.9981811329411572,
 'Original Priority': 0.9440903206777328,
 'Priority': 0.9815655365657824,
 'Received DtTm': 0.9911350713618562,
 'Station Area': 0.5915906074360529,
 'Supervisor District': 0.8332213610369181,
 'Unit ID': 0.36941026103200225,
 'Unit Type': 0.8417749520703951,
 'Unit sequence in call dispatch': 0.9287200747210251,
 'Watch Date': 0.9858914906516

In [25]:
# Display the scores held in `result_dict`; the save call is left disabled.
# NOTE(review): the 'r2' label suggests this output belongs to a separate run
# of the evaluation loop -- confirm which run and input produced it.
#np.save('r2', result_dict)
result_dict

{'ALS Unit': 0.92436134825569,
 'Address': 0.4116374719386501,
 'Battalion': 0.905582774837367,
 'Box': 0.23437167155521327,
 'Call Date': 0.9740606616743409,
 'Call Final Disposition': 0.7416225605060055,
 'Call Type': 0.8267324299080735,
 'Call Type Group': 0.7048519507758861,
 'City': 0.9434184868992413,
 'Dispatch DtTm': 0.9999344552411228,
 'Entry DtTm': 0.9974601405935077,
 'Final Priority': 0.9995903452570174,
 'Fire Prevention District': 0.9240500106510233,
 'Location - Lat': 0.24653022432693725,
 'Location - Lng': 0.3814213380962525,
 'Neighborhooods - Analysis Boundaries': 0.740033100103233,
 'Number of Alarms': 0.8992740917954348,
 'Original Priority': 0.9445327478001541,
 'Priority': 0.9810247923050454,
 'Received DtTm': 0.9999836138102807,
 'Station Area': 0.4942074819342258,
 'Supervisor District': 0.8041194880954332,
 'Unit ID': 0.3811919314401822,
 'Unit Type': 0.8414308420862897,
 'Unit sequence in call dispatch': 0.8492470545823979,
 'Watch Date': 0.9853343602012224,


In [17]:
# Display the scores held in `result_dict`; the save call is left disabled.
# NOTE(review): the 'r1' label suggests this output belongs to a separate run
# of the evaluation loop -- confirm which run and input produced it.
#np.save('r1', result_dict)
result_dict

{'ALS Unit': 0.9243121896865322,
 'Address': 0.40221541285004997,
 'Battalion': 0.8652727481278778,
 'Box': 0.25324856211185215,
 'Call Date': 0.9814836056171858,
 'Call Final Disposition': 0.7036229865469382,
 'Call Type': 0.8243892047782129,
 'Call Type Group': 0.7920264800825864,
 'City': 0.9858095597030823,
 'Dispatch DtTm': 0.9999836138102807,
 'Entry DtTm': 1.0,
 'Final Priority': 0.9983449948383503,
 'Fire Prevention District': 0.9234437216314091,
 'Location - Lat': 0.4792960492896587,
 'Location - Lng': 0.4730529110066036,
 'Neighborhooods - Analysis Boundaries': 0.692791715142478,
 'Number of Alarms': 0.9984105395972275,
 'Original Priority': 0.9435659626067151,
 'Priority': 0.9813361299097121,
 'Received DtTm': 0.9999836138102807,
 'Station Area': 0.5651105248496567,
 'Supervisor District': 0.7961394137021318,
 'Unit ID': 0.35197535517066214,
 'Unit Type': 0.841398069706851,
 'Unit sequence in call dispatch': 0.8553918757271372,
 'Watch Date': 0.9858751044619595,
 'Zipcode of

In [11]:
# Display the scores held in `result_dict`.
# NOTE(review): this cell has no save call or run label, and the scores shown
# are markedly higher than in the other result cells -- presumably a baseline
# trained on the real split rather than on generated.pkl; confirm.
result_dict

{'ALS Unit': 0.9516607403280515,
 'Address': 0.7512576400609566,
 'Battalion': 0.9819096465498878,
 'Box': 0.8256673275763187,
 'Call Date': 0.9874154062955741,
 'Call Final Disposition': 0.7901912268340243,
 'Call Type': 0.9022727645140676,
 'Call Type Group': 0.892129713077818,
 'City': 0.9990496009962804,
 'Dispatch DtTm': 0.9999836138102807,
 'Entry DtTm': 1.0,
 'Final Priority': 1.0,
 'Fire Prevention District': 0.9840398512133973,
 'Location - Lat': 0.980418503285431,
 'Location - Lng': 0.9686696052566897,
 'Neighborhooods - Analysis Boundaries': 0.9837285136087306,
 'Number of Alarms': 0.9999672276205613,
 'Original Priority': 0.9733560555164108,
 'Priority': 0.9853179740115031,
 'Received DtTm': 1.0,
 'Station Area': 0.9720123879594278,
 'Supervisor District': 0.9887754600422763,
 'Unit ID': 0.6220197617448014,
 'Unit Type': 0.9251314991724974,
 'Unit sequence in call dispatch': 0.9597882904288266,
 'Watch Date': 0.9907909613777508,
 'Zipcode of Incident': 0.9812050403919577}