In [1]:
!python --version

Python 2.7.14


# ML Code Walkthrough

In [2]:
from datetime import date
from preprocess_raw import *
from preprocess_transformed import *
from model_setup_fit import *
from model_optimization import *
import warnings
warnings.filterwarnings("ignore")
pd.get_option("display.max_rows",999)
pd.get_option("display.max_columns",999)
pd.get_option("display.width",None)

GITHUB = os.environ.get("GITHUB")
sys.path.insert(0, GITHUB + 'Machine_Cleaning_JAM/modeling/src')

In [3]:
import time

def print_time(start, end):
    t_sec = round(end - start)
    (t_min, t_sec) = divmod(t_sec,60)
    (t_hour,t_min) = divmod(t_min,60) 
    print('Time passed: {}hour:{}min:{}sec'.format(t_hour,t_min,t_sec))

### Pull in features from ESH Postgres databases

In [4]:
!pwd

/Users/sierra/Documents/ESH/ficher/Machine_Cleaning_JAM/src


In [5]:
raw_data = get_table_from_db('get_data_2019_train.sql', 'file', 
                                     HOST=HOST_FORKED, 
                                     USER=USER_FORKED, 
                                     PASSWORD=PASSWORD_FORKED,
                                     DB=DB_FORKED)
print "raw_data data shape: ", raw_data.shape
print("\n")

Querying data from DB connection
Trying to establish initial connection to the server
Success!
Finished querying data
raw_data data shape:  (47165, 118)




## 1. Preprocessing

In [6]:
start = time.time()
print("*** STARTING PRE-PROCESSING FOR RAW DATA ***")
raw_preprocess = PreprocessRaw(raw_data, verbose=True, corr_threshold=0.93) #setting a higher threshold for dropping variables due to high correlation with others
raw_preprocessed = raw_preprocess.applyall_raw()

training_data_raw = raw_preprocessed.getdata()
print("Shape training_data_raw: ", training_data_raw.shape)

end = time.time()
print("\n")
print_time(start, end)

*** STARTING PRE-PROCESSING FOR RAW DATA ***
Dropped 0 duplicate rows
Dropped null columns: 
['actual_start_date', 'fiber_type', 'fiber_sub_type', 'total_project_plant_route_feet', 'average_cost_per_foot_of_outside_plant', 'total_strands', 'number_of_erate_eligible_strands', 'match_amount', 'source_of_matching_funds', 'total_amount_financed', 'total_number_of_terms_in_months', 'annual_interest_rate', 'baloon_payment', 'special_construction_state_tribal_match_percentage', 'pending_reason', 'fcc_form486', 'fcc_form486_case_status', 'fcc_form486_invoicing_ready', 'last_date_to_invoice', 'wave_sequence_number', 'fcdl_letter_date', 'user_generated_fcdl_date', 'fcdl_comment_app', 'fcdl_comment_frn', 'appeal_wave_number', 'revised_fcdl_date', 'invoicing_mode', 'total_authorized_disbursement', 'connection_used_by', 'make', 'model', 'other_manufacture', 'unit']
months_of_service duplicate with total_number_of_months_of_service, dropping months_of_service
lease_or_non_purchase_agreement duplicat

In [7]:
# you can save it as csv or pickle it if you don't want to keep reo-doing preprocessing
training_data_raw.to_csv('../data/training_data_raw_qa.csv', index=False)

### variables I kept purposefully, could use more research/exploration for how or if to use!! but dropping for the demo

In [8]:
string_vars = ['service_provider_name', 'narrative', 'funding_request_nickname', 'contact_email' ]
training_data_raw.drop(string_vars, axis=1, inplace=True)
print("Final shape training_data_raw: ", training_data_raw.shape)

('Final shape training_data_raw: ', (47165, 106))


## 2. Modeling

### 'Model' class: under the hood
<img src="http://scikit-learn.org/stable/_static/scikit-learn-logo-small.png" alt="Logo">

Built on top of the `sklearn` package, a popular Python Machine Learning package (_not_ particularly geared toward Deep Learning, but a popular first start for regression, tree and ensemble methods).
Initializes a model to fit to given training data and y variable, and fit a model using  GridSearch and/or RandomizedSearch cross validation. 

Input attributes: 
* **training_data**: the training dataset without the (clean) y variable
* **yvar**: column name of y variable to predict
* **model**: the type of model algorithm to fit. Options are one of: ['logisticregression', 'decisiontree', 'randomforest', 'gradientboosting'], but can add more under the hood if desired
* **classification_type**: 'classification' or 'regression'
* **imputer_strategy**: how to handle missing values – must be an array. 
* **p**: proportion of the training dataset to set for training; number between 0 and 1 (optional, default = 0.8)

In [10]:
## Initialize Model object
model_initial = Model(training_data = training_data_raw, 
                      yvar = 'purpose', 
                      model = 'randomforest', 
                      classification_type = 'classification',
                      imputer_strategy = ["mean","median","most_frequent"])

#### build_pipe()
* Input: arrays of hyperparameters to test. For info on where to start, see our [wiki pages](https://educationsuperhighway.atlassian.net/wiki/spaces/SA/pages/541392967/Tuning+ML+Parameters). Must use syntax according to `sklearn` documentation. 
* Splits the data into training set and test (validation) set, and sets up the model pipeline in the format `sklearn` accepts

_Further detail below:_
* `training_setup()`: Obtain and merge the variable to predict, and encode the variable (necessary for classification). In Machine Cleaning case, our y variable came from a different database, hence the need for the two functions below. Could be simplified for other projects.
    * `get_clean_y_query()`: query written to obtain the variable to predict (design varies by project)
    * `merge_y_variable()`: execute the query and merge the y variable to the training data
* `get_train_test_split()`:
Splits the training data by a training proportion **p** into a training set and testing set at random (default 80% train, 20% test)

In [11]:
## SET ARRAYS OF INITIAL HYPERPARAMATER VALUES TO TRY
## number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=500, num=3)]
print "Number of Estimators values to try: " + str(n_estimators)
## number of features to consider at each split
max_features = ['auto', 'sqrt']
## maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(start=10, stop=50, num=3)]
max_depth.append(None)
print "Maximum Depth values to try: " + str(max_depth)
## minimum number of samples required to split a node
min_samples_split = [2,5,10]
## minimum number of samples required at each leaf node
min_samples_leaf = [1,2,4]
## method of selecting samples for training each tree
bootstrap = [True, False]

Number of Estimators values to try: [200, 350, 500]
Maximum Depth values to try: [10, 30, 50, None]


In [12]:
start = time.time()

## Pipeline setup
model_initial = model_initial.build_pipe(n_estimators=n_estimators, 
                                         max_depth=max_depth, 
                                         class_weight=['balanced'], 
                                         max_features=max_features, 
                                         n_jobs=[-1], oob_score=[False], 
                                         criterion=['gini', 'entropy'], 
                                         min_samples_split=min_samples_split, 
                                         min_samples_leaf=min_samples_leaf, 
                                         bootstrap=bootstrap)

end = time.time()
print("\n")
print_time(start, end)

Querying data from DB connection
Trying to establish initial connection to the server
Success!
Finished querying data
Category counts for purpose(pre-encoding):
[['backbone' 185]
 ['internet' 11215]
 ['isp' 1204]
 ['upstream' 3195]
 ['wan' 8459]]


Time passed: 0.0hour:0.0min:3.0sec


#### randomized_fit() and fit()
Inputs below. Fit all the models to the full training set using cross-validation, and output the one with the best set of hyperparameters. 
* **n_jobs**: number of jobs (roughly: cores to allocate)
* **verbose**: printing detail level
* **scoring**: metric to optimize for (typically 'accuracy')
* **n_iter**: number of iterations (`randomized_fit()` only)

### Randomized Fit for testing out initial hyperparameters

In [13]:
start = time.time()
print("*** STARTING RANDOMIZED SEARCH ***")
## Find the best model parameters
## JUST FOR DEMO PURPOSES - USUALLY 50+ ITERATIONS ARE RECOMMENDED
model_initial.randomized_fit(n_jobs=4, verbose=3, scoring= 'accuracy', n_iter=50)

end = time.time()
print("\n")
print_time(start, end)

*** STARTING RANDOMIZED SEARCH ***
Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  8.9min
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed: 10.8min finished


Finished fit for all iterations.
BEST PARAMETERS: {'estimator__min_samples_leaf': 1, 'estimator__oob_score': False, 'estimator__criterion': 'gini', 'imputer__strategy': 'most_frequent', 'estimator__min_samples_split': 5, 'estimator__max_features': 'auto', 'estimator__class_weight': 'balanced', 'estimator__bootstrap': False, 'estimator__max_depth': None, 'estimator__n_estimators': 350, 'estimator__n_jobs': -1}


Time passed: 0.0hour:10.0min:54.0sec


### Narrowed down model

In [17]:
start = time.time()
model = Model(training_data_raw, 'purpose', 'randomforest', 'classification', imputer_strategy=['most_frequent'])
model = model.build_pipe(n_estimators = [350], 
                         criterion = ['gini'], 
                         min_samples_leaf=[1], 
                         min_samples_split =[5], 
                         max_depth = [30], 
                         class_weight = ['balanced'], 
                         max_features = ['auto'],
                         n_jobs = [-1], 
                         oob_score = [False],
                         bootstrap = [False])

end = time.time()
print("\n")
print_time(start, end)

Querying data from DB connection
Trying to establish initial connection to the server
Success!
Finished querying data
Category counts for purpose(pre-encoding):
[['backbone' 185]
 ['internet' 11215]
 ['isp' 1204]
 ['upstream' 3195]
 ['wan' 8459]]


Time passed: 0.0hour:0.0min:3.0sec


In [18]:
print("*** STARTING GRIDSEARCH ***")
## Find the best model parameters
model.fit()

*** STARTING GRIDSEARCH ***
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] estimator__min_samples_leaf=1, estimator__oob_score=False, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=5, estimator__max_features=auto, imputer__strategy=most_frequent, estimator__bootstrap=False, estimator__max_depth=30, estimator__n_estimators=350, estimator__n_jobs=-1 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  estimator__min_samples_leaf=1, estimator__oob_score=False, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=5, estimator__max_features=auto, imputer__strategy=most_frequent, estimator__bootstrap=False, estimator__max_depth=30, estimator__n_estimators=350, estimator__n_jobs=-1, score=0.921638330757, total=   6.0s
[CV] estimator__min_samples_leaf=1, estimator__oob_score=False, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=5, estimator__max_features=auto, imputer__strategy=most_frequent, estimator__bootstrap=False, estimator__max_depth=30, estimator__n_estimators=350, estimator__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.6s remaining:    0.0s


[CV]  estimator__min_samples_leaf=1, estimator__oob_score=False, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=5, estimator__max_features=auto, imputer__strategy=most_frequent, estimator__bootstrap=False, estimator__max_depth=30, estimator__n_estimators=350, estimator__n_jobs=-1, score=0.924884080371, total=   5.9s
[CV] estimator__min_samples_leaf=1, estimator__oob_score=False, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=5, estimator__max_features=auto, imputer__strategy=most_frequent, estimator__bootstrap=False, estimator__max_depth=30, estimator__n_estimators=350, estimator__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.0s remaining:    0.0s


[CV]  estimator__min_samples_leaf=1, estimator__oob_score=False, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=5, estimator__max_features=auto, imputer__strategy=most_frequent, estimator__bootstrap=False, estimator__max_depth=30, estimator__n_estimators=350, estimator__n_jobs=-1, score=0.9189607176, total=   6.0s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   19.6s finished


                                              Feature  Importance
0                        purpose_data_connect_2ormore    0.090380
1                     purpose_ias_includes_connection    0.087798
2                            purpose_data_connect_hub    0.083043
3                                    purpose_backbone    0.077329
4                              purpose_ias_no_circuit    0.070189
5                                   billed_consortium    0.048355
6         connected_directly_to_school_library_or_nif    0.037990
7                          total_pre_discount_charges    0.037272
8               monthly_recurring_unit_eligible_costs    0.036854
9                              billed_school_district    0.033941
10             total_monthly_eligible_recurring_costs    0.032313
11          connection_supports_school_library_or_nif    0.032140
12                                download_speed_mbps    0.032065
13                                  upload_speed_mbps    0.031498
14        

## 3. Feature Elimination

### 'ModelOptimizer' class: under the hood
* Also built on top of the `sklearn` package. 
* Serves as a starting point for eliminating features to reduce overfitting, since we started with ~300 features with limited intuition for which ones to select.
* Helps with (small) incremental gains in _testing set score_. Good to choose a threshold right before the accuracy/score starts dropping more dramatically.
* There are many complex methods and packages for feature elimination, but we wanted to start with the most rudimentary one possible.
* Modeled after [this post.](https://machinelearningmastery.com/feature-importance-and-feature-selection-with-xgboost-in-python/) 

Input attributes: 
* **model_fit_obj**: An instance of the `Model` class _that has been fit_, mandatory.
* **strategy**: Feature elimination testing strategy, mandatory. Options are one of: 'importance','manual','both'
* **drop_features**: A list of column names of feature(s) to test elimination of, optional.
* **threshold**: A decimal, optional.

### optimize() pseudocode:
<img src="strategy_desc.png">


In [19]:
# By importance (all) for sake of demo; it's what we did the majority of the time
start = time.time()
model_optimizer = ModelOptimizer(model, 'importance')
print("*** STARTING FEATURE ELIMINATION/OPTIMIZATION ***")
model_optimizer.optimize()

end = time.time()
print("\n")
print_time(start, end)

*** STARTING FEATURE ELIMINATION/OPTIMIZATION ***
Thresh=0.00000730, n=98, Accuracy: 92.48%, Precision: 92.47%
Thresh=0.00001196, n=97, Accuracy: 92.31%, Precision: 92.31%
Thresh=0.00005288, n=96, Accuracy: 92.31%, Precision: 92.31%
Thresh=0.00007868, n=95, Accuracy: 92.37%, Precision: 92.37%
Thresh=0.00008460, n=94, Accuracy: 92.35%, Precision: 92.35%
Thresh=0.00010081, n=93, Accuracy: 92.37%, Precision: 92.37%
Thresh=0.00010612, n=92, Accuracy: 92.39%, Precision: 92.39%
Thresh=0.00011919, n=91, Accuracy: 92.35%, Precision: 92.34%
Thresh=0.00012469, n=90, Accuracy: 92.35%, Precision: 92.36%
Thresh=0.00012920, n=89, Accuracy: 92.27%, Precision: 92.27%
Thresh=0.00013587, n=88, Accuracy: 92.35%, Precision: 92.34%
Thresh=0.00014311, n=87, Accuracy: 92.39%, Precision: 92.40%
Thresh=0.00015842, n=86, Accuracy: 92.42%, Precision: 92.42%
Thresh=0.00016798, n=85, Accuracy: 92.35%, Precision: 92.36%
Thresh=0.00018131, n=84, Accuracy: 92.35%, Precision: 92.36%
Thresh=0.00022192, n=83, Accuracy: 

### Apply final importance threshold

In [20]:
mo_final = ModelOptimizer(model, 'importance', threshold=0.00115)
mo_final.optimize()

Thresh=0.00115000, n=54, Accuracy: 92.44%, Precision: 92.46%
('Maximum Accuracy: ', '0.924361088211047')


## 4. Finally, output results of final model
**`output_results()`** function would be unique to your project: 
* extracts elements from a fit and/or optimized `Model` or `ModelOptimizer` object and outputs them to a database 
* stores final model object and features list (via `cPickle`).

In [21]:
output_results(mo_final, mo_final.getfeatures(),'Sierra',comment='rerun demo model')

Querying data from DB connection
Trying to establish initial connection to the server
Success!
Finished querying data
Results inserted
All results inserted


In [None]:
# stopped here

In [19]:
mo_final.getfeatures()

Index([u'wide_area_network_text_narrative', u'was_fcc_form470_posted',
       u'wan_text_narrative', u'wan_text_funding_request_nickname',
       u'usac_school_district', u'usac_consortium',
       u'transport_text_narrative', u'transport_text_funding_request_nickname',
       u'transmission_text_narrative', u'total_remaining_contract_length',
       ...
       u'connected_directly_to_school_library_or_nif', u'connect_lit_fiber',
       u'connect_isp_only', u'connect_ethernet', u'connect_dark_fiber',
       u'basic_firewall_protection', u'based_on_state_master_contract',
       u'bandwidth_in_mbps', u'backbone_text_narrative', u'app_new_sp'],
      dtype='object', length=120)

In [None]:
mo_final.bestmodel