## This notebook is part of the Georgetown University Data Science Project - Team Ship Happen


## The purpose of this notebook is model selection and evaluation

### Import required libraries

In [1]:
%matplotlib inline


import time

import numpy as np
import matplotlib.cm as cm

# Standard Python libraries
import os                                    # For accessing operating system functionalities
import json                                  # For encoding and decoding JSON data
import pickle                                # For serializing and de-serializing Python objects

# Libraries that can be pip installed
import requests                              # Simple Python library for HTTP
import pandas as pd                          # Library for building dataframes similar to those in R
import seaborn as sns                        # Statistical visualization library based on Matplotlib
import matplotlib.pyplot as plt  
from sklearn.datasets.base import Bunch

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, auc, roc_curve, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
#from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import LinearSVC, NuSVC, SVC

from sklearn import metrics
from sklearn import cross_validation
from sklearn.cross_validation import KFold




### Create Bunch 


In [2]:

# Absolute path of the project folder, looked up one level above the working directory
DATA_DIR = os.path.abspath(os.path.join(".", "..", "Georgetown_Data_Science_Project"))

# Print every entry in the project folder except dot-prefixed ones
for name in os.listdir(DATA_DIR):
    if not name.startswith("."):
        print("- {}".format(name))

- data
- feature_selection.ipynb
- incident_knn-classifier.pickle
- incident_random-forest-classifier.pickle
- ingetion_wranling.ipynb
- meta_incident.json
- model evaluation.txt
- model_selection.ipynb
- model_selection_categorical.ipynb
- mvinjury.txt
- mvinjury_data.txt
- mvinjury_data_final.txt
- predicton comparision.xlsx
- ReadMe.md
- results_user_input_data_random-forest-classifier.txt
- user_input_data.txt


In [3]:
def load_data(root=DATA_DIR):
    """
    Assemble the Misle incident dataset into a `Bunch`.

    Three files are read from `root`: the JSON metadata (which supplies the
    target and feature names), the README (used verbatim as the description),
    and the numeric data file, whose last column is the target and whose
    remaining columns are the features.

    Returns a `Bunch` with `data`, `target`, `filenames`, `target_names`,
    `feature_names` and `DESCR` attributes.
    """
    # Every file the bunch is built from, keyed by role
    filenames = {
        'meta': os.path.join(root, 'meta_incident.json'),
        'rdme': os.path.join(root, 'ReadMe.md'),
        'data': os.path.join(root, 'mvinjury_data_final.txt'),
    }

    # Target and feature names come from the meta json
    with open(filenames['meta'], 'r') as meta_file:
        meta = json.load(meta_file)
    target_names = meta['target_names']
    feature_names = meta['feature_names']

    # The README doubles as the dataset description
    with open(filenames['rdme'], 'r') as readme_file:
        DESCR = readme_file.read()

    # Numeric table: every column but the last is a feature, the last is the target
    table = np.loadtxt(filenames['data'])

    return Bunch(
        data=table[:, 0:-1],
        target=table[:, -1],
        filenames=filenames,
        target_names=target_names,
        feature_names=feature_names,
        DESCR=DESCR
    )

# Load the bunch once at notebook level so the cells below can reuse it.
mydataset = load_data()

# Show the dimensions of the feature matrix, then of the target vector.
print("{}\n{}".format(mydataset.data.shape, mydataset.target.shape))

(260364, 6)
(260364,)


In [5]:
def fit_and_evaluate(dataset, model, label, **kwargs):
    """
    Cross-validate an estimator on `dataset`, print the mean scores, then
    refit on all of the data and pickle the fitted estimator.

    Parameters
    ----------
    dataset : object with `data` and `target` attributes
        `data` is the feature matrix and `target` the label vector; both are
        indexed with the fold index arrays produced by `KFold`.
    model : estimator class
        The class itself, not an instance; a fresh estimator is built with
        `model(**kwargs)` for every fold and once more for the final fit.
    label : str
        Human-readable name used in the printed report. The output filename
        is derived from it: lower-cased, spaces replaced by hyphens, with
        ".pickle" appended.
    **kwargs
        Forwarded unchanged to the `model` constructor.

    Side effects
    ------------
    Prints the timing and the mean precision/recall/accuracy/f1 over the
    folds, and writes the final estimator as a pickle file at a path relative
    to the current working directory.
    """
    start  = time.time() # Start the clock!
    scores = {'precision':[], 'recall':[], 'accuracy':[], 'f1':[]}

    # Work on the dataset that was passed in, not on a notebook-level global,
    # so the function evaluates whatever the caller actually hands it.
    X, y = dataset.data, dataset.target

    # NOTE(review): shuffle=True without a random_state means the folds (and
    # therefore the reported scores) differ from run to run.
    for train, test in KFold(X.shape[0], n_folds=12, shuffle=True):
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]

        # A new estimator per fold so that no fitted state leaks between folds
        estimator = model(**kwargs)
        estimator.fit(X_train, y_train)

        expected  = y_test
        predicted = estimator.predict(X_test)

        # Append our scores to the tracker
        scores['precision'].append(metrics.precision_score(expected, predicted, average="weighted"))
        scores['recall'].append(metrics.recall_score(expected, predicted, average="weighted"))
        scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
        scores['f1'].append(metrics.f1_score(expected, predicted, average="weighted"))

    # Report
    print("Build and Validation of {} took {:0.3f} seconds".format(label, time.time()-start))
    print("Validation scores are as follows:\n")
    print(pd.DataFrame(scores).mean())

    # Write official estimator to disk: refit on every sample, not just a training fold
    estimator = model(**kwargs)
    estimator.fit(X, y)

    outpath = label.lower().replace(" ", "-") + ".pickle"
    with open(outpath, 'wb') as f:
        pickle.dump(estimator, f)

    print("\nFitted model written to:\n{}".format(os.path.abspath(outpath)))

In [None]:
# Support vector classifier, built with no extra constructor arguments
fit_and_evaluate(dataset=mydataset, model=SVC, label="Incident_SVM Classifier")

In [6]:
# k-nearest-neighbours classifier using 12 neighbours
fit_and_evaluate(
    dataset=mydataset,
    model=KNeighborsClassifier,
    label="Incident_KNN Classifier",
    n_neighbors=12
)

Build and Validation of Incident_KNN Classifier took 24.121 seconds
Validation scores are as follows:

accuracy     0.984910
f1           0.977680
precision    0.972103
recall       0.984910
dtype: float64

Fitted model written to:
C:\project\Georgetown_Data_Science_Project\incident_knn-classifier.pickle


In [7]:
# Random forest classifier, built with no extra constructor arguments
fit_and_evaluate(
    dataset=mydataset,
    model=RandomForestClassifier,
    label="Incident_Random Forest Classifier"
)

Build and Validation of Incident_Random Forest Classifier took 42.976 seconds
Validation scores are as follows:

accuracy     0.981422
f1           0.977043
precision    0.973395
recall       0.981422
dtype: float64

Fitted model written to:
C:\project\Georgetown_Data_Science_Project\incident_random-forest-classifier.pickle


In [8]:
#Take user data from text file to predict accident (yes/no), using random forest classifier model
import csv

def load_model(path='incident_random-forest-classifier.pickle'):
    """
    Read a pickled object from `path` and return it.

    The default path is the random forest pickle; only unpickle files that
    come from a trusted source.
    """
    with open(path, 'rb') as handle:
        unpickled = pickle.load(handle)
    return unpickled

model = load_model()

# Create a reader for the tab-separated input file and a writer for the CSV output
# NOTE(review): the csv module has interpreter-specific advice on how files
# should be opened (binary mode on Python 2, newline='' on Python 3) - confirm
# which interpreter runs this notebook before changing the open() calls.
with open('user_input_data.txt', 'r') as fin:
    reader = csv.reader(fin, delimiter='\t')

    with open('results_user_input_data_random-forest-classifier.txt', 'w') as fout:
        writer = csv.writer(fout)

        # Go through all your data and run the predictions, writing to the results
        for idx, row in enumerate(reader):
            # csv.reader yields an empty list for a blank line; there is
            # nothing to predict on, so skip it (idx still counts input lines).
            if not row:
                continue

            # The reader produces strings; convert explicitly so a non-numeric
            # field fails here with a clear error instead of inside the model.
            features = [float(value) for value in row]

            # predict() returns one label per input row as an array; take the
            # single label so the file holds a plain value, not an array repr.
            accident = model.predict([features])[0]
            writer.writerow([idx+1, row[0], accident])