In [2]:
import csv
import pandas as pd
import seaborn as sns

df = pd.read_csv('spell_clean.csv')

In [3]:
good_val = 'GOOD'
bad_val = 'BAD'
unknown_val = 'UNKNOWN'
exit_mapping = {
    'XRF':good_val, 'XLC':good_val, 'XRL':good_val, 'XCA':good_val,
    'XOT':bad_val, 'XOP':bad_val, 'XRM':bad_val, 'XRY':bad_val, 'XJP':bad_val,
    'ZTC':unknown_val, 'XUK':unknown_val
}
outcomes = [exit_mapping[x] for x in df.EXIT if x in exit_mapping]
sns.countplot(outcomes)
df['outcome'] = outcomes

In [4]:
hispanic_dummies = pd.get_dummies(df.HISPANIC)

In [5]:
df = pd.concat([df,hispanic_dummies], axis = 1)

In [6]:
X = df[['STARTAGE', 'SPELLAGE', 'N', 'U', 'Y', 'NPLACES', 'EXITAGE']]

In [7]:
y = df.outcome

In [13]:
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [14]:
def run_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print 'Accuracy', accuracy_score(y_test, y_pred)
    print 'Cross Val Score', cross_val_score(model, X, y)
    print 'Confusion Matrix','\n',  confusion_matrix(y_test, y_pred)
    print 'Classification Report','\n', classification_report(y_test, y_pred)

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

rf = RandomForestClassifier()
run_model(rf)

Accuracy 0.713215442765
Cross Val Score [ 0.70392306  0.66828651  0.72956007]
Confusion Matrix 
[[2653 2194   38]
 [1424 7904   43]
 [ 167  383   10]]
Classification Report 
             precision    recall  f1-score   support

        BAD       0.63      0.54      0.58      4885
       GOOD       0.75      0.84      0.80      9371
    UNKNOWN       0.11      0.02      0.03       560

avg / total       0.69      0.71      0.70     14816



In [11]:
rf.feature_importances_

array([ 0.21205316,  0.23874658,  0.01015813,  0.00520434,  0.00148331,
        0.11857278,  0.4137817 ])

In [16]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(rf)
run_model(ada)

Accuracy 0.714565334773
Cross Val Score [ 0.70210073  0.66707163  0.72485192]
Confusion Matrix 
[[2659 2189   37]
 [1406 7915   50]
 [ 162  385   13]]
Classification Report 
             precision    recall  f1-score   support

        BAD       0.63      0.54      0.58      4885
       GOOD       0.75      0.84      0.80      9371
    UNKNOWN       0.13      0.02      0.04       560

avg / total       0.69      0.71      0.70     14816



In [17]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()
run_model(gbc)

Accuracy 0.728334233261
Cross Val Score [ 0.72877752  0.68230828  0.74925328]
Confusion Matrix 
[[2754 2130    1]
 [1334 8036    1]
 [ 163  396    1]]
Classification Report 
             precision    recall  f1-score   support

        BAD       0.65      0.56      0.60      4885
       GOOD       0.76      0.86      0.81      9371
    UNKNOWN       0.33      0.00      0.00       560

avg / total       0.71      0.73      0.71     14816



In [18]:
from sklearn.ensemble import ExtraTreesClassifier

ext = ExtraTreesClassifier()
run_model(ext)

Accuracy 0.711528077754
Cross Val Score [ 0.7027588   0.66281954  0.72996507]
Confusion Matrix 
[[2721 2137   27]
 [1518 7812   41]
 [ 176  375    9]]
Classification Report 
             precision    recall  f1-score   support

        BAD       0.62      0.56      0.59      4885
       GOOD       0.76      0.83      0.79      9371
    UNKNOWN       0.12      0.02      0.03       560

avg / total       0.69      0.71      0.70     14816

