# Using symptoms for prediction

In [1]:


import pandas as pd
import numpy as np
import sklearn
import re
import string
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction import text
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.externals import joblib
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
# import service request csv
srdf = pd.read_csv("analytics/service_requests_Train-test_set.csv")

## Preprocess Text Functions

In [3]:
# clean text

class Preprocess:
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.remove_list = stopwords.words('english')
        #self.remove_list.remove('no')
        self.remove_list += ['please', 'thank', 'pacific', 'canada',
        'none', 'florida', 'cst', 'tue', 'mon', 'wed', 'thu', 'fri', 'sat', 'monday', 'tuesday', 'wednesday', 'thursday','friday', 'saturday', 'sunday',
        'sun', 'cd', 'gmt', 'hw', 'sw', 'sg', 'error', 'still', 'need', 'call', 'service','customer', 'yesterday', 'today', 'year', 'yet', 'now', 'okay', 'spoke', 'spoken', 'no', 'benjamin', 'matthew', 'susan','jean', 'jason', 'hanelle', 'jan', 'feb', 'march', 'april', 'may','june', 'july', 'august', 'september', 'october', 'november', 'december']
        self.spelling_map = {
            'gantri': 'gantry',
            'gantree': 'gantry',
            'ystem': 'system',
            'patietn': 'patient',
            'patint': 'patient',
            'dispacth': 'dispatch'
        }

    def process_text(self, text):
        comp = re.compile(r'Image:[\S]*')
        text = comp.sub('', text)
        terms = self.tokenize(self.clean_text(text).lower().strip())
        clean_terms = [self.spelling_map.get(term) or term for term in terms if term not in self.remove_list and len(term) > 1]
        stemmed_tokens = list(self.porter_stem(clean_terms))
        # remove weird punctuation and numbers and images
        return ' '.join(stemmed_tokens)

    def clean_text(self, text):
        final_text = ''
        for i in text:
            if i == ' ':
                final_text += i
            elif not i.isalpha():
                final_text += ' '
            else:
                final_text += i
        return final_text

    def tokenize(self, text):
        terms = []
        # tokenize sentences before words
        sentences = nltk.sent_tokenize(text)
        for sent in sentences:
            terms += nltk.word_tokenize(sent)
        return terms

    def porter_stem(self, terms):
        for term in terms:
            yield self.stemmer.stem(term)


## Clean text

In [4]:
preprocess = Preprocess()
clean_srdf = srdf
clean_srdf['symptom'] = srdf['symptom'].apply(preprocess.process_text)


In [5]:
X = clean_srdf.symptom
y = clean_srdf.Resolution_Code
## Use int as labels for Y (Eg turn Perform_software_config into 31)
# y = clean_srdf.Resolution_Code.astype('category').cat.codes

# split X and y into training and testing sets
# from sklearn.model_selection import train_test_split
## Stratify to account for imbalance in dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, stratify=y)

In [6]:
tuned_parameters = [{'loss': ['modified_huber', 'squared_hinge'], 'penalty': ['l2'],
                     'alpha': [1e-3, 1e-4, 1e-5], 'n_iter': [5], 'learning_rate' : ['constant', 'invscaling'], 'eta0': [0.5, 0.1, 0.05], 'average' : [True, 10, 50, 100]}] 

## Function to find the top N accuracy 

In [7]:
def top_n_accuracy(preds, truths, n, model):
    best_n = np.argsort(preds, axis=1)[:,-n:]
    successes = 0
    for i in range(truths.shape[0]):
        if truths.iloc[i] in model.classes_[best_n[i,:]]:
            successes += 1
    return float(successes)/truths.shape[0]

## ML Model (solely using symptoms)

In [44]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import neighbors
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix
## Try out cross validation 
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect):
    
    # create document-term matrices using the vectorizer
#     X_train_dtm = vect.fit_transform(X_train)
#     X_test_dtm = vect.transform(X_test)
    
    # print the number of features that were generated
#     print('Features: ', X_train_dtm.shape[1])
    
#     Multinomial Naive Bayes (43 percent accuracy)
#     nb = MultinomialNB(alpha=5)
#     nb.fit(X_train_dtm, y_train)
#     y_pred_class = nb.predict(X_test_dtm)
    
# #     use Random Forest (44 percent accuracy)
#     rf = RandomForestClassifier(max_depth=65, n_estimators = 200, random_state=0)
#     rf.fit(X_train_dtm, y_train)
#     y_pred_class = rf.predict(X_test_dtm)

# #     XG Boost (43 percent accuracy)
#     xgb = XGBClassifier()
#     xgb.fit(X_train_dtm, y_train)
#     y_pred_class = xgb.predict(X_test_dtm)

## Used gridsearch to find best parameters for each ml model 
    
#     clf = GridSearchCV(SGDClassifier(), tuned_parameters, cv=5, scoring='accuracy', verbose=10)
#     clf.fit(X_train_dtm, y_train)
#     print(clf.best_params_)
#     y_pred_class = clf.predict(X_test_dtm)
#     print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
    
    
    
    # Cross Validation
    skf = StratifiedKFold(n_splits=5, random_state=None)
    # X is the feature set and y is the target
    for train_index, val_index in skf.split(X,y): 
        print("Train:", train_index, "Validation:", val_index)
        X_train, X_test = X[train_index], X[val_index] 
        y_train, y_test = y[train_index], y[val_index]
        X_train_dtm = vect.fit_transform(X_train)
        ## merge with error codes
        print(X_train_dtm)
        X_test_dtm = vect.transform(X_test)
        
        rf = RandomForestClassifier(max_depth=65, n_estimators = 200, random_state=1)
#         rf.fit(X_train_dtm, y_train)
#         y_pred_class = rf.predict_proba(X_test_dtm)
        
# #         nb = MultinomialNB(alpha=5)
# #         nb.fit(X_train_dtm, y_train)
# #         y_pred_class = nb.predict(X_test_dtm)

# #         xgb = XGBClassifier(eta=0.1)
# #         xgb.fit(X_train_dtm, y_train)
# #         y_pred_class = xgb.predict(X_test_dtm)
        sgd = SGDClassifier(alpha=0.001, average=100, learning_rate='optimal', loss='modified_huber', n_iter=5, penalty='l2')
#         sgd.fit(X_train_dtm, y_train)
#         y_pred_class = sgd.predict_proba(X_test_dtm)
    
#     Ensemble Learning
        ensemble = VotingClassifier(estimators=[('rf', rf), ('sgd', sgd)], voting='soft')
        ensemble.fit(X_train_dtm, y_train)
        y_pred_class = ensemble.predict(X_test_dtm)
        print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
        
        ## Use this if you want to get top N accuracy
        y_pred_class = ensemble.predict_proba(X_test_dtm)
        print('Top 3 Accuracy: ', top_n_accuracy(y_pred_class, y_test, 3, ensemble))
        print('Top 5 Accuracy: ', top_n_accuracy(y_pred_class, y_test, 5, ensemble))

    return ensemble

# tune TfidfVectorizer for better results
# vect = TfidfVectorizer(norm=None, max_features=3000, min_df=0.00008, ngram_range=(1,2), sublinear_tf=True, binary = True)
# vect = TfidfVectorizer(max_features=40000,
#                              min_df=5, 
#                              max_df=0.5, 
#                              analyzer='word')
# vect = TfidfVectorizer(analyzer='word')

# alpha=0.001, average=100, eta0=0.5, learning_rate=invscaling, loss=modified_huber, n_iter=5, penalty=l2 

vect = TfidfVectorizer(analyzer='word', min_df = 5, max_df = 0.5)

ensemble = tokenize_test(vect)



Train: [ 239  400  403 ... 4062 4063 4064] Validation: [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29   30   31   32   33   34   35   36   37   38   39   40   41
   42   43   44   45   46   47   48   49   50   51   52   53   54   55
   56   57   58   59   60   61   62   63   64   65   66   67   68   69
   70   71   72   73   74   75   76   77   78   79   80   81   82   83
   84   85   86   87   88   89   90   91   92   93   94   95   96   97
   98   99  100  101  102  103  104  105  106  107  108  109  110  111
  112  113  114  115  116  117  118  119  120  121  122  123  124  125
  126  127  128  129  130  131  132  133  134  135  136  137  138  139
  140  141  142  143  144  145  146  147  148  149  150  151  152  153
  154  155  156  157  158  159  160  161  162  163  164  165  166  167
  168  169  170  171  172  173  174  175  176  177  178  179  180  181
  182  183  184  185  

  if diff:


Accuracy:  0.49406175771971494
Top 3 Accuracy:  0.6888361045130641
Top 5 Accuracy:  0.7612826603325415
Train: [   0    1    2 ... 4062 4063 4064] Validation: [ 239  400  403  404  405  412  428  434  454  484  490  493  517  518
  525  548  549  550  559  581  585  587  588  598  601  602  603  607
  609  612  621  625  631  634  642  643  650  663  667  669  672  674
  678  680  683  687  690  691  694  702  706  709  717  720  722  726
  728  729  731  733  734  739  745  747  749  750  755  756  759  760
  761  764  768  770  771  772  774  778  782  785  786  788  789  791
  799  806  813  814  815  816  817  818  820  821  822  825  827  828
  829  833  834  835  836  838  842  846  848  849  851  852  855  856
  858  859  861  862  864  865  866  867  869  871  872  873  874  876
  877  878  879  880  881  882  883  884  886  888  889  890  891  896
  897  898  900  901  902  905  909  910  911  912  913  914  915  916
  917  918  920  921  922  923  924  925  928  930  931  932 

  if diff:


Accuracy:  0.528415961305925
Top 3 Accuracy:  0.7291414752116082
Top 5 Accuracy:  0.7859733978234583
Train: [   0    1    2 ... 4062 4063 4064] Validation: [ 657  832  934  984  986 1030 1044 1116 1139 1141 1143 1144 1161 1165
 1174 1176 1177 1187 1188 1196 1208 1217 1218 1219 1222 1232 1234 1239
 1243 1247 1272 1279 1282 1314 1327 1331 1335 1340 1348 1356 1358 1363
 1369 1379 1388 1389 1403 1410 1415 1427 1444 1446 1452 1454 1456 1457
 1462 1470 1471 1473 1477 1478 1480 1492 1499 1504 1505 1506 1526 1532
 1535 1544 1550 1551 1558 1561 1567 1571 1575 1579 1581 1582 1583 1585
 1587 1588 1590 1592 1598 1602 1606 1607 1608 1609 1610 1613 1615 1616
 1617 1618 1622 1623 1627 1635 1636 1637 1644 1645 1647 1648 1650 1652
 1653 1654 1656 1657 1658 1659 1661 1665 1667 1668 1669 1670 1672 1675
 1677 1678 1684 1686 1687 1688 1690 1692 1695 1696 1697 1698 1699 1700
 1701 1702 1703 1704 1706 1707 1708 1709 1710 1713 1714 1716 1717 1719
 1720 1722 1723 1724 1725 1726 1727 1728 1729 1730 1733 1734 17

  if diff:


Accuracy:  0.5049261083743842
Top 3 Accuracy:  0.6945812807881774
Top 5 Accuracy:  0.7783251231527094
Train: [   0    1    2 ... 4062 4063 4064] Validation: [ 967 1190 1252 1640 1680 1715 1752 1755 1779 1807 1863 1877 1879 1889
 1903 1919 1924 1935 1944 2009 2050 2060 2067 2077 2080 2091 2104 2119
 2127 2132 2134 2137 2144 2154 2158 2172 2186 2193 2203 2210 2216 2218
 2223 2233 2235 2263 2267 2272 2274 2278 2286 2289 2300 2310 2311 2315
 2319 2320 2328 2339 2342 2345 2349 2356 2357 2367 2368 2383 2386 2389
 2391 2392 2400 2403 2404 2410 2419 2421 2424 2426 2427 2428 2430 2432
 2433 2434 2436 2438 2439 2442 2443 2447 2448 2449 2450 2452 2454 2455
 2457 2460 2461 2463 2464 2465 2467 2468 2469 2470 2472 2473 2475 2477
 2478 2481 2483 2484 2485 2487 2488 2489 2490 2492 2493 2494 2496 2498
 2501 2504 2505 2506 2507 2508 2509 2510 2511 2512 2514 2515 2516 2518
 2519 2521 2525 2526 2528 2529 2530 2531 2532 2533 2534 2536 2538 2540
 2542 2543 2545 2546 2548 2549 2551 2552 2553 2556 2558 2559 2

  if diff:


Accuracy:  0.525062656641604
Top 3 Accuracy:  0.7130325814536341
Top 5 Accuracy:  0.7957393483709273
Train: [   0    1    2 ... 3657 3766 3915] Validation: [1801 1928 2480 2495 2524 2535 2547 2555 2594 2656 2746 2817 2826 2902
 2910 2918 2919 2925 2939 2964 2965 2967 2971 2980 3023 3033 3040 3046
 3048 3052 3062 3079 3093 3095 3107 3108 3112 3115 3121 3125 3128 3130
 3132 3134 3138 3140 3160 3161 3165 3166 3168 3172 3174 3176 3184 3186
 3188 3192 3194 3195 3213 3214 3215 3220 3222 3224 3229 3242 3244 3246
 3257 3258 3260 3261 3263 3265 3271 3273 3276 3279 3284 3286 3289 3292
 3295 3296 3301 3303 3304 3306 3307 3312 3313 3314 3317 3320 3325 3326
 3327 3328 3329 3330 3331 3333 3334 3335 3336 3337 3339 3340 3341 3342
 3343 3345 3346 3348 3349 3351 3352 3354 3357 3358 3359 3360 3361 3362
 3364 3365 3366 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378
 3379 3381 3382 3383 3385 3386 3387 3388 3389 3390 3392 3393 3394 3395
 3396 3397 3399 3400 3401 3402 3405 3406 3407 3408 3409 3410 34

  if diff:


Accuracy:  0.5203562340966921
Top 3 Accuracy:  0.7175572519083969
Top 5 Accuracy:  0.7964376590330788


In [58]:
import json

# with open('headers.txt', 'w') as file:
#     file.write(json.dumps(list(vect.get_feature_names())))
    
# with open('headers.txt', 'r') as file:
#     headers = eval(file.read())
    
headers = list(vect.get_feature_names())

# Test sample input

In [100]:
model_input = {
    'sr_id': '123456',
    'error_codes': [],
    'symptom': 'I think there is some problem with the clock.'
}

tmp_dict = {}
for header in headers:
    if isinstance(header, str) and header in preprocess.process_text(model_input['symptom']):
        tmp_dict[header] = 1
    elif isinstance(header, int) and header in model_input['error_codes']:
        tmp_dict[header] = 1
    else:
        tmp_dict[header] = 0

df = pd.DataFrame([tmp_dict], columns=tmp_dict.keys())


In [101]:
prediction_classes = ensemble.classes_[np.argsort(ensemble.predict_proba(df), axis=1)[:,-3:]][0][::-1].tolist()

In [102]:
prediction_prob = ensemble.predict_proba(df)[0][np.argsort(ensemble.predict_proba(df), axis=1)[:, -3:]][0][::-1].tolist()


In [103]:
res_dict = { "predictions": prediction_classes, "probabilities": prediction_prob }

to_json = json.dumps(res_dict)
to_json

'{"predictions": ["Reset_the_system_time_clock", "Perform_collimator_calibration", "Troubleshoot_CONSOLE_MOUSE"], "probabilities": [0.49759934311386256, 0.12014581625405019, 0.061545360441773514]}'

# Try creating a pickle and using it to predict

In [104]:
from sklearn.externals import joblib
filename = 'symptoms_only_model.pkl'
joblib.dump(ensemble, open(filename, 'wb'), protocol=2)
 
# some time later...
 

In [105]:
loaded_model2 = joblib.load(open(filename, 'rb'))

In [106]:
loaded_model2

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=65, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weig...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [107]:
# load the model from disk
loaded_model = joblib.load(open(filename, 'rb'))
prediction_classes = loaded_model.classes_[np.argsort(ensemble.predict_proba(df), axis=1)[:,-3:]][0][::-1].tolist()
prediction_prob = loaded_model.predict_proba(df)[0][np.argsort(ensemble.predict_proba(df), axis=1)[:, -3:]][0][::-1].tolist()


res_dict = {"predictions": prediction_classes, "probabilities": prediction_prob }

to_json = json.dumps(res_dict)
to_json

'{"predictions": ["Reset_the_system_time_clock", "Perform_collimator_calibration", "Troubleshoot_CONSOLE_MOUSE"], "probabilities": [0.4882628376930861, 0.1133756486365643, 0.060860154562006566]}'

# Now try to use error codes in our prediction 

In [108]:
error_codes_df = pd.read_csv('shortened_one.csv')
error_codes_df.error_codes = error_codes_df.error_codes.astype(str)

In [109]:
ec_list = error_codes_df.groupby('sr_id')['error_codes'].apply(set).apply(list)

In [110]:
ec_list_df = pd.DataFrame({'sr_id':ec_list.index, 'error_codes':ec_list.values})

In [111]:
ec_list_df.error_codes = ec_list_df.error_codes.apply(lambda x: ' '.join(x))

In [112]:
ec_list_mb_joined = pd.merge(ec_list_df, srdf, 'left') 

In [113]:
ec_list_mb_joined['merged_col'] = ec_list_mb_joined.symptom.str.cat(ec_list_mb_joined.error_codes, sep=' ')

In [114]:
ec_list_mb_joined

Unnamed: 0,error_codes,sr_id,Created_Date,Resolution_Code,symptom,merged_col
0,200288002 200003404 200304035 260100090 244540...,1-153565927726,9/6/2014 23:45,Perform_collimator_calibration,da failur first instal fast cal,da failur first instal fast cal 200288002 2000...
1,260134713 200002354 210000454 200109110 200281...,1-172373423251,6/3/2015 18:18,Troubleshoot_TABLE_SIDE_COVER,tabl stuck go,tabl stuck go 260134713 200002354 210000454 20...
2,230023070 200002392 200288002 200001830 200600...,1-172374512951,6/3/2015 18:52,Perform_LFC_and_system_software_reload,longer ilink upgrad direct connect,longer ilink upgrad direct connect 230023070 2...
3,200002392 200288002 200001830 200003404 200001...,1-172377040031,6/3/2015 20:06,Adjust_gantry_tilt_speed,set dmpr contact rep app tri remot suggest reb...,set dmpr contact rep app tri remot suggest reb...
4,230023070 200002392 200288002 200001830 200003...,1-172399424851,6/4/2015 12:37,Reset_the_system_time_clock,scanner clock run minut slow,scanner clock run minut slow 230023070 2000023...
5,200288002 230020406 200280011 200501547 230023...,1-172403194491,6/4/2015 14:14,Cycle_the_Console_power_or_restart_system_soft...,art stall engin perform fmi list select icon e...,art stall engin perform fmi list select icon e...
6,200288002 200280011 244520 230017088 200501547...,1-172405022091,6/4/2015 14:57,Troubleshoot_X_RAY_TUBE,tube start make nois scan done smell came room,tube start make nois scan done smell came room...
7,200288002 200280011 200501547 230023010 200002...,1-172407386691,6/4/2015 15:55,Perform_FastCals_Detailed_Cal_Full_Cal,water phantom huge air bubbl,water phantom huge air bubbl 200288002 2002800...
8,230023070 200002392 200288002 200001830 200001...,1-172414196391,6/4/2015 19:27,Reset_the_system_time_clock,system time drift,system time drift 230023070 200002392 20028800...
9,200288002 200280011 200501547 230023010 200002...,1-172417665135,6/4/2015 20:39,No structural problem found,state longer prompt new user log,state longer prompt new user log 200288002 200...


In [115]:
X = ec_list_mb_joined.merged_col
y = ec_list_mb_joined.Resolution_Code
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, stratify=y)

## ML Model (Taking into account symptoms and error codes) 

In [116]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import neighbors
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix
## Try out cross validation 
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect):
    
    # create document-term matrices using the vectorizer
#     X_train_dtm = vect.fit_transform(X_train)
#     X_test_dtm = vect.transform(X_test)
    
    # print the number of features that were generated
#     print('Features: ', X_train_dtm.shape[1])
    
#     Multinomial Naive Bayes (43 percent accuracy)
#     nb = MultinomialNB(alpha=5)
#     nb.fit(X_train_dtm, y_train)
#     y_pred_class = nb.predict(X_test_dtm)
    
# #     use Random Forest (44 percent accuracy)
#     rf = RandomForestClassifier(max_depth=65, n_estimators = 200, random_state=0)
#     rf.fit(X_train_dtm, y_train)
#     y_pred_class = rf.predict(X_test_dtm)

# #     XG Boost (43 percent accuracy)
#     xgb = XGBClassifier()
#     xgb.fit(X_train_dtm, y_train)
#     y_pred_class = xgb.predict(X_test_dtm)
    
#     clf = GridSearchCV(SGDClassifier(), tuned_parameters, cv=5, scoring='accuracy', verbose=10)
#     clf.fit(X_train_dtm, y_train)
#     print(clf.best_params_)
#     y_pred_class = clf.predict(X_test_dtm)
#     print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
    
    
    
    # Cross Validation
    skf = StratifiedKFold(n_splits=5, random_state=None)
    # X is the feature set and y is the target
    for train_index, val_index in skf.split(X,y): 
        print("Train:", train_index, "Validation:", val_index)
        X_train, X_test = X[train_index], X[val_index] 
        y_train, y_test = y[train_index], y[val_index]
        X_train_dtm = vect.fit_transform(X_train)
        ## merge with error codes
        print(X_train_dtm)
        X_test_dtm = vect.transform(X_test)
        
        rf = RandomForestClassifier(max_depth=65, n_estimators = 200, random_state=1)
#         rf.fit(X_train_dtm, y_train)
#         y_pred_class = rf.predict_proba(X_test_dtm)
        
# #         nb = MultinomialNB(alpha=5)
# #         nb.fit(X_train_dtm, y_train)
# #         y_pred_class = nb.predict(X_test_dtm)

# #         xgb = XGBClassifier(eta=0.1)
# #         xgb.fit(X_train_dtm, y_train)
# #         y_pred_class = xgb.predict(X_test_dtm)
        sgd = SGDClassifier(alpha=0.001, average=100, learning_rate='optimal', loss='modified_huber', n_iter=5, penalty='l2')
#         sgd.fit(X_train_dtm, y_train)
#         y_pred_class = sgd.predict_proba(X_test_dtm)
    
#     Ensemble Learning
        ensemble = VotingClassifier(estimators=[('rf', rf), ('sgd', sgd)], voting='soft')
        ensemble.fit(X_train_dtm, y_train)
        y_pred_class = ensemble.predict(X_test_dtm)
        print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
        
        ## Use this if you want to get top N accuracy
        y_pred_class = ensemble.predict_proba(X_test_dtm)
        print('Top 3 Accuracy: ', top_n_accuracy(y_pred_class, y_test, 3, ensemble))
        print('Top 5 Accuracy: ', top_n_accuracy(y_pred_class, y_test, 5, ensemble))
    return ensemble

# tune TfidfVectorizer for better results
# vect = TfidfVectorizer(norm=None, max_features=3000, min_df=0.00008, ngram_range=(1,2), sublinear_tf=True, binary = True)
# vect = TfidfVectorizer(max_features=40000,
#                              min_df=5, 
#                              max_df=0.5, 
#                              analyzer='word')
# vect = TfidfVectorizer(analyzer='word')

# alpha=0.001, average=100, eta0=0.5, learning_rate=invscaling, loss=modified_huber, n_iter=5, penalty=l2 

vect = TfidfVectorizer(analyzer='word', min_df = 5, max_df = 0.5)

ensemble = tokenize_test(vect)



Train: [ 239  400  403 ... 4062 4063 4064] Validation: [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29   30   31   32   33   34   35   36   37   38   39   40   41
   42   43   44   45   46   47   48   49   50   51   52   53   54   55
   56   57   58   59   60   61   62   63   64   65   66   67   68   69
   70   71   72   73   74   75   76   77   78   79   80   81   82   83
   84   85   86   87   88   89   90   91   92   93   94   95   96   97
   98   99  100  101  102  103  104  105  106  107  108  109  110  111
  112  113  114  115  116  117  118  119  120  121  122  123  124  125
  126  127  128  129  130  131  132  133  134  135  136  137  138  139
  140  141  142  143  144  145  146  147  148  149  150  151  152  153
  154  155  156  157  158  159  160  161  162  163  164  165  166  167
  168  169  170  171  172  173  174  175  176  177  178  179  180  181
  182  183  184  185  

  if diff:


Accuracy:  0.49287410926365793
Top 3 Accuracy:  0.6947743467933492
Top 5 Accuracy:  0.7565320665083135
Train: [   0    1    2 ... 4062 4063 4064] Validation: [ 239  400  403  404  405  412  428  434  454  484  490  493  517  518
  525  548  549  550  559  581  585  587  588  598  601  602  603  607
  609  612  621  625  631  634  642  643  650  663  667  669  672  674
  678  680  683  687  690  691  694  702  706  709  717  720  722  726
  728  729  731  733  734  739  745  747  749  750  755  756  759  760
  761  764  768  770  771  772  774  778  782  785  786  788  789  791
  799  806  813  814  815  816  817  818  820  821  822  825  827  828
  829  833  834  835  836  838  842  846  848  849  851  852  855  856
  858  859  861  862  864  865  866  867  869  871  872  873  874  876
  877  878  879  880  881  882  883  884  886  888  889  890  891  896
  897  898  900  901  902  905  909  910  911  912  913  914  915  916
  917  918  920  921  922  923  924  925  928  930  931  932 

  if diff:


Accuracy:  0.5296251511487303
Top 3 Accuracy:  0.7267230955259976
Top 5 Accuracy:  0.7896009673518742
Train: [   0    1    2 ... 4062 4063 4064] Validation: [ 657  832  934  984  986 1030 1044 1116 1139 1141 1143 1144 1161 1165
 1174 1176 1177 1187 1188 1196 1208 1217 1218 1219 1222 1232 1234 1239
 1243 1247 1272 1279 1282 1314 1327 1331 1335 1340 1348 1356 1358 1363
 1369 1379 1388 1389 1403 1410 1415 1427 1444 1446 1452 1454 1456 1457
 1462 1470 1471 1473 1477 1478 1480 1492 1499 1504 1505 1506 1526 1532
 1535 1544 1550 1551 1558 1561 1567 1571 1575 1579 1581 1582 1583 1585
 1587 1588 1590 1592 1598 1602 1606 1607 1608 1609 1610 1613 1615 1616
 1617 1618 1622 1623 1627 1635 1636 1637 1644 1645 1647 1648 1650 1652
 1653 1654 1656 1657 1658 1659 1661 1665 1667 1668 1669 1670 1672 1675
 1677 1678 1684 1686 1687 1688 1690 1692 1695 1696 1697 1698 1699 1700
 1701 1702 1703 1704 1706 1707 1708 1709 1710 1713 1714 1716 1717 1719
 1720 1722 1723 1724 1725 1726 1727 1728 1729 1730 1733 1734 1

  if diff:


Accuracy:  0.4975369458128079
Top 3 Accuracy:  0.6884236453201971
Top 5 Accuracy:  0.7697044334975369
Train: [   0    1    2 ... 4062 4063 4064] Validation: [ 967 1190 1252 1640 1680 1715 1752 1755 1779 1807 1863 1877 1879 1889
 1903 1919 1924 1935 1944 2009 2050 2060 2067 2077 2080 2091 2104 2119
 2127 2132 2134 2137 2144 2154 2158 2172 2186 2193 2203 2210 2216 2218
 2223 2233 2235 2263 2267 2272 2274 2278 2286 2289 2300 2310 2311 2315
 2319 2320 2328 2339 2342 2345 2349 2356 2357 2367 2368 2383 2386 2389
 2391 2392 2400 2403 2404 2410 2419 2421 2424 2426 2427 2428 2430 2432
 2433 2434 2436 2438 2439 2442 2443 2447 2448 2449 2450 2452 2454 2455
 2457 2460 2461 2463 2464 2465 2467 2468 2469 2470 2472 2473 2475 2477
 2478 2481 2483 2484 2485 2487 2488 2489 2490 2492 2493 2494 2496 2498
 2501 2504 2505 2506 2507 2508 2509 2510 2511 2512 2514 2515 2516 2518
 2519 2521 2525 2526 2528 2529 2530 2531 2532 2533 2534 2536 2538 2540
 2542 2543 2545 2546 2548 2549 2551 2552 2553 2556 2558 2559 2

  if diff:


Accuracy:  0.5288220551378446
Top 3 Accuracy:  0.7180451127819549
Top 5 Accuracy:  0.7969924812030075
Train: [   0    1    2 ... 3657 3766 3915] Validation: [1801 1928 2480 2495 2524 2535 2547 2555 2594 2656 2746 2817 2826 2902
 2910 2918 2919 2925 2939 2964 2965 2967 2971 2980 3023 3033 3040 3046
 3048 3052 3062 3079 3093 3095 3107 3108 3112 3115 3121 3125 3128 3130
 3132 3134 3138 3140 3160 3161 3165 3166 3168 3172 3174 3176 3184 3186
 3188 3192 3194 3195 3213 3214 3215 3220 3222 3224 3229 3242 3244 3246
 3257 3258 3260 3261 3263 3265 3271 3273 3276 3279 3284 3286 3289 3292
 3295 3296 3301 3303 3304 3306 3307 3312 3313 3314 3317 3320 3325 3326
 3327 3328 3329 3330 3331 3333 3334 3335 3336 3337 3339 3340 3341 3342
 3343 3345 3346 3348 3349 3351 3352 3354 3357 3358 3359 3360 3361 3362
 3364 3365 3366 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378
 3379 3381 3382 3383 3385 3386 3387 3388 3389 3390 3392 3393 3394 3395
 3396 3397 3399 3400 3401 3402 3405 3406 3407 3408 3409 3410 3

  if diff:


Accuracy:  0.5254452926208651
Top 3 Accuracy:  0.72264631043257
Top 5 Accuracy:  0.8027989821882952


In [117]:
import json

# with open('headers.txt', 'w') as file:
#     file.write(json.dumps(list(vect.get_feature_names())))
    
# with open('headers.txt', 'r') as file:
#     headers = eval(file.read())
    
headers = list(vect.get_feature_names())

In [118]:
model_input = {
    'sr_id': '123456',
    'error_codes': [],
    'symptom': 'WHEN USING THE SMALL HEAD FOV  THERE APPEARS TO BE MOTION IN EACH SCAN  TWO DIFFERENT TECHS HAD THIS CONCERN AND FELT THE PATIENTS WERE NOT MOVING. THEY FEEL THIS MAY BE AN ARTIFACT..  WE ARE NOT CURRENTLY SCANNING HEADS IN THIS ROOM DUE TO THE POSSIBLE ARTIFACT EXAM NUMBER 40035  40032  40031 THE MOTION IS SEEN ON THE AXIAL SAG AND CORONAL SLICES Image:NONE-NONE-NONE'
}

tmp_dict = {}
for header in headers:
    if isinstance(header, str) and header in preprocess.process_text(model_input['symptom']):
        tmp_dict[header] = 1
    elif isinstance(header, int) and header in model_input['error_codes']:
        tmp_dict[header] = 1
    else:
        tmp_dict[header] = 0

df = pd.DataFrame([tmp_dict], columns=tmp_dict.keys())


In [119]:
prediction_classes = ensemble.classes_[np.argsort(ensemble.predict_proba(df), axis=1)[:,-3:]][0][::-1].tolist()

In [120]:
prediction_prob = ensemble.predict_proba(df)[0][np.argsort(ensemble.predict_proba(df), axis=1)[:, -3:]][0][::-1].tolist()


In [121]:
res_dict = { "predictions": prediction_classes, "probabilities": prediction_prob }

to_json = json.dumps(res_dict)
to_json

'{"predictions": ["Troubleshoot_HEMIT_TANK", "Perform_FastCals_Detailed_Cal_Full_Cal", "No structural problem found"], "probabilities": [0.3932977097204952, 0.3020196154203789, 0.037737689708336936]}'

In [122]:
from sklearn.externals import joblib
filename = 'full_model.pkl'
joblib.dump(ensemble, open(filename, 'wb'), protocol=2)
 
# some time later...
 

In [123]:
loaded_model2 = joblib.load(open(filename, 'rb'))

In [124]:
# load the model from disk
loaded_model = joblib.load(open(filename, 'rb'))
prediction_classes = loaded_model.classes_[np.argsort(ensemble.predict_proba(df), axis=1)[:,-3:]][0][::-1].tolist()
prediction_prob = loaded_model.predict_proba(df)[0][np.argsort(ensemble.predict_proba(df), axis=1)[:, -3:]][0][::-1].tolist()


res_dict = {"predictions": prediction_classes, "probabilities": prediction_prob }

to_json = json.dumps(res_dict)
to_json

'{"predictions": ["Troubleshoot_HEMIT_TANK", "Perform_FastCals_Detailed_Cal_Full_Cal", "No structural problem found"], "probabilities": [0.4383187687672548, 0.25699855637361946, 0.037737689708336936]}'