# Using symptoms for prediction

In [1]:
import pandas as pd
import numpy as np
import sklearn
import re
import string
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction import text
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.externals import joblib
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
# import service request csv
srdf = pd.read_csv("analytics/service_requests_Train-test_set.csv")

## Preprocess Text Functions

In [3]:
# clean text

class Preprocess:
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.remove_list = stopwords.words('english')
        #self.remove_list.remove('no')
        self.remove_list += ['please', 'thank', 'pacific', 'canada',
        'none', 'florida', 'cst', 'tue', 'mon', 'wed', 'thu', 'fri', 'sat', 'monday', 'tuesday', 'wednesday', 'thursday','friday', 'saturday', 'sunday',
        'sun', 'cd', 'gmt', 'hw', 'sw', 'sg', 'error', 'still', 'need', 'call', 'service','customer', 'yesterday', 'today', 'year', 'yet', 'now', 'okay', 'spoke', 'spoken', 'no', 'benjamin', 'matthew', 'susan','jean', 'jason', 'hanelle', 'jan', 'feb', 'march', 'april', 'may','june', 'july', 'august', 'september', 'october', 'november', 'december']
        self.spelling_map = {
            'gantri': 'gantry',
            'gantree': 'gantry',
            'ystem': 'system',
            'patietn': 'patient',
            'patint': 'patient',
            'dispacth': 'dispatch'
        }

    def process_text(self, text):
        comp = re.compile(r'Image:[\S]*')
        text = comp.sub('', text)
        terms = self.tokenize(self.clean_text(text).lower().strip())
        clean_terms = [self.spelling_map.get(term) or term for term in terms if term not in self.remove_list and len(term) > 1]
        stemmed_tokens = list(self.porter_stem(clean_terms))
        # remove weird punctuation and numbers and images
        return ' '.join(stemmed_tokens)

    def clean_text(self, text):
        final_text = ''
        for i in text:
            if i == ' ':
                final_text += i
            elif not i.isalpha():
                final_text += ' '
            else:
                final_text += i
        return final_text

    def tokenize(self, text):
        terms = []
        # tokenize sentences before words
        sentences = nltk.sent_tokenize(text)
        for sent in sentences:
            terms += nltk.word_tokenize(sent)
        return terms

    def porter_stem(self, terms):
        for term in terms:
            yield self.stemmer.stem(term)


## Clean text

In [4]:
preprocess = Preprocess()
clean_srdf = srdf
clean_srdf['symptom'] = srdf['symptom'].apply(preprocess.process_text)


In [5]:
X = clean_srdf.symptom
y = clean_srdf.Resolution_Code
## Use int as labels for Y (Eg turn Perform_software_config into 31)
# y = clean_srdf.Resolution_Code.astype('category').cat.codes

# split X and y into training and testing sets
# from sklearn.model_selection import train_test_split
## Stratify to account for imbalance in dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, stratify=y)

In [6]:
tuned_parameters = [{'loss': ['modified_huber', 'squared_hinge'], 'penalty': ['l2'],
                     'alpha': [1e-3, 1e-4, 1e-5], 'n_iter': [5], 'learning_rate' : ['constant', 'invscaling'], 'eta0': [0.5, 0.1, 0.05], 'average' : [True, 10, 50, 100]}] 

## Function to find the top N accuracy 

In [7]:
def top_n_accuracy(preds, truths, n, model):
    best_n = np.argsort(preds, axis=1)[:,-n:]
    successes = 0
    for i in range(truths.shape[0]):
        if truths.iloc[i] in model.classes_[best_n[i,:]]:
            successes += 1
    return float(successes)/truths.shape[0]

## ML Model (solely using symptoms)

In [8]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import neighbors
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix
## Try out cross validation 
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect):
    
    # create document-term matrices using the vectorizer
#     X_train_dtm = vect.fit_transform(X_train)
#     X_test_dtm = vect.transform(X_test)
    
    # print the number of features that were generated
#     print('Features: ', X_train_dtm.shape[1])
    
#     Multinomial Naive Bayes (43 percent accuracy)
#     nb = MultinomialNB(alpha=5)
#     nb.fit(X_train_dtm, y_train)
#     y_pred_class = nb.predict(X_test_dtm)
    
# #     use Random Forest (44 percent accuracy)
#     rf = RandomForestClassifier(max_depth=65, n_estimators = 200, random_state=0)
#     rf.fit(X_train_dtm, y_train)
#     y_pred_class = rf.predict(X_test_dtm)

# #     XG Boost (43 percent accuracy)
#     xgb = XGBClassifier()
#     xgb.fit(X_train_dtm, y_train)
#     y_pred_class = xgb.predict(X_test_dtm)

## Used gridsearch to find best parameters for each ml model 
    
#     clf = GridSearchCV(SGDClassifier(), tuned_parameters, cv=5, scoring='accuracy', verbose=10)
#     clf.fit(X_train_dtm, y_train)
#     print(clf.best_params_)
#     y_pred_class = clf.predict(X_test_dtm)
#     print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
    
    
    
    # Cross Validation
    skf = StratifiedKFold(n_splits=5, random_state=None)
    # X is the feature set and y is the target
    for train_index, val_index in skf.split(X,y): 
        print("Train:", train_index, "Validation:", val_index)
        X_train, X_test = X[train_index], X[val_index] 
        y_train, y_test = y[train_index], y[val_index]
        X_train_dtm = vect.fit_transform(X_train)
        ## merge with error codes
        print(X_train_dtm)
        X_test_dtm = vect.transform(X_test)
        
        rf = RandomForestClassifier(max_depth=65, n_estimators = 200, random_state=1)
#         rf.fit(X_train_dtm, y_train)
#         y_pred_class = rf.predict_proba(X_test_dtm)
        
# #         nb = MultinomialNB(alpha=5)
# #         nb.fit(X_train_dtm, y_train)
# #         y_pred_class = nb.predict(X_test_dtm)

# #         xgb = XGBClassifier(eta=0.1)
# #         xgb.fit(X_train_dtm, y_train)
# #         y_pred_class = xgb.predict(X_test_dtm)
        sgd = SGDClassifier(alpha=0.001, average=100, learning_rate='optimal', loss='modified_huber', n_iter=5, penalty='l2')
#         sgd.fit(X_train_dtm, y_train)
#         y_pred_class = sgd.predict_proba(X_test_dtm)
    
#     Ensemble Learning
        ensemble_sym = VotingClassifier(estimators=[('rf', rf), ('sgd', sgd)], voting='soft')
        ensemble_sym.fit(X_train_dtm, y_train)
        y_pred_class = ensemble_sym.predict(X_test_dtm)
        print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
        
        ## Use this if you want to get top N accuracy
        y_pred_class = ensemble_sym.predict_proba(X_test_dtm)
        print('Top 3 Accuracy: ', top_n_accuracy(y_pred_class, y_test, 3, ensemble_sym))
        print('Top 5 Accuracy: ', top_n_accuracy(y_pred_class, y_test, 5, ensemble_sym))

    return ensemble_sym

# tune TfidfVectorizer for better results
# vect = TfidfVectorizer(norm=None, max_features=3000, min_df=0.00008, ngram_range=(1,2), sublinear_tf=True, binary = True)
# vect = TfidfVectorizer(max_features=40000,
#                              min_df=5, 
#                              max_df=0.5, 
#                              analyzer='word')
# vect = TfidfVectorizer(analyzer='word')

# alpha=0.001, average=100, eta0=0.5, learning_rate=invscaling, loss=modified_huber, n_iter=5, penalty=l2 

vect_sym = TfidfVectorizer(analyzer='word', min_df = 5, max_df = 0.5)

ensemble_sym = tokenize_test(vect_sym)



Train: [ 475  621  638 ... 6733 6734 6735] Validation: [   0    1    2 ... 2177 2215 2234]
  (0, 204)	0.20430721879157157
  (0, 383)	0.12888262509755055
  (0, 549)	0.4447469821866934
  (0, 497)	0.13630011241306433
  (0, 348)	0.18523936701637259
  (0, 849)	0.20944815559587437
  (0, 762)	0.15870110538953652
  (0, 519)	0.21206900826067385
  (0, 967)	0.18101110264033526
  (0, 896)	0.21784221056481853
  (0, 394)	0.18035819401843714
  (0, 736)	0.3534009693028856
  (0, 841)	0.1374457392955314
  (0, 923)	0.18235875054520576
  (0, 751)	0.24197533438149724
  (0, 689)	0.2377470700054599
  (0, 901)	0.10265824017677254
  (0, 329)	0.1910125693205173
  (0, 733)	0.2223734910933467
  (0, 300)	0.11403179350477391
  (0, 975)	0.16730448472176102
  (0, 328)	0.18600241469294382
  (1, 849)	0.22569398888433712
  (1, 762)	0.17101074685435708
  (1, 923)	0.39300679159880814
  :	:
  (5359, 889)	0.6385762509886008
  (5359, 155)	0.23315751029956067
  (5359, 792)	0.24062363123864614
  (5360, 984)	0.6398953694345755


  if diff:


Accuracy:  0.46647230320699706
Top 3 Accuracy:  0.6545189504373178
Top 5 Accuracy:  0.7376093294460642
Train: [   0    1    2 ... 6733 6734 6735] Validation: [ 475  621  638 ... 3883 4002 4273]
  (0, 936)	0.6488322610174964
  (0, 312)	0.5422912749301343
  (0, 170)	0.5337947828497137
  (1, 306)	0.6291297319955222
  (1, 803)	0.48363701526350733
  (1, 63)	0.2705377428393211
  (1, 342)	0.4565182412487968
  (1, 301)	0.29781108591313193
  (2, 885)	0.3756995235556908
  (2, 738)	0.3431412251936053
  (2, 498)	0.3488336510351094
  (2, 755)	0.45484909152314534
  (2, 685)	0.33418662260955506
  (2, 453)	0.2748025037286502
  (2, 891)	0.47469383883933786
  (3, 279)	0.6105067671425208
  (3, 257)	0.5583597596581278
  (3, 518)	0.5617079900337059
  (4, 689)	0.4067126982663103
  (4, 726)	0.442046150277259
  (4, 543)	0.4067126982663103
  (4, 990)	0.35193264664670804
  (4, 150)	0.31464460474199757
  (4, 457)	0.5009061270277345
  (5, 63)	0.327097506356877
  :	:
  (5369, 558)	0.20489841046169713
  (5369, 79)	

  if diff:


Accuracy:  0.4486049926578561
Top 3 Accuracy:  0.6475770925110133
Top 5 Accuracy:  0.7305433186490455
Train: [   0    1    2 ... 6733 6734 6735] Validation: [1856 1953 2050 ... 4594 4624 4913]
  (0, 929)	0.6452040375900203
  (0, 299)	0.5462734193706527
  (0, 162)	0.5341321008576723
  (1, 293)	0.6297587042782352
  (1, 791)	0.47118355335112744
  (1, 58)	0.2715272420128955
  (1, 328)	0.46782128011207713
  (1, 287)	0.2980037586279264
  (2, 875)	0.3713065227661224
  (2, 727)	0.343018448979103
  (2, 481)	0.352514966061173
  (2, 743)	0.4618939456502762
  (2, 670)	0.33839984500802545
  (2, 436)	0.27448440802636725
  (2, 882)	0.4658334961493971
  (3, 266)	0.6248990457747348
  (3, 243)	0.5535475502228212
  (3, 502)	0.5505327349323917
  (4, 674)	0.40942863555237397
  (4, 714)	0.43809977622246443
  (4, 527)	0.40942863555237397
  (4, 980)	0.3564805446867573
  (4, 141)	0.30759302051083276
  (4, 440)	0.5011118895477581
  (5, 58)	0.3249059007008189
  :	:
  (5382, 543)	0.20242830548986404
  (5382, 74)	

  if diff:


Accuracy:  0.47220163083765754
Top 3 Accuracy:  0.6508524833209784
Top 5 Accuracy:  0.7442550037064493
Train: [   0    1    2 ... 6733 6734 6735] Validation: [2838 2970 2979 ... 5884 5989 6084]
  (0, 939)	0.6513116628316743
  (0, 314)	0.5399062151329602
  (0, 171)	0.5331926450357705
  (1, 308)	0.6326573440507262
  (1, 804)	0.47341020917588295
  (1, 62)	0.2715456376265386
  (1, 342)	0.46366798718917895
  (1, 302)	0.294792169517204
  (2, 887)	0.371227475057605
  (2, 736)	0.3469389337113125
  (2, 494)	0.3469389337113125
  (2, 751)	0.46046422680903576
  (2, 680)	0.33677041931648455
  (2, 449)	0.2765369438085468
  (2, 892)	0.4685537466763604
  (3, 281)	0.63170680085697
  (3, 256)	0.5326547566688393
  (3, 514)	0.5632276874844775
  (4, 684)	0.40640178594885873
  (4, 723)	0.43992080348754253
  (4, 536)	0.4133152625011573
  (4, 990)	0.35983470442951043
  (4, 151)	0.3127582098851059
  (4, 453)	0.49313188546031306
  (5, 62)	0.32616347242582233
  :	:
  (5397, 554)	0.20249736109283228
  (5397, 80)	

  if diff:


Accuracy:  0.4737631184407796
Top 3 Accuracy:  0.6664167916041979
Top 5 Accuracy:  0.7473763118440779
Train: [   0    1    2 ... 5884 5989 6084] Validation: [4535 4678 4694 ... 6733 6734 6735]
  (0, 955)	0.6484211508676381
  (0, 315)	0.5445464276124524
  (0, 172)	0.5319954880278627
  (1, 309)	0.6310977467850587
  (1, 820)	0.47581103345515496
  (1, 63)	0.27297605568987976
  (1, 344)	0.4610726258301568
  (1, 304)	0.2970111128817222
  (2, 903)	0.3668770274442805
  (2, 753)	0.33881188105081866
  (2, 502)	0.34946613387506265
  (2, 769)	0.4587940315556639
  (2, 695)	0.33193444590043725
  (2, 454)	0.2763974122517217
  (2, 908)	0.4810542672101488
  (3, 280)	0.6320707828950216
  (3, 256)	0.5421731935264756
  (3, 522)	0.5536558079093701
  (4, 699)	0.4054336890693562
  (4, 740)	0.43772401122826876
  (4, 548)	0.41540896608949207
  (4, 1006)	0.3583084945124999
  (4, 151)	0.3128986372815928
  (4, 460)	0.49514247464928135
  (5, 63)	0.32775149977146495
  :	:
  (5413, 217)	0.2972344341221599
  (5413, 1

  if diff:


Accuracy:  0.45716451857467777
Top 3 Accuracy:  0.6664139499620925
Top 5 Accuracy:  0.7452615617892343


In [9]:
import json

# with open('headers.txt', 'w') as file:
#     file.write(json.dumps(list(vect.get_feature_names())))
    
# with open('headers.txt', 'r') as file:
#     headers = eval(file.read())
    
headers_sym = list(vect_sym.get_feature_names())

In [10]:
headers_sym

['abd',
 'abdomen',
 'abl',
 'abort',
 'accept',
 'access',
 'accur',
 'accuraci',
 'acquir',
 'acquisit',
 'acr',
 'across',
 'act',
 'activ',
 'actual',
 'ad',
 'add',
 'addit',
 'address',
 'adjust',
 'admin',
 'administr',
 'advanc',
 'advis',
 'ae',
 'affect',
 'afternoon',
 'ago',
 'ahead',
 'air',
 'alarm',
 'alert',
 'align',
 'allow',
 'almost',
 'alreadi',
 'also',
 'alt',
 'altern',
 'although',
 'alway',
 'angio',
 'angl',
 'annual',
 'anoth',
 'anyon',
 'anyth',
 'anytim',
 'anywher',
 'ap',
 'app',
 'appear',
 'applic',
 'appropri',
 'approv',
 'approx',
 'approxim',
 'arc',
 'area',
 'around',
 'array',
 'arriv',
 'arrow',
 'artifact',
 'asap',
 'ask',
 'assist',
 'atlanta',
 'attach',
 'attempt',
 'attent',
 'aug',
 'auth',
 'author',
 'auto',
 'automat',
 'autovoic',
 'avail',
 'aw',
 'awar',
 'away',
 'axial',
 'back',
 'bad',
 'balanc',
 'bang',
 'bar',
 'base',
 'bat',
 'batteri',
 'beam',
 'bear',
 'bed',
 'beep',
 'began',
 'begin',
 'behind',
 'believ',
 'bent',


# Test sample input

In [11]:
model_input = {
    'sr_id': '123456',
    'error_codes': [],
    'symptom': 'Table is making a noise when it is lowered  like plastic is catching on something.'
}

tmp_dict = {}
for header in headers_sym:
    if isinstance(header, str) and header in preprocess.process_text(model_input['symptom']):
        tmp_dict[header] = 1
    elif isinstance(header, int) and header in model_input['error_codes']:
        tmp_dict[header] = 1
    else:
        tmp_dict[header] = 0

df = pd.DataFrame([tmp_dict], columns=tmp_dict.keys())


In [12]:
prediction_classes = ensemble_sym.classes_[np.argsort(ensemble_sym.predict_proba(df), axis=1)[:,-3:]][0][::-1].tolist()

In [13]:
prediction_prob = ensemble_sym.predict_proba(df)[0][np.argsort(ensemble_sym.predict_proba(df), axis=1)[:, -3:]][0][::-1].tolist()


In [14]:
res_dict = { "predictions": prediction_classes, "probabilities": prediction_prob }

to_json = json.dumps(res_dict)
to_json

'{"predictions": ["Adjust_or_tighten_table_cover_or_touch_switch_or_sensor", "Cycle_the_Gantry_and_table_power_or_Reset_scan_hardware", "Troubleshoot_X_RAY_TUBE"], "probabilities": [0.3405346081975526, 0.1714894986631803, 0.06171984863908942]}'

# Try creating a pickle and using it to predict

In [15]:
from sklearn.externals import joblib
filename = 'symptoms_only_model.pkl'
joblib.dump(ensemble_sym, open(filename, 'wb'), protocol=2)
 
# some time later...
 

In [16]:
loaded_model2 = joblib.load(open(filename, 'rb'))

In [17]:
loaded_model2

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=65, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weig...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [18]:
# load the model from disk
loaded_model = joblib.load(open(filename, 'rb'))
prediction_classes = loaded_model.classes_[np.argsort(ensemble_sym.predict_proba(df), axis=1)[:,-3:]][0][::-1].tolist()
prediction_prob = loaded_model.predict_proba(df)[0][np.argsort(ensemble_sym.predict_proba(df), axis=1)[:, -3:]][0][::-1].tolist()


res_dict = {"predictions": prediction_classes, "probabilities": prediction_prob }

to_json = json.dumps(res_dict)
to_json

'{"predictions": ["Adjust_or_tighten_table_cover_or_touch_switch_or_sensor", "Cycle_the_Gantry_and_table_power_or_Reset_scan_hardware", "Troubleshoot_X_RAY_TUBE"], "probabilities": [0.33446863093915025, 0.1663504749253793, 0.08560031684543798]}'

# Now try to use error codes in our prediction 

In [19]:
## This error codes is filtered form the joblib file. Contains error codes for all service incidents in the past one day, with NA values dropped

error_codes_df = pd.read_csv('shortened_one.csv')
error_codes_df.error_codes = error_codes_df.error_codes.astype(str)

In [20]:
ec_list = error_codes_df.groupby('sr_id')['error_codes'].apply(set).apply(list)

In [21]:
ec_list_df = pd.DataFrame({'sr_id':ec_list.index, 'error_codes':ec_list.values})

In [22]:
ec_list_df.error_codes = ec_list_df.error_codes.apply(lambda x: ' '.join(x))

In [23]:
ec_list_mb_joined = pd.merge(ec_list_df, srdf, 'left') 

In [24]:
ec_list_mb_joined['merged_col'] = ec_list_mb_joined.symptom.str.cat(ec_list_mb_joined.error_codes, sep=' ')

In [25]:
ec_list_mb_joined

Unnamed: 0,error_codes,sr_id,Created_Date,Resolution_Code,symptom,merged_col
0,200003404 260142209 245278 200281001 200003405...,1-153565927726,9/6/2014 23:45,Perform_collimator_calibration,da failur first instal fast cal,da failur first instal fast cal 200003404 2601...
1,200002379 200109110 210000454 200002354 200281...,1-172373423251,6/3/2015 18:18,Troubleshoot_TABLE_SIDE_COVER,tabl stuck go,tabl stuck go 200002379 200109110 210000454 20...
2,0 245278 230015038 200281001 244649 245279 244...,1-172374512951,6/3/2015 18:52,Perform_LFC_and_system_software_reload,longer ilink upgrad direct connect,longer ilink upgrad direct connect 0 245278 23...
3,200003404 0 200003402 14091 200281001 244649 2...,1-172377040031,6/3/2015 20:06,Adjust_gantry_tilt_speed,set dmpr contact rep app tri remot suggest reb...,set dmpr contact rep app tri remot suggest reb...
4,200003404 0 245278 200003402 200281001 244649 ...,1-172399424851,6/4/2015 12:37,Reset_the_system_time_clock,scanner clock run minut slow,scanner clock run minut slow 200003404 0 24527...
5,230020406 245278 200281001 200110035 260132614...,1-172403194491,6/4/2015 14:14,Cycle_the_Console_power_or_restart_system_soft...,art stall engin perform fmi list select icon e...,art stall engin perform fmi list select icon e...
6,244655 245278 200281001 200110035 245303 26014...,1-172405022091,6/4/2015 14:57,Troubleshoot_X_RAY_TUBE,tube start make nois scan done smell came room,tube start make nois scan done smell came room...
7,245278 200281001 200502003 200110035 245303 26...,1-172407386691,6/4/2015 15:55,Perform_FastCals_Detailed_Cal_Full_Cal,water phantom huge air bubbl,water phantom huge air bubbl 245278 200281001 ...
8,260134736 7 0 245278 230015038 244656 20028100...,1-172414196391,6/4/2015 19:27,Reset_the_system_time_clock,system time drift,system time drift 260134736 7 0 245278 2300150...
9,245278 200281001 200110035 245303 260140018 20...,1-172417665135,6/4/2015 20:39,No structural problem found,state longer prompt new user log,state longer prompt new user log 245278 200281...


In [26]:
X = ec_list_mb_joined.merged_col
y = ec_list_mb_joined.Resolution_Code
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, stratify=y)

## ML Model (Taking into account symptoms and error codes) 

In [27]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import neighbors
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix
## Try out cross validation 
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect):
    
    # create document-term matrices using the vectorizer
#     X_train_dtm = vect.fit_transform(X_train)
#     X_test_dtm = vect.transform(X_test)
    
    # print the number of features that were generated
#     print('Features: ', X_train_dtm.shape[1])
    
#     Multinomial Naive Bayes (43 percent accuracy)
#     nb = MultinomialNB(alpha=5)
#     nb.fit(X_train_dtm, y_train)
#     y_pred_class = nb.predict(X_test_dtm)
    
# #     use Random Forest (44 percent accuracy)
#     rf = RandomForestClassifier(max_depth=65, n_estimators = 200, random_state=0)
#     rf.fit(X_train_dtm, y_train)
#     y_pred_class = rf.predict(X_test_dtm)

# #     XG Boost (43 percent accuracy)
#     xgb = XGBClassifier()
#     xgb.fit(X_train_dtm, y_train)
#     y_pred_class = xgb.predict(X_test_dtm)
    
#     clf = GridSearchCV(SGDClassifier(), tuned_parameters, cv=5, scoring='accuracy', verbose=10)
#     clf.fit(X_train_dtm, y_train)
#     print(clf.best_params_)
#     y_pred_class = clf.predict(X_test_dtm)
#     print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
    
    
    
    # Cross Validation
    skf = StratifiedKFold(n_splits=5, random_state=None)
    # X is the feature set and y is the target
    for train_index, val_index in skf.split(X,y): 
        print("Train:", train_index, "Validation:", val_index)
        X_train, X_test = X[train_index], X[val_index] 
        y_train, y_test = y[train_index], y[val_index]
        X_train_dtm = vect.fit_transform(X_train)
        ## merge with error codes
        print(X_train_dtm)
        X_test_dtm = vect.transform(X_test)
        
        rf = RandomForestClassifier(max_depth=65, n_estimators = 200, random_state=1)
#         rf.fit(X_train_dtm, y_train)
#         y_pred_class = rf.predict_proba(X_test_dtm)
        
# #         nb = MultinomialNB(alpha=5)
# #         nb.fit(X_train_dtm, y_train)
# #         y_pred_class = nb.predict(X_test_dtm)

# #         xgb = XGBClassifier(eta=0.1)
# #         xgb.fit(X_train_dtm, y_train)
# #         y_pred_class = xgb.predict(X_test_dtm)
        sgd = SGDClassifier(alpha=0.001, average=100, learning_rate='optimal', loss='modified_huber', n_iter=5, penalty='l2')
#         sgd.fit(X_train_dtm, y_train)
#         y_pred_class = sgd.predict_proba(X_test_dtm)
    
#     Ensemble Learning
        ensemble_sym_error = VotingClassifier(estimators=[('rf', rf), ('sgd', sgd)], voting='soft')
        ensemble_sym_error.fit(X_train_dtm, y_train)
        y_pred_class = ensemble_sym_error.predict(X_test_dtm)
        print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
        
        ## Use this if you want to get top N accuracy
        y_pred_class = ensemble_sym_error.predict_proba(X_test_dtm)
        print('Top 3 Accuracy: ', top_n_accuracy(y_pred_class, y_test, 3, ensemble_sym_error))
        print('Top 5 Accuracy: ', top_n_accuracy(y_pred_class, y_test, 5, ensemble_sym_error))
    return ensemble_sym_error

# tune TfidfVectorizer for better results
# vect = TfidfVectorizer(norm=None, max_features=3000, min_df=0.00008, ngram_range=(1,2), sublinear_tf=True, binary = True)
# vect = TfidfVectorizer(max_features=40000,
#                              min_df=5, 
#                              max_df=0.5, 
#                              analyzer='word')
# vect = TfidfVectorizer(analyzer='word')

# alpha=0.001, average=100, eta0=0.5, learning_rate=invscaling, loss=modified_huber, n_iter=5, penalty=l2 

vect_sym_error = TfidfVectorizer(analyzer='word', min_df = 5, max_df = 0.5)

ensemble_sym_error = tokenize_test(vect_sym_error)



Train: [ 239  400  403 ... 4062 4063 4064] Validation: [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29   30   31   32   33   34   35   36   37   38   39   40   41
   42   43   44   45   46   47   48   49   50   51   52   53   54   55
   56   57   58   59   60   61   62   63   64   65   66   67   68   69
   70   71   72   73   74   75   76   77   78   79   80   81   82   83
   84   85   86   87   88   89   90   91   92   93   94   95   96   97
   98   99  100  101  102  103  104  105  106  107  108  109  110  111
  112  113  114  115  116  117  118  119  120  121  122  123  124  125
  126  127  128  129  130  131  132  133  134  135  136  137  138  139
  140  141  142  143  144  145  146  147  148  149  150  151  152  153
  154  155  156  157  158  159  160  161  162  163  164  165  166  167
  168  169  170  171  172  173  174  175  176  177  178  179  180  181
  182  183  184  185  

  if diff:


Accuracy:  0.48812351543942994
Top 3 Accuracy:  0.6876484560570071
Top 5 Accuracy:  0.7577197149643705
Train: [   0    1    2 ... 4062 4063 4064] Validation: [ 239  400  403  404  405  412  428  434  454  484  490  493  517  518
  525  548  549  550  559  581  585  587  588  598  601  602  603  607
  609  612  621  625  631  634  642  643  650  663  667  669  672  674
  678  680  683  687  690  691  694  702  706  709  717  720  722  726
  728  729  731  733  734  739  745  747  749  750  755  756  759  760
  761  764  768  770  771  772  774  778  782  785  786  788  789  791
  799  806  813  814  815  816  817  818  820  821  822  825  827  828
  829  833  834  835  836  838  842  846  848  849  851  852  855  856
  858  859  861  862  864  865  866  867  869  871  872  873  874  876
  877  878  879  880  881  882  883  884  886  888  889  890  891  896
  897  898  900  901  902  905  909  910  911  912  913  914  915  916
  917  918  920  921  922  923  924  925  928  930  931  932 

  if diff:


Accuracy:  0.532043530834341
Top 3 Accuracy:  0.7255139056831923
Top 5 Accuracy:  0.7944377267230955
Train: [   0    1    2 ... 4062 4063 4064] Validation: [ 657  832  934  984  986 1030 1044 1116 1139 1141 1143 1144 1161 1165
 1174 1176 1177 1187 1188 1196 1208 1217 1218 1219 1222 1232 1234 1239
 1243 1247 1272 1279 1282 1314 1327 1331 1335 1340 1348 1356 1358 1363
 1369 1379 1388 1389 1403 1410 1415 1427 1444 1446 1452 1454 1456 1457
 1462 1470 1471 1473 1477 1478 1480 1492 1499 1504 1505 1506 1526 1532
 1535 1544 1550 1551 1558 1561 1567 1571 1575 1579 1581 1582 1583 1585
 1587 1588 1590 1592 1598 1602 1606 1607 1608 1609 1610 1613 1615 1616
 1617 1618 1622 1623 1627 1635 1636 1637 1644 1645 1647 1648 1650 1652
 1653 1654 1656 1657 1658 1659 1661 1665 1667 1668 1669 1670 1672 1675
 1677 1678 1684 1686 1687 1688 1690 1692 1695 1696 1697 1698 1699 1700
 1701 1702 1703 1704 1706 1707 1708 1709 1710 1713 1714 1716 1717 1719
 1720 1722 1723 1724 1725 1726 1727 1728 1729 1730 1733 1734 17

  if diff:


Accuracy:  0.4963054187192118
Top 3 Accuracy:  0.7007389162561576
Top 5 Accuracy:  0.7795566502463054
Train: [   0    1    2 ... 4062 4063 4064] Validation: [ 967 1190 1252 1640 1680 1715 1752 1755 1779 1807 1863 1877 1879 1889
 1903 1919 1924 1935 1944 2009 2050 2060 2067 2077 2080 2091 2104 2119
 2127 2132 2134 2137 2144 2154 2158 2172 2186 2193 2203 2210 2216 2218
 2223 2233 2235 2263 2267 2272 2274 2278 2286 2289 2300 2310 2311 2315
 2319 2320 2328 2339 2342 2345 2349 2356 2357 2367 2368 2383 2386 2389
 2391 2392 2400 2403 2404 2410 2419 2421 2424 2426 2427 2428 2430 2432
 2433 2434 2436 2438 2439 2442 2443 2447 2448 2449 2450 2452 2454 2455
 2457 2460 2461 2463 2464 2465 2467 2468 2469 2470 2472 2473 2475 2477
 2478 2481 2483 2484 2485 2487 2488 2489 2490 2492 2493 2494 2496 2498
 2501 2504 2505 2506 2507 2508 2509 2510 2511 2512 2514 2515 2516 2518
 2519 2521 2525 2526 2528 2529 2530 2531 2532 2533 2534 2536 2538 2540
 2542 2543 2545 2546 2548 2549 2551 2552 2553 2556 2558 2559 2

  if diff:


Accuracy:  0.5288220551378446
Top 3 Accuracy:  0.7230576441102757
Top 5 Accuracy:  0.7994987468671679
Train: [   0    1    2 ... 3657 3766 3915] Validation: [1801 1928 2480 2495 2524 2535 2547 2555 2594 2656 2746 2817 2826 2902
 2910 2918 2919 2925 2939 2964 2965 2967 2971 2980 3023 3033 3040 3046
 3048 3052 3062 3079 3093 3095 3107 3108 3112 3115 3121 3125 3128 3130
 3132 3134 3138 3140 3160 3161 3165 3166 3168 3172 3174 3176 3184 3186
 3188 3192 3194 3195 3213 3214 3215 3220 3222 3224 3229 3242 3244 3246
 3257 3258 3260 3261 3263 3265 3271 3273 3276 3279 3284 3286 3289 3292
 3295 3296 3301 3303 3304 3306 3307 3312 3313 3314 3317 3320 3325 3326
 3327 3328 3329 3330 3331 3333 3334 3335 3336 3337 3339 3340 3341 3342
 3343 3345 3346 3348 3349 3351 3352 3354 3357 3358 3359 3360 3361 3362
 3364 3365 3366 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378
 3379 3381 3382 3383 3385 3386 3387 3388 3389 3390 3392 3393 3394 3395
 3396 3397 3399 3400 3401 3402 3405 3406 3407 3408 3409 3410 3



Accuracy:  0.5267175572519084
Top 3 Accuracy:  0.7251908396946565
Top 5 Accuracy:  0.7951653944020356


  if diff:


In [28]:
import json

# with open('headers.txt', 'w') as file:
#     file.write(json.dumps(list(vect.get_feature_names())))
    
# with open('headers.txt', 'r') as file:
#     headers = eval(file.read())
    
headers_sym_error = list(vect_sym_error.get_feature_names())

In [29]:
headers_sym_error

['14061',
 '14091',
 '14093',
 '14145',
 '16002',
 '16079',
 '17001',
 '17002',
 '17005',
 '17006',
 '200000007',
 '200000009',
 '200000032',
 '200000034',
 '200001202',
 '200001203',
 '200001812',
 '200001814',
 '200001820',
 '200002203',
 '200002207',
 '200002301',
 '200002352',
 '200002354',
 '200002356',
 '200002694',
 '200002712',
 '200002714',
 '200002733',
 '200002736',
 '200002744',
 '200002772',
 '200002792',
 '200003402',
 '200003403',
 '200003404',
 '200003405',
 '200101009',
 '200101014',
 '200105010',
 '200107002',
 '200107004',
 '200107010',
 '200107011',
 '200107013',
 '200107017',
 '200107018',
 '200107022',
 '200107023',
 '200107027',
 '200107031',
 '200107035',
 '200107038',
 '200108019',
 '200108021',
 '200108022',
 '200108025',
 '200108052',
 '200108060',
 '200109007',
 '200109012',
 '200109070',
 '200109099',
 '200109100',
 '200109114',
 '200109122',
 '200109130',
 '200109135',
 '200109192',
 '200109193',
 '200109194',
 '200110026',
 '200110027',
 '200110044',
 '20

In [30]:
model_input = {
    'sr_id': '123456',
    'error_codes': [],
    'symptom': 'Customer states  Unit is froze. Cannot restart.'
}

tmp_dict = {}
for header in headers_sym_error:
    if isinstance(header, str) and header in preprocess.process_text(model_input['symptom']):
        tmp_dict[header] = 1
    elif isinstance(header, int) and header in model_input['error_codes']:
        tmp_dict[header] = 1
    else:
        tmp_dict[header] = 0

df = pd.DataFrame([tmp_dict], columns=tmp_dict.keys())


In [31]:
prediction_classes = ensemble_sym_error.classes_[np.argsort(ensemble_sym_error.predict_proba(df), axis=1)[:,-3:]][0][::-1].tolist()

In [32]:
prediction_prob = ensemble_sym_error.predict_proba(df)[0][np.argsort(ensemble_sym_error.predict_proba(df), axis=1)[:, -3:]][0][::-1].tolist()


In [33]:
res_dict = { "predictions": prediction_classes, "probabilities": prediction_prob }

to_json = json.dumps(res_dict)
to_json

'{"predictions": ["Cycle_the_Console_power_or_restart_system_software", "Reset_UPS", "Troubleshoot_HOST_COMPUTER"], "probabilities": [0.1680475432093311, 0.1098735411119253, 0.09690122184928064]}'

In [34]:
from sklearn.externals import joblib
filename = 'full_model.pkl'
joblib.dump(ensemble_sym_error, open(filename, 'wb'), protocol=2)
 
# some time later...
 

In [35]:
loaded_model2 = joblib.load(open(filename, 'rb'))

In [36]:
# load the model from disk
loaded_model = joblib.load(open(filename, 'rb'))
prediction_classes = loaded_model.classes_[np.argsort(ensemble_sym_error.predict_proba(df), axis=1)[:,-3:]][0][::-1].tolist()
prediction_prob = loaded_model.predict_proba(df)[0][np.argsort(ensemble_sym_error.predict_proba(df), axis=1)[:, -3:]][0][::-1].tolist()


res_dict = {"predictions": prediction_classes, "probabilities": prediction_prob }

to_json = json.dumps(res_dict)
to_json

'{"predictions": ["Cycle_the_Console_power_or_restart_system_software", "Reset_UPS", "No structural problem found"], "probabilities": [0.19517893594445748, 0.1300958429957999, 0.09690122184928064]}'

# Testing with Validation Set

In [37]:
validation_set = pd.read_csv("service_requests_validation_set.csv")

In [38]:
ec_validation_joined = pd.merge(validation_set,ec_list_df, 'left') 

## Merge the symptoms and error codes together

In [39]:
ec_list_mb_joined['merged_col'] = ec_list_mb_joined.symptom.str.cat(ec_list_mb_joined.error_codes, sep=' ')

In [40]:
ec_validation_joined.shape

(749, 5)

Since the service requests id are in a different format compared to the error codes in the job lib file, I am not able to merge the two. I would use the symptoms only model instead. 


In [41]:
X_test = validation_set.symptom.apply(preprocess.process_text)

In [42]:
y_test = validation_set.Resolution_Code

Use TFIDF Vectorizer to vectorize the words

In [43]:
vect_sym_validation = TfidfVectorizer(analyzer='word', vocabulary = headers_sym)

## Turn it into a data matrix
X_test_dtm = vect_sym_validation.fit_transform(X_test)

In [44]:
y_pred_class = ensemble_sym.predict(X_test_dtm)
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))

## Use this if you want to get top N accuracy
y_pred_class_prob = ensemble_sym.predict_proba(X_test_dtm)
print('Top 3 Accuracy: ', top_n_accuracy(y_pred_class_prob, y_test, 3, ensemble_sym))
print('Top 5 Accuracy: ', top_n_accuracy(y_pred_class_prob, y_test, 5, ensemble_sym))

Accuracy:  0.465954606141522
Top 3 Accuracy:  0.6622162883845126
Top 5 Accuracy:  0.7356475300400535


  if diff:


In [45]:
pd.DataFrame({"1-symptom given": validation_set.symptom, "2-actual resolution code": y_test , "3-predicted resolution code": y_pred_class})

Unnamed: 0,1-symptom given,2-actual resolution code,3-predicted resolution code
0,Artifact or noisy on the system Direct connect,Reseat_ECG_monitor_cable,Troubleshoot_X_RAY_TUBE
1,DOS/ We are doing cardiac scoring applications...,Reseat_ECG_monitor_cable,Reseat_ECG_monitor_cable
2,time is off about 9 mins and affecting the str...,Reset_the_system_time_clock,Reset_the_system_time_clock
3,Tom needs serial # to gantry.,No structural problem found,No structural problem found
4,CD Tues 11/18 0800CST The time clock on this s...,No structural problem found,Reset_the_system_time_clock
5,THE CLOCK ON THE SCANNER IS OFF BY 2OMIN. HOW ...,Reset_the_system_time_clock,Reset_the_system_time_clock
6,WHEN DOING FAST CAL MESSAGE APPEARED SYSTEM ...,Troubleshoot_DETECTOR_MODULE,Troubleshoot_DETECTOR_MODULE
7,CUstomer states that there is artifacts on images,Troubleshoot_HEMIT_TANK,Clean_Collimator_Face_and_detector_face
8,TEST iLINQ DO NOT RESPOND. Image:NONE-NONE-NONE,No structural problem found,No structural problem found
9,Hardware scanner stopped during scanning Imag...,Reset_Scan_Database,Troubleshoot_X_RAY_TUBE


In [46]:
 pd.DataFrame({"1-symptom": validation_set.symptom, "2-actual": y_test , "3-predicted": y_pred_class}).to_csv("Predicted_Validation_Values.csv")