In [1]:
#natural language processing to predict field failure modes

In [2]:
import pandas as pd

In [3]:
import seaborn as sns

In [4]:
from sklearn.cross_validation import train_test_split

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [6]:
from sklearn.naive_bayes import MultinomialNB

In [7]:
from sklearn import metrics

In [8]:
import numpy as np

In [9]:
from textblob import TextBlob, Word

In [10]:
from nltk.stem.snowball import SnowballStemmer

In [11]:
#define fuctions

In [12]:
def word_tokenize(text, how = 'lemma'):
    words = TextBlob(text).words
    if how == 'lemma':
        return [word.lemmatize() for word in words]
    elif how == 'stem':
        return [stemmer.stem(word) for word in words]

In [13]:
#read in csv file

In [14]:
df = pd.read_csv('C:\Users\sglembocki\Documents\Excel Files\FaultModeNaturalLanguageProcessingRev2.csv') #think about making everything lowercase

In [15]:
#delete unused fields

In [16]:
del df['JobID']

In [17]:
del df['CorrectionCode']

In [18]:
df = df.dropna()

In [19]:
#create a new field that combines all other features of interest

In [20]:
df['All'] = df['ComplaintNarrative'] + ' ' + df['CauseDescription'] + ' ' + df['CauseNarrative'] + ' ' + df['CorrectionDescription'] + ' ' + df['CorrectionNarrative'] + ' ' + df['RollupSymptom']

In [21]:
df.head()

Unnamed: 0,ComplaintNarrative,CauseDescription,CauseNarrative,CorrectionDescription,CorrectionNarrative,FaultModeID,FailureMode,RollupSymptom,All
0,Customer states: Customer has advised since th...,No Fault Found / Unable To Replicate,Customer education.,Rear Motor General Diagnosis,Hill Start Assist automatically engages the\nb...,5388,Unable to Duplicate Concern,Vehicle rolled backwards while in Drive and on...,Customer states: Customer has advised since th...
1,Customer states: Vehicle suspension is noisy,No Fault Found / Unable To Replicate,"Checked vehicle suspension system, no problems...",Perform Validation Test Drive,Performed validation test drive.,5388,Unable to Duplicate Concern,Vehicle suspension is noisy,Customer states: Vehicle suspension is noisy N...
2,Customer states: Vehicle suspension is noisy,No Fault Found / Unable To Replicate,"Checked vehicle suspension system, no problems...",Front Suspension (including Hubs) General Diag...,"Checked vehicle suspension system, no problems...",5388,Unable to Duplicate Concern,Vehicle suspension is noisy,Customer states: Vehicle suspension is noisy N...
3,Customer states vehicle charges for a couple m...,No Fault Found / Unable To Replicate,Customer is having charging issues and cause i...,Charge System Inlet General Diagnosis Conclusi...,Vehicle is experiencing charging issues. Cause...,5388,Unable to Duplicate Concern,Vehicle charges intermittently,Customer states vehicle charges for a couple m...
4,Customer states vehicle charges for a couple m...,No Fault Found / Unable To Replicate,Customer is having charging issues and cause i...,Master Charger - RH - 1st Generation,Charger not installed due to it not being deem...,5388,Unable to Duplicate Concern,Vehicle charges intermittently,Customer states vehicle charges for a couple m...


In [22]:
#decode feature column

In [23]:
df['CorrectionNarrativeRev2'] = df['All'].str.decode('utf-8', errors = 'ignore')

In [24]:
df.head()

Unnamed: 0,ComplaintNarrative,CauseDescription,CauseNarrative,CorrectionDescription,CorrectionNarrative,FaultModeID,FailureMode,RollupSymptom,All,CorrectionNarrativeRev2
0,Customer states: Customer has advised since th...,No Fault Found / Unable To Replicate,Customer education.,Rear Motor General Diagnosis,Hill Start Assist automatically engages the\nb...,5388,Unable to Duplicate Concern,Vehicle rolled backwards while in Drive and on...,Customer states: Customer has advised since th...,Customer states: Customer has advised since th...
1,Customer states: Vehicle suspension is noisy,No Fault Found / Unable To Replicate,"Checked vehicle suspension system, no problems...",Perform Validation Test Drive,Performed validation test drive.,5388,Unable to Duplicate Concern,Vehicle suspension is noisy,Customer states: Vehicle suspension is noisy N...,Customer states: Vehicle suspension is noisy N...
2,Customer states: Vehicle suspension is noisy,No Fault Found / Unable To Replicate,"Checked vehicle suspension system, no problems...",Front Suspension (including Hubs) General Diag...,"Checked vehicle suspension system, no problems...",5388,Unable to Duplicate Concern,Vehicle suspension is noisy,Customer states: Vehicle suspension is noisy N...,Customer states: Vehicle suspension is noisy N...
3,Customer states vehicle charges for a couple m...,No Fault Found / Unable To Replicate,Customer is having charging issues and cause i...,Charge System Inlet General Diagnosis Conclusi...,Vehicle is experiencing charging issues. Cause...,5388,Unable to Duplicate Concern,Vehicle charges intermittently,Customer states vehicle charges for a couple m...,Customer states vehicle charges for a couple m...
4,Customer states vehicle charges for a couple m...,No Fault Found / Unable To Replicate,Customer is having charging issues and cause i...,Master Charger - RH - 1st Generation,Charger not installed due to it not being deem...,5388,Unable to Duplicate Concern,Vehicle charges intermittently,Customer states vehicle charges for a couple m...,Customer states vehicle charges for a couple m...


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53343 entries, 0 to 95257
Data columns (total 10 columns):
ComplaintNarrative         53343 non-null object
CauseDescription           53343 non-null object
CauseNarrative             53343 non-null object
CorrectionDescription      53343 non-null object
CorrectionNarrative        53343 non-null object
FaultModeID                53343 non-null int64
FailureMode                53343 non-null object
RollupSymptom              53343 non-null object
All                        53343 non-null object
CorrectionNarrativeRev2    53343 non-null object
dtypes: int64(1), object(9)
memory usage: 4.5+ MB


In [26]:
#null accuracy

In [27]:
null = df.FailureMode.value_counts() / df.shape[0]

In [28]:
null.head(2)

Unable to Duplicate Concern                                          0.088503
Falcon Door Inductive Sensors - Detect Object That Is Not Present    0.037981
Name: FailureMode, dtype: float64

In [29]:
#split into training and testing sets

In [30]:
x_train, x_test, y_train, y_test = train_test_split(df.CorrectionNarrativeRev2, df.FaultModeID, random_state = 1)

In [31]:
#learn the vocabulary of the training data

In [32]:
x_train.head()

49536    Customer states: Plastic piece on driver side ...
24362    Customer states: wind noise and  seal coming l...
87267    Door upper glass has to be corrected Poorly Fi...
93267    Customer states: Customer states alert for Dri...
12230    Customer states: Autopilot Summon function doe...
Name: CorrectionNarrativeRev2, dtype: object

In [33]:
#instantiate count vectorizer (stop_words not working)

In [34]:
vect = CountVectorizer(stop_words = 'english', analyzer = lambda x: word_tokenize(x, how = 'lemma')) #troubleshoot why the stemmer is not working

In [35]:
x_train_dtm = vect.fit_transform(x_train)

In [36]:
#examine the vocabulary and document-term matrix together

In [37]:
train_arr = x_train_dtm.toarray()

In [38]:
train_features = vect.get_feature_names()

In [52]:
pd.DataFrame(x_train_dtm.toarray(), columns = vect.get_feature_names()).head() #remove quotations, delete non-english claims (??)

Unnamed: 0,''12,''12V,''12v,''Fahrzeug,''Schlsselbatterien,''To,''Vehicle,''key,'1,'12V,...,zustzliche,zustzlichen,zuweit,zwecks,zwei,zweigeteilt,zweit,zweite,zweiten,zwischen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
#transform testing data into a document term matrix

In [54]:
x_test.head()

24394    Customer states: Alert appeared: Park Assist D...
8322     Customer states: Rear parking sensor is pushed...
85775    Customer states:  Falcon Door Interior Trim ma...
65646    Customer states: Black headliner has a hole in...
81864    Customer states: Alert appeared: 12V Battery N...
Name: CorrectionNarrativeRev2, dtype: object

In [55]:
x_test_dtm = vect.transform(x_test)

In [57]:
pd.DataFrame(x_test_dtm.toarray(), columns = vect.get_feature_names()).head()

Unnamed: 0,''12,''12V,''12v,''Fahrzeug,''Schlsselbatterien,''To,''Vehicle,''key,'1,'12V,...,zustzliche,zustzlichen,zuweit,zwecks,zwei,zweigeteilt,zweit,zweite,zweiten,zwischen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
x_train_dtm.shape

(40007, 29357)

In [59]:
x_test_dtm.shape

(13336, 29357)

In [60]:
#create df with count and token

In [61]:
x_train_token_counts = pd.DataFrame({'Token': train_features, 'Count': np.sum(train_arr, axis = 0)})

In [62]:
x_train_token_counts.sort_index(by = 'Count', ascending = False).head() #stop words still exist

  if __name__ == '__main__':


Unnamed: 0,Count,Token
27157,93227,the
13491,87233,and
27356,71989,to
16923,58080,door
20180,40647,is


In [63]:
#instantiate the model

In [64]:
nb = MultinomialNB()

In [65]:
nb.fit(x_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [66]:
predictions = nb.predict(x_test_dtm)

In [67]:
predictions

array([ 1051,  5626,  2373, ..., 10083,  1051, 10083], dtype=int64)

In [68]:
#model accuracy

In [69]:
metrics.accuracy_score(y_test, predictions)

0.33878224355128972