In [12]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV

In [13]:
data_set = pd.read_excel('qryDK_A-Rank_Reduced_Columns.xlsx')

In [14]:
data_set.head()

Unnamed: 0,SERVPART,CONTDESC,FINAL_TARGET
0,1,SQUEAKING/KNOCKING IN THE FRONT END ALL THE TI...,0
1,11,"BEEPING SOUND WHILE DRIVING, AT ONE POINT CUST...",0
2,30,CUSTOMER FEELS PULSATION WHEN APPLYING BRAKES,0
3,30,CUSTOMER STATES THER,0
4,61,CUSTOMER STATES A/C IS BLOWING HOT AIR.,0


In [15]:
data_set.shape

(167370, 3)

**Our data set is imbalanced where claims flagged as a safety concern make up less than 8 percent of the data**

In [16]:
data_set['FINAL_TARGET'].value_counts(normalize=True)

0    0.926803
1    0.073197
Name: FINAL_TARGET, dtype: float64

### First, let's remove non-value added words (aka "stop words") in the customer complaint column

In [38]:
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import string
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pybokeh/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [33]:
def cleanComplaint(text: str):
    table = str.maketrans({key: ' ' for key in string.punctuation})
    no_punctuation = text.translate(table)
    words = no_punctuation.split()
    stop_words = set(stopwords.words('english'))

In [39]:
stop_words = set(stopwords.words('english'))

In [51]:
stop_words.add('customer')
stop_words.add('client')
stop_words.add('advise')
stop_words.add('cus')
stop_words.add('cust')
stop_words.add('state')
stop_words.add('states')

In [52]:
stop_words

{'a',
 'about',
 'above',
 'advise',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'client',
 'couldn',
 "couldn't",
 'cus',
 'cust',
 'customer',
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',

In [None]:
def re

In [29]:
table = str.maketrans({key: ' ' for key in string.punctuation})
words = data_set['CONTDESC'][0].translate(table)

In [30]:
words

'SQUEAKING KNOCKING IN THE FRONT END ALL THE TIME  NOT SURE WHATS GOING PLEASE DIAG AND ADVISE  WHEN APPLYING BRAKES'

In [10]:
data_set = data_set.assign(CONTDESC_CLEANED=data_set['CONTDESC'].apply(lambda w: w for w in word_tokenize(example_sent) if not w in set(stopwords.words('english')) )

SyntaxError: unexpected EOF while parsing (<ipython-input-10-853e847fb580>, line 1)

### Separate the features from the target

In [73]:
X = data_set[['SERVPART','CONTDESC']]
y = data_set[['FINAL_TARGET']]

### Need to [one-hot encode](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) the part # column

**NOTE:** Starting with scikit-learn version 0.20, no longer need to perform a 2-step process of performing label encoding and then one-hot encode with categorical, nominal data.  Now, only need to do one-hot encoding.

In [102]:
enc_onehot = OneHotEncoder(categories='auto')

### scikit-learn gotcha: Data must be 2-D

In [75]:
part5 = X[['SERVPART']]  # Force a single column dataframe using bracket syntax

In [103]:
part5_onehot = enc_onehot.fit_transform(part5)

Our part5 which is now one-hot encoded, is a compressed sparse matrix

In [104]:
type(part5_onehot)

scipy.sparse.csr.csr_matrix

In [105]:
part5_onehot.data.size

167370

In [106]:
part5_onehot.data.shape

(167370,)

In [107]:
part5_onehot.data.ndim

1

### Next, encode the customer complaint text column

In [108]:
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

In [109]:
complaint = X['CONTDESC']

In [110]:
complaint_cvect = count_vect.fit_transform(complaint)

In [111]:
type(complaint_cvect)

scipy.sparse.csr.csr_matrix

In [112]:
complaint_tfidf = tfidf_transformer.fit_transform(complaint_cvect)

In [113]:
type(complaint_tfidf)

scipy.sparse.csr.csr_matrix

In [114]:
complaint_tfidf.shape

(167370, 28263)

### Prepare test and training data sets

In [115]:
df_train, df_test = train_test_split(data_set, test_size = 0.5, random_state = 12)

In [116]:
df_train.shape

(83685, 3)

In [117]:
df_test.shape

(83685, 3)

In [118]:
df_train.shape[0] + df_test.shape[0]

167370

In [119]:
df_train['FINAL_TARGET'].value_counts(normalize=True)

0    0.926964
1    0.073036
Name: FINAL_TARGET, dtype: float64

In [120]:
df_train['FINAL_TARGET'].value_counts()

0    77573
1     6112
Name: FINAL_TARGET, dtype: int64

### Upsample the minority class

In [121]:
from sklearn.utils import resample

df_train_upsampled = resample(df_train.query("FINAL_TARGET == 1"), # filter to minority class
                                    replace=True, 
                                    n_samples = df_train.query("FINAL_TARGET == 0").shape[0], 
                                    random_state = 321)

In [122]:
df_train_upsampled['FINAL_TARGET'].value_counts()

1    77573
Name: FINAL_TARGET, dtype: int64

In [123]:
df_train_balanced = pd.concat([df_train.query("FINAL_TARGET == 0"), df_train_upsampled])

In [124]:
df_train_balanced['FINAL_TARGET'].value_counts()

1    77573
0    77573
Name: FINAL_TARGET, dtype: int64

### Encode training part 5 data

In [125]:
part5_train = df_train_balanced[['SERVPART']]

In [126]:
part5_train_onehot = enc_onehot.transform(part5_train)  # perform transform(), NOT fit_transform()!

In [127]:
type(part5_train_onehot)

scipy.sparse.csr.csr_matrix

In [128]:
part5_train_onehot.shape

(155146, 3051)

### Encode training complaint data

In [129]:
complaint_train = df_train_balanced['CONTDESC']

In [130]:
complaint_train_cvect = count_vect.transform(complaint_train)

In [131]:
complaint_train_tfidf = tfidf_transformer.transform(complaint_train_cvect)

In [132]:
type(complaint_train_tfidf)

scipy.sparse.csr.csr_matrix

In [133]:
complaint_train_tfidf.shape

(155146, 28263)

### Assert or test to see if part 5 training matrix has same number of rows as the complaint training matrix

In [134]:
assert part5_train_onehot.shape[0] == complaint_train_tfidf.shape[0]

### Combine or concatenate the part5 training data with the complaint training data

In [135]:
X_train = sparse.hstack((part5_train_onehot, complaint_train_tfidf), format='csr')

In [136]:
X_train.shape

(155146, 31314)

### Define our y / target 1-D array:

In [137]:
y_train = df_train_balanced['FINAL_TARGET'].values

In [138]:
y_train.shape

(155146,)

### Assert or check to ensure that the training matrix has same number of rows as the target vector

In [139]:
assert X_train.shape[0] == y_train.shape[0]

### Fit classification model to training data

In [140]:
clf = MultinomialNB().fit(X_train, y_train)

### Encode the part 5 test data

In [141]:
part5_test = df_test[['SERVPART']]

In [142]:
part5_test_onehot = enc_onehot.transform(part5_test)

In [143]:
part5_test_onehot.shape

(83685, 3051)

### Encode the complaint test data

In [144]:
complaint_test = df_test['CONTDESC']

In [145]:
complaint_test_cvect = count_vect.transform(complaint_test)

In [146]:
complaint_test_tfidf = tfidf_transformer.transform(complaint_test_cvect)

In [147]:
complaint_test_tfidf.shape

(83685, 28263)

### Combine or concatenate the part 5 test matrix with the complaint test matrix

In [148]:
X_test = sparse.hstack((part5_test_onehot, complaint_test_tfidf), format='csr')

### Define test target vector y

In [149]:
y_test = df_test['FINAL_TARGET'].values

In [150]:
y_test.shape

(83685,)

In [151]:
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.900


### Predict on sample data

In [214]:
part_test = np.array(['32100'])
complaint_test = np.array(['ENGINE RUNS ROUGH'])

In [215]:
X_part5_encode = enc_onehot.transform(part_test.reshape(-1,1))  # create 2-D array

In [216]:
X_complaint_cvect = count_vect.transform(complaint_test)
X_complaint_tfidf = tfidf_transformer.transform(X_complaint_cvect)

In [217]:
X_concat = sparse.hstack((X_part5_encode, X_complaint_tfidf), format='csr')

In [218]:
predicted = clf.predict(X_concat)
print(predicted)

[0]
