In [46]:
#load dependencies
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from skmultilearn.problem_transform import ClassifierChain
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from scipy.sparse import csr_matrix

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import train_test_split

In [47]:
df = pd.read_excel('../data/interim/undersampling.xlsx')

In [48]:
df

Unnamed: 0,Comment,CPD,CB,EWC,Exec,FEW,SP,RE,Sup,SW,TEPE,VMG,OTH
0,To provide training opportunities that align w...,1,0,0,0,0,0,0,0,0,0,1,0
1,create opportunities for SW's to move laterall...,1,0,0,0,0,1,0,0,0,1,0,0
2,I feel its critical to have ongoing training a...,1,0,0,0,0,0,0,0,0,0,1,0
3,feel upset against those supervisors and the s...,1,0,0,1,0,0,0,0,0,0,0,0
4,Learn to value and treat auxiliary employees w...,1,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3278,"Transparent work planning. Meeting agendas, ac...",0,0,0,0,0,0,0,0,0,0,0,1
3279,I struggle with some of my answers because I d...,0,0,0,0,0,0,0,0,1,0,0,1
3280,"Ensuring that IT issues, access and upgrades a...",0,0,0,0,0,0,0,0,0,1,0,1
3281,A more effective and informed group to support...,0,0,0,0,0,0,0,0,0,0,0,1


In [49]:
X = df['Comment']
y = df.drop(columns=['Comment'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Training data

#### Labels

In [50]:
y_train = np.array(y_train)
y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [51]:
theme_names = y.columns
themes_ytrain = theme_names
themes_ytrain

Index(['CPD', 'CB', 'EWC', 'Exec', 'FEW', 'SP', 'RE', 'Sup', 'SW', 'TEPE',
       'VMG', 'OTH'],
      dtype='object')

#### TF-IDF

In [52]:
tfid = TfidfVectorizer() 
X_train = tfid.fit_transform(X_train)

In [53]:
X_train

<2626x7348 sparse matrix of type '<class 'numpy.float64'>'
	with 87009 stored elements in Compressed Sparse Row format>

### Test Data

#### Lables

In [54]:
y_test = np.array(y_test)
y_test

array([[1, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

#### TF-IDF

In [55]:
X_test = tfid.transform(X_test)
X_test

<657x7348 sparse matrix of type '<class 'numpy.float64'>'
	with 20350 stored elements in Compressed Sparse Row format>

## Models

In [38]:
#Parts of code adapated from DSCI 573 lab 4
#Dictionary of Base Models

models = {
    'LinearSVC': LinearSVC(),
    'MultinomialNB' : MultinomialNB(),
    'GaussianNB' : GaussianNB()#,
    #'Random Forest' : RandomForestClassifier(), too slow will use function 
    #'KNeighborsClassifier': KNeighborsClassifier(),
    #'Neural Net' : MLPClassifier()
}

In [39]:
#For themes only
#Note takes about ~15 min to run
results_dict_themes = []

for model_name, model in models.items():

    classifier_chain = ClassifierChain(model)
    
    model = classifier_chain.fit(X_train, y_train)
    
    train = model.score(X_train, y_train)
    valid = model.score(X_test, y_test)
    y_pred = model.predict(X_test)
    recall = recall_score(y_test, y_pred, average= 'micro')
    precision = precision_score(y_test, y_pred, average= 'micro')
    
    case= {'Model': model_name,
           'Train Accuracy': train,
           'Validation Accuracy': valid,
           'Recall Score': recall,
           'Precision Score': precision}
    
    results_dict_themes.append(case)

In [40]:
pd.DataFrame(results_dict_themes)

Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.948972,0.363775,0.474561,0.743478
1,MultinomialNB,0.054455,0.024353,0.015726,1.0
2,GaussianNB,0.644326,0.048706,0.33025,0.170895


## MEKA Multi-Label Classifications

In [56]:
from skmultilearn.ext import download_meka
from skmultilearn.ext import Meka
from sklearn.metrics import hamming_loss

meka_classpath = download_meka()
meka_classpath

MEKA 1.9.2 found, not downloading


'/Users/karan/scikit_ml_learn_data/meka/meka-release-1.9.2/lib/'

In [20]:
meka = Meka(
        meka_classifier = "meka.classifiers.multilabel.BR", # Binary Relevance
        weka_classifier = "weka.classifiers.bayes.NaiveBayesMultinomial", # with Naive Bayes single-label classifier
        meka_classpath = meka_classpath, #obtained via download_meka
        java_command = '/usr/bin/java' # path to java executable
)
meka

Meka(java_command='/usr/bin/java',
     meka_classifier='meka.classifiers.multilabel.BR',
     meka_classpath='/Users/karan/scikit_ml_learn_data/meka/meka-release-1.9.2/lib/',
     weka_classifier='weka.classifiers.bayes.NaiveBayesMultinomial')

In [28]:
meka.fit(X_train, y_train)

Meka(java_command='/usr/bin/java',
     meka_classifier='meka.classifiers.multilabel.BR',
     meka_classpath='/Users/karan/scikit_ml_learn_data/meka/meka-release-1.9.2/lib/',
     weka_classifier='weka.classifiers.bayes.NaiveBayesMultinomial')

In [29]:
predictions = meka.predict(X_test)

In [31]:
hamming_loss(y_test, predictions)

0.1784627092846271

In [33]:
train = meka.score(X_train, y_train)
valid = meka.score(X_test, y_test)
y_pred = meka.predict(X_test)
recall = recall_score(y_test, y_pred, average= 'micro')
precision = precision_score(y_test, y_pred, average= 'micro')

In [37]:
pd.DataFrame({
           'Train Accuracy': train,
           'Validation Accuracy': valid,
           'Recall Score': recall,
           'Precision Score': precision}, index=[0]
    )

Unnamed: 0,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,0.295506,0.17656,0.360777,0.352622


### CC

#### ADA Boost

In [45]:
meka = Meka(
        meka_classifier = "meka.classifiers.multilabel.CC", # Binary Relevance
        weka_classifier = "weka.classifiers.meta.AdaBoostM1",
        meka_classpath = meka_classpath, #obtained via download_meka
        java_command = '/usr/bin/java' # path to java executable
)
meka

Meka(java_command='/usr/bin/java',
     meka_classifier='meka.classifiers.multilabel.CC',
     meka_classpath='/Users/karan/scikit_ml_learn_data/meka/meka-release-1.9.2/lib/',
     weka_classifier='weka.classifiers.meta.AdaBoostM1')

In [46]:
meka.fit(X_train, y_train)
predictions = meka.predict(X_test)

In [47]:
train = meka.score(X_train, y_train)
valid = meka.score(X_test, y_test)
y_pred = meka.predict(X_test)
recall = recall_score(y_test, y_pred, average= 'micro')
precision = precision_score(y_test, y_pred, average= 'micro')
pd.DataFrame({
           'Train Accuracy': train,
           'Validation Accuracy': valid,
           'Recall Score': recall,
           'Precision Score': precision}, index=[0]
    )

Unnamed: 0,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,0.130998,0.138508,0.235893,0.770393


#### Bagging

In [48]:
meka = Meka(
        meka_classifier = "meka.classifiers.multilabel.CC", # Binary Relevance
        weka_classifier = "weka.classifiers.meta.Bagging",
        meka_classpath = meka_classpath, #obtained via download_meka
        java_command = '/usr/bin/java' # path to java executable
)
meka

Meka(java_command='/usr/bin/java',
     meka_classifier='meka.classifiers.multilabel.CC',
     meka_classpath='/Users/karan/scikit_ml_learn_data/meka/meka-release-1.9.2/lib/',
     weka_classifier='weka.classifiers.meta.Bagging')

In [49]:
meka.fit(X_train, y_train)
predictions = meka.predict(X_test)

KeyboardInterrupt: 

In [None]:
train = meka.score(X_train, y_train)
valid = meka.score(X_test, y_test)
y_pred = meka.predict(X_test)
recall = recall_score(y_test, y_pred, average= 'micro')
precision = precision_score(y_test, y_pred, average= 'micro')
pd.DataFrame({
           'Train Accuracy': train,
           'Validation Accuracy': valid,
           'Recall Score': recall,
           'Precision Score': precision}, index=[0]
    )

#### BayesNet

In [50]:
meka = Meka(
        meka_classifier = "meka.classifiers.multilabel.CC", # Binary Relevance
        weka_classifier = "weka.classifiers.bayes.BayesNet",
        meka_classpath = meka_classpath, #obtained via download_meka
        java_command = '/usr/bin/java' # path to java executable
)
meka

Meka(java_command='/usr/bin/java',
     meka_classifier='meka.classifiers.multilabel.CC',
     meka_classpath='/Users/karan/scikit_ml_learn_data/meka/meka-release-1.9.2/lib/',
     weka_classifier='weka.classifiers.bayes.BayesNet')

In [51]:
meka.fit(X_train, y_train)
predictions = meka.predict(X_test)

In [53]:
train = meka.score(X_train, y_train)
valid = meka.score(X_test, y_test)
y_pred = meka.predict(X_test)
recall = recall_score(y_test, y_pred, average= 'micro')
precision = precision_score(y_test, y_pred, average= 'micro')
pd.DataFrame({
           'Train Accuracy': train,
           'Validation Accuracy': valid,
           'Recall Score': recall,
           'Precision Score': precision}, index=[0]
    )

Unnamed: 0,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,0.308835,0.290715,0.53099,0.672131


#### meka.classifiers.multilabel.LC

In [57]:
meka = Meka(
        meka_classifier = "meka.classifiers.multilabel.LC", # Binary Relevance
        #weka_classifier = "weka.classifiers.bayes.BayesNet",
        meka_classpath = meka_classpath, #obtained via download_meka
        java_command = '/usr/bin/java' # path to java executable
)
meka

Meka(java_command='/usr/bin/java',
     meka_classifier='meka.classifiers.multilabel.LC',
     meka_classpath='/Users/karan/scikit_ml_learn_data/meka/meka-release-1.9.2/lib/',
     weka_classifier=None)

In [58]:
meka.fit(X_train, y_train)
predictions = meka.predict(X_test)

In [59]:
train = meka.score(X_train, y_train)
valid = meka.score(X_test, y_test)
y_pred = meka.predict(X_test)
recall = recall_score(y_test, y_pred, average= 'micro')
precision = precision_score(y_test, y_pred, average= 'micro')
pd.DataFrame({
           'Train Accuracy': train,
           'Validation Accuracy': valid,
           'Recall Score': recall,
           'Precision Score': precision}, index=[0]
    )

Unnamed: 0,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,0.624905,0.200913,0.345976,0.395349


## Glove on this

In [3]:
import os
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv1D, Conv2D, MaxPooling2D, GlobalMaxPooling1D, MaxPool1D, MaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM
from keras.utils import to_categorical
from keras import backend as K
#from tensorflow.keras import backend as K


import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import spacy
# from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
vect=Tokenizer()
vect.fit_on_texts(df['Comment'])
vocab_size = len(vect.word_index) + 1
print(vocab_size)

8324


In [7]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('/Users/karan/Downloads/glove/glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [8]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in vect.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

In [80]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.18970001,  0.050024  ,  0.19084001, ..., -0.39804   ,
         0.47646999, -0.15983   ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       ...,
       [ 0.64234   ,  0.47938001,  0.39046001, ...,  0.28545001,
         0.29418001,  0.37436   ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.17636   ,  0.55317998, -0.31927001, ..., -0.49941   ,
         0.58595997, -0.15044001]])

In [81]:
X = df[['Comment']]
y = df.drop(columns=['Comment'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [82]:
def max_len(x):
    a=x.split()
    return len(a)

max_len = max(df['Comment'].apply(max_len))
max_len

146

In [83]:
encoded_docs_train = vect.texts_to_sequences(X_train['Comment'])
max_length = vocab_size
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=max_len, padding='post')
print(padded_docs_train)

[[ 346  551  613 ...    0    0    0]
 [  90   36 2263 ...    0    0    0]
 [ 436   18  894 ...    0    0    0]
 ...
 [ 697    1   15 ...    0    0    0]
 [ 643    9   77 ...    0    0    0]
 [   5  348 8178 ...    0    0    0]]


In [84]:
y_train = np.array(y_train)

In [85]:
max_features = embedding_matrix.shape[0]
maxlen = max_len
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 10
embed_size = 100 # for glove we are using 100d dataset
n_class = 12

In [86]:
model = Sequential()

model.add(Embedding(max_features, embed_size, weights=[embedding_matrix],
                        trainable=False, input_length=maxlen))

model.add(Dropout(0.1))
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu',
                 strides=1))
model.add(MaxPooling1D())
model.add(Conv1D(filters, kernel_size, padding='valid',activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(hidden_dims, activation = 'relu'))
model.add(Dropout(0.1))
model.add(Dense(n_class, activation = 'sigmoid'))


model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 146, 100)          832400    
_________________________________________________________________
dropout_12 (Dropout)         (None, 146, 100)          0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 144, 250)          75250     
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 72, 250)           0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 70, 250)           187750    
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 35, 250)           0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 8750)              0         
__________

In [87]:
opt = keras.optimizers.Adam(lr=0.01)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

  return array(obj, copy=False)


In [88]:
# Train Model
model.fit(padded_docs_train, y_train, batch_size=batch_size, epochs=5, class_weight=weights)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ff2995aaf10>

### Precision & Recall

In [89]:
# creating padded dataset for x_valid
encoded_docs_valid = vect.texts_to_sequences(X_test['Comment'])
padded_docs_valid = pad_sequences(encoded_docs_valid, maxlen=max_len, padding='post')

In [90]:
y_test = np.array(y_test)

In [91]:
score = model.evaluate(padded_docs_valid,y_test)
score



[0.38082090328636053, 0.8675799322636342]

In [92]:
pred = model.predict(padded_docs_valid, batch_size=batch_size, verbose=1)



In [93]:
pred

array([[0.17814445, 0.19023477, 0.11674318, ..., 0.8478421 , 0.21039556,
        0.18262035],
       [0.04306564, 0.19953047, 0.05309377, ..., 0.11371254, 0.01344639,
        0.0858386 ],
       [0.09085   , 0.12003379, 0.05861139, ..., 0.26561162, 0.17625782,
        0.12425753],
       ...,
       [0.05833021, 0.13478012, 0.04545392, ..., 0.08720328, 0.01749544,
        0.11164635],
       [0.02363847, 0.02747857, 0.02640555, ..., 0.81004304, 0.02224762,
        0.07692412],
       [0.16842692, 0.10970025, 0.11133917, ..., 0.03868888, 0.20595995,
        0.10622642]], dtype=float32)

In [94]:
from sklearn.metrics import precision_score, recall_score, f1_score

predictions = pred
thresholds=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

for val in thresholds:
    print("For threshold: ", val)
    pred=predictions.copy()
  
    pred[pred>=val]=1
    pred[pred<val]=0
  
    precision = precision_score(y_test, pred, average='micro')
    recall = recall_score(y_test, pred, average='micro')
    f1 = f1_score(y_test, pred, average='micro')
   
    print("Micro-average quality numbers")
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
    print('\n')

For threshold:  0.1
Micro-average quality numbers
Precision: 0.1709, Recall: 0.8196, F1-measure: 0.2829


For threshold:  0.2
Micro-average quality numbers
Precision: 0.2347, Recall: 0.4246, F1-measure: 0.3023


For threshold:  0.3
Micro-average quality numbers
Precision: 0.2975, Recall: 0.1841, F1-measure: 0.2274


For threshold:  0.4
Micro-average quality numbers
Precision: 0.4545, Recall: 0.0925, F1-measure: 0.1537


For threshold:  0.5
Micro-average quality numbers
Precision: 0.6796, Recall: 0.0648, F1-measure: 0.1182


For threshold:  0.6
Micro-average quality numbers
Precision: 0.7625, Recall: 0.0564, F1-measure: 0.1051


For threshold:  0.7
Micro-average quality numbers
Precision: 0.8060, Recall: 0.0500, F1-measure: 0.0941


For threshold:  0.8
Micro-average quality numbers
Precision: 0.8269, Recall: 0.0398, F1-measure: 0.0759


For threshold:  0.9
Micro-average quality numbers
Precision: 0.8667, Recall: 0.0241, F1-measure: 0.0468




#### LSTM using Glove

In [66]:
model = Sequential()

# Configuring the parameters
model.add(Embedding(max_features, 100, input_length=max_len, weights=[embedding_matrix], trainable=False))

model.add(LSTM(120, return_sequences=True))  

# Adding a dropout layer
model.add(Dropout(0.1))
model.add(Dense(32, activation='sigmoid'))

model.add(Dense(n_class, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 146, 100)          832400    
_________________________________________________________________
lstm_3 (LSTM)                (None, 146, 120)          106080    
_________________________________________________________________
dropout_9 (Dropout)          (None, 146, 120)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                47360     
_________________________________________________________________
dense_8 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_9 (Dense)              (None, 12)                396       
Total params: 988,316
Trainable params: 155,916
Non-trainable params: 832,400
________________________________________________________________

In [67]:
opt = keras.optimizers.Adam(lr=0.01)

In [68]:
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [69]:
# Train Model
model.fit(padded_docs_train, y_train, batch_size=batch_size, epochs=2, class_weight='auto')

Epoch 1/2


INFO:plaidml:Analyzing Ops: 1905 of 28322 operations complete
INFO:plaidml:Analyzing Ops: 6663 of 28322 operations complete
INFO:plaidml:Analyzing Ops: 11109 of 28322 operations complete
INFO:plaidml:Analyzing Ops: 18410 of 28322 operations complete
INFO:plaidml:Analyzing Ops: 24215 of 28322 operations complete




KeyboardInterrupt: 

INFO:plaidml:Analyzing Ops: 7102 of 28322 operations complete
INFO:plaidml:Analyzing Ops: 11518 of 28322 operations complete
INFO:plaidml:Analyzing Ops: 17816 of 28322 operations complete
INFO:plaidml:Analyzing Ops: 24034 of 28322 operations complete


Epoch 2/2


<keras.callbacks.History at 0x7ff2e4ca0990>

In [70]:
from sklearn.utils import class_weight

In [75]:
weights = class_weight.compute_sample_weight('balanced', y_train)