# Comment Effectiveness with AI

In [1]:

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.metrics import accuracy_score

from sklearn.utils import resample
from gensim.models import KeyedVectors

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling1D,Bidirectional,SpatialDropout1D
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dropout

from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords
from nltk import word_tokenize
# English stop-word set; lemmatize_word later drops every token found in it.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
STOPWORDS = set(stopwords.words('english'))

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
#cufflinks.go_offline()
#cufflinks.set_config_file(world_readable=True, theme='pearl')





ModuleNotFoundError: No module named 'keras'

In [None]:

# Read the labelled feedback sheet; both text columns are converted with str on load.
df = pd.read_excel(
    'Comments_Final.xlsx',
    sheet_name='Quarterly Checkin Feedback',
    header=0,
    converters={"comments": str, "quality": str},
)
# Discard every row that has a missing value in any column.
df.dropna(inplace=True)
df.dtypes  # evaluated but not rendered: only the cell's last expression is displayed
df.head()



In [None]:

# Renumber rows 0..n-1. The length is taken from the frame itself, so the cell keeps working
# when the spreadsheet's row count changes (a hard-coded range(10405) raises a
# length-mismatch error for any other size).
df.index = range(len(df))

# Total number of space-separated tokens across all comments, kept in a name so it can be
# inspected instead of being computed and thrown away.
total_word_count = df['comments'].apply(lambda x: len(x.split(' '))).sum()

# Normalise label casing: lowercase 'effective' becomes 'Effective'.
df['quality'] = df['quality'].replace({'effective': 'Effective'})



In [None]:

# Class distribution of the quality label.
df['quality'].value_counts()



In [None]:

# Balance the two classes by down-sampling the "Ineffective" rows.
df_majority = df[df.quality == "Ineffective"]
df_minority = df[df.quality == "Effective"]

# Target size of the down-sampled class, capped at the rows actually available because
# sampling without replacement cannot draw more rows than exist.
N_MAJORITY_SAMPLES = min(2592, len(df_majority))

df_majority_downsampled = resample(df_majority,
                                   replace=False,    # down-sampling: never duplicate rows
                                   n_samples=N_MAJORITY_SAMPLES,
                                   random_state=42)  # reproducible results

df1 = pd.concat([df_minority, df_majority_downsampled])
# NOTE(review): the cells that follow keep using `df`, not the balanced `df1` — confirm intent.

# Bar chart of the class counts after balancing.
cnt_pro = df1['quality'].value_counts()
plt.figure(figsize=(12,4))
# x / y are passed by keyword, which works on every seaborn version; newer releases
# no longer accept the data vectors positionally.
sns.barplot(x=cnt_pro.index, y=cnt_pro.values, alpha=0.8)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Quality', fontsize=12)
plt.xticks(rotation=90)
plt.show();



In [None]:

def _count_words(comment):
    """Return the number of whitespace-separated tokens in ``comment`` (coerced to str)."""
    return len(str(comment).split())

df['word_count'] = df['comments'].apply(_count_words)

# Any comment shorter than 30 words is labelled "Ineffective", whatever its original label.
is_short = df['word_count'] < 30
df.loc[is_short, 'quality'] = "Ineffective"



In [None]:

# Path of the word2vec file handed to KeyedVectors.load_word2vec_format in the next cell
# (resolved relative to the notebook's working directory).
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'



In [None]:

# Load the word vectors from EMBEDDING_FILE, read in word2vec binary format.
# NOTE(review): presumably a slow, memory-heavy load — consider caching; confirm.
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)



In [None]:

# Punctuation that clean_text replaces with a space. Raw string literals keep the pattern
# identical while avoiding the invalid-escape warnings that '\[', '\]' and '\|' trigger
# in a normal string.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,]')
# Every character other than lowercase a-z, space, '#', '+' and '_' is treated as noise.
BAD_SYMBOLS_RE = re.compile(r'[^a-z #+_]')
# Rebuilds the same English stop-word set that the import cell already assigns to STOPWORDS.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one raw comment for tokenisation.

    Steps, in order: lowercase and trim; delete the HTML entity prefix ``&amp``;
    replace the punctuation in REPLACE_BY_SPACE_RE with spaces; replace every
    character matched by BAD_SYMBOLS_RE with a space; collapse whitespace runs.

    Args:
        text: the comment as a ``str``.

    Returns:
        The cleaned, single-spaced, lowercase string.
    """
    text = text.lower().strip()
    # The entity must be removed before BAD_SYMBOLS_RE runs: that pattern turns '&' into a
    # space, after which '&amp' can no longer match and a stray "amp" token is left behind.
    text = re.sub('&amp', '', text)
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub(' ', text)
    return re.sub(r'\s+', ' ', text).strip()
# Normalise every comment with clean_text.
df['comments'] = df['comments'].apply(clean_text)
# Strip digit runs. regex=True is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently turn this line into a no-op.
df['comments'] = df['comments'].str.replace(r'\d+', '', regex=True)



In [None]:

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer = WordNetLemmatizer()

def lemmatize_word(text):
    """Tokenise ``text`` and return the lemmas of its informative tokens as a list.

    Tokens of two characters or fewer and tokens present in STOPWORDS are skipped;
    every remaining token is lemmatised as a verb (``pos='v'``).
    """
    lemmas = []
    for token in word_tokenize(text):
        if len(token) <= 2 or token in STOPWORDS:
            continue
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return lemmas

# Each comment string is replaced by its list of lemmas.
df['comments'] = df['comments'].apply(lemmatize_word)



In [None]:

# Vocabulary cap handed to the tokenizer (num_words).
MAX_NB_WORDS = 20000
# Length every comment sequence is padded or truncated to.
MAX_SEQUENCE_LENGTH = 500
# Width of one embedding vector; used for the embedding matrix and the Embedding layer.
EMBEDDING_DIM = 300
# The filter list is written as a raw string: the characters are unchanged, but the '\]'
# sequence no longer triggers an invalid-escape warning.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
# NOTE(review): the vocabulary is fitted on every row of df, before the train/test split.
tokenizer.fit_on_texts(df['comments'].values)



In [None]:

#import pickle



In [None]:

#with open('tokenizer.pickle', 'wb') as handle:
#    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)



In [None]:

# Token -> integer index mapping built by the fitted tokenizer.
word_index = tokenizer.word_index
vocab_size = len(word_index)
print('Found %s unique tokens.' % vocab_size)



In [None]:

# Encode each comment as a sequence of token indices, then pad / truncate at the end
# so every row has exactly MAX_SEQUENCE_LENGTH entries.
sequences = tokenizer.texts_to_sequences(df['comments'].values)
X = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post',
)
print(X)
print('Shape of one feature tensor:', X.shape)



In [None]:

# One-hot encode the quality label: one column per distinct label value.
one_hot_labels = pd.get_dummies(df['quality'])
Y = one_hot_labels.values
print(Y)
print('Shape of one label tensor:', Y.shape)



In [None]:

# One row per usable vocabulary index plus one extra row
# (presumably for index 0, the padding value — confirm).
vocab_rows = min(len(word_index), MAX_NB_WORDS)
nb_words = vocab_rows + 1
# Start from all zeros; rows are filled with word vectors in a later cell.
embedding_matrix = np.zeros(shape=(nb_words, EMBEDDING_DIM))
embedding_matrix



In [None]:

# Sanity check: expected to be (nb_words, EMBEDDING_DIM).
embedding_matrix.shape



In [None]:

# Copy the pretrained vector of every in-vocabulary token into its row of the matrix.
# Membership (`in`) and item access (`[...]`) are used instead of `.vocab` / `.word_vec`,
# because those attributes are not available on newer gensim KeyedVectors while the two
# operators work on old and new releases alike.
for word, idx in word_index.items():
    if idx < MAX_NB_WORDS and word in word2vec:
        embedding_matrix[idx] = word2vec[word]

# Count rows that are still entirely zero, i.e. tokens with no pretrained vector
# (a row whose components merely sum to zero is not counted).
print('Null word embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))



In [None]:

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
split_parts = train_test_split(X, Y, test_size=0.20, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts
for _features, _targets in ((X_train, Y_train), (X_test, Y_test)):
    print(_features.shape, _targets.shape)



In [None]:

#Simplest model possible, linear in nature, multiple layers can be added in the order of their computation, to make it a multi-layer perceptron.
model = Sequential()

#We load this embedding matrix into an Embedding layer. Note that we set trainable=False to prevent the weights from being updated during training.
model.add(Embedding(len(embedding_matrix), EMBEDDING_DIM, weights=[embedding_matrix], 
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False))
#While 2D CNNs are used for image and video processing, 1D CNNs are used for natural language processing (NLP)
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(300)))
model.add(Dropout(0.2))

model.add(Dense(2, activation='sigmoid'))



In [None]:

# Configure the loss, optimizer and the accuracy metric (reported under the key 'acc').
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()




<h1 id="Training-Model-here">Training Model here<a class="anchor-link" href="#Training-Model-here">¶</a></h1>


In [None]:

import os

# Destination of the checkpointed weights. The historical location stays the default;
# set the QCHECKIN_WEIGHTS_PATH environment variable to redirect it on a machine where
# that user-specific folder does not exist.
WEIGHTS_PATH = os.environ.get(
    'QCHECKIN_WEIGHTS_PATH',
    'C:\\Users\\769005\\Desktop\\GoPerform AI\\LSTM MODEL\\qcheckin2019.h5',
)

# Stop once val_loss has not improved by at least min_delta for 3 epochs, and keep only
# the weights of the epoch with the best val_loss on disk.
early_stop = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
model_checkpoint = ModelCheckpoint(WEIGHTS_PATH, monitor='val_loss',
                                   save_best_only=True, save_weights_only=True)

# 10% of the training rows are held back as the validation set.
hist = model.fit(
    X_train, Y_train,
    validation_split=0.1,
    epochs=5, batch_size=64, shuffle=True,
    callbacks=[early_stop, model_checkpoint],
)

# NOTE(review): presumably this evaluates the weights left after the last epoch, not the
# checkpointed best ones, since the checkpoint file is never loaded back — confirm.
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))



In [None]:

# Model outputs for the held-out rows; later cells take the argmax of each row as the predicted class.
y_hat = model.predict(X_test)



In [None]:

# Collapse the one-hot targets and the model outputs to class indices, then compare them.
y_true_idx = np.argmax(Y_test, axis=1)
y_pred_idx = np.argmax(y_hat, axis=1)
accuracy_score(y_true_idx, y_pred_idx)



In [None]:

# Class index per row for the ground truth and for the predictions.
y_true_cls = np.argmax(Y_test, axis=1)
y_pred_cls = np.argmax(y_hat, axis=1)

print('Testing accuracy: %s' % accuracy_score(y_true_cls, y_pred_cls))
# average='weighted' must be an argument of f1_score. With the parenthesis one call too
# early it was passed to str.format (which ignores it), so the default F1 was reported.
print('Testing F1 score: {}'.format(f1_score(y_true_cls, y_pred_cls, average='weighted')))
print('Classification Report:')
print(classification_report(y_true_cls, y_pred_cls))




<h1 id="Model-achieves-85%-accuracy">Model achieves 85% accuracy<a class="anchor-link" href="#Model-achieves-85%-accuracy">¶</a></h1>


In [None]:

# Training vs. validation accuracy per epoch, taken from the fit history.
plt.figure(figsize=(8, 7))
for history_key in ('acc', 'val_acc'):
    plt.plot(hist.history[history_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()



In [None]:

# Training vs. validation loss per epoch, taken from the fit history.
plt.figure(figsize=(8,7))
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# The second curve is val_loss (the validation split), so it is labelled 'validation'
# rather than 'test', matching the accuracy plot's legend.
plt.legend(['train', 'validation'], loc='upper left')
plt.show()



In [None]:

from keras.models import load_model
# Rebind `model` to a network restored from disk.
# NOTE(review): when the notebook is run top to bottom this discards the model trained
# above, and the save cell that follows then writes this loaded model under the
# '...-85%.h5' name — confirm that is intended.
model = load_model('Word2Vec-Balanced-ExtrCleaned-84%.h5')



In [None]:

# Persist whatever `model` currently refers to (see the load_model cell just before this one).
model.save('Word2Vec-Balanced-ExtrCleaned-85%.h5')



In [None]:

# Load the Q2 check-in comments to be scored; header=4 takes the column names from the
# fifth row of the sheet, and the 'comments' column is converted with str.
df8 = pd.read_excel('Q2_CheckIn_750CharEnforced_Raw&Clean_Word&CharCounts v2.xlsx',header=4,converters={"comments":str},sheet_name='Sheet1')
# Rows with missing values are intentionally kept (the dropna below is disabled).
#df8.dropna(inplace=True)



In [None]:

# Preview only the first rows instead of rendering the whole frame into the notebook output.
df8.head()



In [None]:

# Score the Q2 comments with the current model.
# The text goes through the same clean_text -> lemmatize_word steps that were applied to the
# training comments before the tokenizer was fitted, so the tokens match its vocabulary.
# Missing comments become empty strings instead of breaking the string operations.
q2_comments = df8['comments'].fillna('').astype(str)
q2_tokens = q2_comments.apply(clean_text).apply(lemmatize_word)
seq = tokenizer.texts_to_sequences(q2_tokens)

# Pad / truncate at the end, exactly as the training matrix X was built; the default
# ('pre') would place the tokens at the opposite end of each sequence.
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
pred = model.predict(padded)

# NOTE(review): column order is assumed to match the one-hot label columns used for
# training — confirm.
labels = ["Effective","Ineffective"]
df_pred = pd.DataFrame(pred, columns=labels)

df_pred.head()



In [None]:

# Build the labelled result frame from the two score columns.
# .copy() makes df8 an independent frame, so adding columns neither warns about writing to
# a slice of df_pred nor depends on pandas' copy/view rules.
# NOTE(review): this rebinds df8, so the loaded Q2 comments are no longer reachable by that name.
df8 = df_pred[['Effective','Ineffective']].copy()
df8['max'] = df8[['Effective','Ineffective']].max(axis=1)

# Vectorised replacement for the two row-by-row loops. 'Ineffective' is listed first so it
# wins an exact tie, as it did when the second loop overwrote the first; rows matching
# neither condition (e.g. NaN scores) keep the empty label.
df8['quality'] = np.select(
    [df8['max'] == df8['Ineffective'], df8['max'] == df8['Effective']],
    ['Ineffective', 'Effective'],
    default='',
)



In [None]:

# Write the scored frame (score columns, 'max' and 'quality') to an Excel workbook.
df8.to_excel('eff-ineff_q2_750enforced.xlsx')



In [None]:

#df8['guided_total'] = df8[['q1', 'q2','q3']].apply(lambda x: ' '.join(x), axis=1)



In [None]:

# NOTE(review): df8 is not modified between the two exports in the visible cells, so this
# writes the same frame as 'eff-ineff_q2_750enforced.xlsx' under a second name — confirm.
df8.to_excel('q2_guided_analysis.xlsx')

