<a href="https://colab.research.google.com/github/sharyudeshmukh82/Workplace-Behavior-Analysis-using-Text-Analysis/blob/main/Copy_of_MultiLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tflearn
!pip install preprocessor



In [None]:
import pickle
import string
import tflearn
import json
import os
import numpy as np
import pandas as pd
import preprocessor as p
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.contrib import learn
from tflearn.data_utils import to_categorical, pad_sequences
from keras.models import model_from_json,Sequential
from keras.layers import Embedding,Dropout,LSTM,Bidirectional,Dense
from keras.utils import np_utils
os.environ['KERAS_BACKEND']='tensorflow'

In [None]:
def evaluate_model(model, testX, testY):
    """Report per-class precision, recall and F1 for ``model`` on a test set.

    Both the model output and ``testY`` are reduced with ``argmax`` along
    axis 1, so ``testY`` is expected to hold one row of class scores per
    sample (the callers in this notebook pass ``to_categorical`` output).

    Prints the three per-class score arrays, the confusion matrix and the
    sklearn classification report.

    Returns:
        Tuple ``(precision, recall, f1_score)``; each element is the
        per-class result of the matching sklearn scorer (``average=None``).
    """
    class_scores = model.predict(testX)
    predicted = np.argmax(class_scores, 1)
    expected = np.argmax(testY, 1)

    # (label printed in front of the values, sklearn scorer) in report order.
    scorers = (
        ("Precision: ", metrics.precision_score),
        ("Recall: ", metrics.recall_score),
        ("f1_score: ", metrics.f1_score),
    )
    per_class = [scorer(expected, predicted, average=None) for _, scorer in scorers]

    for (label, _), values in zip(scorers, per_class):
        print(label + str(values) + "\n")

    print(confusion_matrix(expected, predicted))
    print(":: Classification Report")
    print(classification_report(expected, predicted))
    return tuple(per_class)

In [None]:
def save_model(data,model_type,model,embed_size):
    """Write a trained model to disk as architecture JSON plus HDF5 weights.

    Creates two files in the current working directory:

    * ``multi.json`` - the architecture returned by ``model.to_json()``
    * ``multi.h5``   - the weights written by ``model.save_weights()``

    Args:
        data: Unused; kept so existing callers keep working.
        model_type: Unused; kept so existing callers keep working.
        model: Object exposing ``to_json()`` and ``save_weights(path)``.
        embed_size: Unused; kept so existing callers keep working.
    """
    weight_file_name = "multi.h5"
    model_file_name = "multi.json"

    # The architecture is the only thing stored in the JSON file. A previous
    # full-model save to this same path was dropped: its output was
    # overwritten by the write below, so it only cost time.
    with open(model_file_name, "w") as json_file:
        json_file.write(model.to_json())

    # serialize weights to HDF5
    model.save_weights(weight_file_name)
    print("Saved model to disk")


In [None]:
def lstm(inp_dim,vocab_size, embed_size, num_classes, learn_rate):
    """Build and compile the embedding -> LSTM -> softmax classifier.

    Args:
        inp_dim: Length of each input sequence (``input_length`` of the
            embedding layer).
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding width; also used as the number of LSTM units.
        num_classes: Width of the final softmax layer.
        learn_rate: Accepted but not used - the model is compiled with the
            string optimizer ``'adam'``, so this value has no effect.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        Embedding(vocab_size, embed_size, input_length=inp_dim, trainable=True),
        Dropout(0.2),
        LSTM(embed_size),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
def train(data_dict, embed_size,data,dump_embeddings=False):
    """Train the LSTM on the prepared split, save it, and evaluate it.

    Reads the module-level settings ``LEARN_RATE``, ``EPOCHS``,
    ``BATCH_SIZE`` and ``model_type``.

    Args:
        data_dict: Mapping with the keys ``"data"``, ``"trainX"``,
            ``"trainY"``, ``"testX"``, ``"testY"`` and ``"vocab_processor"``
            (the structure returned by ``get_train_test``).
        embed_size: Embedding width handed to ``lstm``.
        data: Overridden by ``data_dict["data"]``; only forwarded to
            ``save_model``.
        dump_embeddings: Unused; kept for backward compatibility.

    Returns:
        The ``(precision, recall, f1_score)`` tuple from ``evaluate_model``
        computed on the test split.
    """
    data = data_dict["data"]
    trainX, trainY = data_dict["trainX"], data_dict["trainY"]
    testX, testY = data_dict["testX"], data_dict["testY"]
    vocab_processor = data_dict["vocab_processor"]

    vocab_size = len(vocab_processor.vocabulary_)
    print("Vocabulary Size: {:d}".format(vocab_size))
    vocab = vocab_processor.vocabulary_._mapping
    print(vocab)
    print("TrainX shape",trainX.shape[1])
    print("Running Model: " + model_type )

    # Size the output layer from the label matrix instead of a hard-coded 8,
    # so the network always matches the targets it is fitted against.
    num_classes = trainY.shape[1]
    model = lstm(trainX.shape[1], vocab_size, embed_size, num_classes, LEARN_RATE)
    # summary() prints by itself and returns None; wrapping it in print()
    # would add a stray "None" line to the output.
    model.summary()

    model.fit(trainX, trainY, epochs=EPOCHS, shuffle=True, batch_size=BATCH_SIZE,
                  verbose=1)
    save_model(data,model_type,model,embed_size)

    return  evaluate_model(model, testX, testY)

In [None]:
def get_train_test(data, x_text, labels):
    """Split the corpus 70/30 and turn it into padded id sequences and one-hot labels.

    Reads the module-level ``MAX_FEATURES`` setting.

    Args:
        data: Opaque value stored unchanged under ``"data"`` in the result.
        x_text: Sequence of message strings.
        labels: Sequence of class ids convertible to ``int32``.

    Returns:
        Dict with the keys ``"data"``, ``"trainX"``, ``"trainY"``,
        ``"testX"``, ``"testY"`` and ``"vocab_processor"``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        x_text, labels, random_state=42, test_size=0.30)
    print("Length",len(X_train))

    post_length = np.array([len(x.split(" ")) for x in x_text])
    print("Post Length",post_length)

    # Use the 95th percentile of the word counts as the fixed input length.
    max_document_length = int(np.percentile(post_length,95))
    print("Document length : " + str(max_document_length))

    # Map words to integer ids; max_document_length is the sequence length.
    # NOTE(review): MAX_FEATURES is passed as the second positional argument,
    # which in tf.contrib.learn appears to be min_frequency - confirm intent.
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length, MAX_FEATURES)
    # The vocabulary is built from the whole corpus (train and test).
    vocab_processor = vocab_processor.fit(x_text)

    trainX = np.array(list(vocab_processor.transform(X_train)))
    testX = np.array(list(vocab_processor.transform(X_test)))
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)

    # One-hot encode both splits with the SAME width, taken from the full
    # label set. Letting to_categorical infer the width per split would give
    # mismatched shapes whenever one split lacks the highest class id.
    num_classes = int(np.max(np.asarray(labels).astype('int32'))) + 1
    trainY = np_utils.to_categorical(
        np.asarray(Y_train).astype('int32'), num_classes=num_classes)
    testY = np_utils.to_categorical(
        np.asarray(Y_test).astype('int32'), num_classes=num_classes)
    print("Shape",trainY.shape)

    data_dict = {
        "data": data,
        "trainX" : trainX,
        "trainY" : trainY,
        "testX" : testX,
        "testY" : testY,
        "vocab_processor" : vocab_processor
    }

    return data_dict

In [None]:
def load_data(filename):
    """Read a comma-separated file and return its messages and labels.

    Args:
        filename: Path of a CSV file with ``Message`` and ``Label`` columns.

    Returns:
        Tuple ``(x_text, labels)`` of NumPy arrays built from the
        ``Message`` and ``Label`` columns respectively.
    """
    print("Loading data from file: " + filename)
    frame = pd.read_csv(filename, sep=',')
    messages, targets = (np.array(frame[column]) for column in ('Message', 'Label'))
    return messages, targets

In [None]:
def get_data(data):
    """Load the labelled dataset and strip punctuation from every message.

    Prints the number of messages and the per-label counts.

    Args:
        data: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(filter_data, labels)``: a list of messages with every
        ``string.punctuation`` character removed, and the labels exactly as
        returned by ``load_data``.
    """
    x_text, labels = load_data('Class Labelled dataset.csv.txt')
    print(len(x_text))
    print(Counter(labels))

    # remove punctuations
    cleaned = [
        "".join(ch for ch in message if ch not in string.punctuation)
        for message in x_text
    ]
    return cleaned, labels

In [None]:
def run_model(data, embed_size):
    """Run the full pipeline: load data, split it, train, and print scores.

    Args:
        data: Value forwarded to ``get_data``, ``get_train_test`` and ``train``.
        embed_size: Embedding width forwarded to ``train``.

    Prints the per-class precision and recall returned by ``train``; the F1
    scores are returned by ``train`` as well but are not printed here.
    """
    texts, targets = get_data(data)
    splits = get_train_test(data, texts, targets)
    precision, recall, _ = train(splits, embed_size, data)
    for caption, values in (("Precision", precision), ("Recall", recall)):
        print(caption, values)


In [None]:
def predict():
    """Load the saved model, evaluate it, then classify lines typed by the user.

    Steps:
      1. Rebuild the model from ``multi.json`` and load ``multi.h5`` weights.
      2. Recreate the train/test split and print the test-set evaluation.
      3. Read lines from ``input()``; for each non-empty line print the class
         scores and the name of the highest-scoring class. An empty line
         ends the session.
    """
    model_path = "multi.json"
    weights_path = "multi.h5"
    data = "multi"
    # Class id -> human readable name.
    class_dict = {
        0: "Approach",
        1: "Activities",
        2: "Personal Information",
        3: "Compliment",
        4: "Relationship",
        5: "Reframing",
        6: "Communicative Desensitization",
        7: "Isolation",
    }

    # load the model; the context manager closes the file even if parsing fails
    with open(model_path, 'r') as json_file:
        loaded_model = model_from_json(json_file.read())

    # load weights into new model
    loaded_model.load_weights(weights_path)
    print("Loaded model from disk")

    x_text, labels = get_data(data)
    data_dict = get_train_test(data,  x_text, labels)
    evaluate_model(loaded_model, data_dict['testX'], data_dict['testY'])

    # Reuse the processor get_train_test already fitted on x_text with the
    # same document length, rather than fitting an identical one again.
    vocab_processor = data_dict["vocab_processor"]

    while True:
        inp = input()
        # An empty line is the stop signal and must not itself be classified.
        if inp == "":
            break
        text = "".join(c for c in inp if c not in string.punctuation)
        text_vector = np.array(list(vocab_processor.transform([text])))
        ans = loaded_model.predict(text_vector)
        prediction = list(ans[0])
        print(prediction)
        # argmax returns the first maximum, matching list.index(max(...)).
        index = int(np.argmax(ans[0]))
        print("Class",class_dict[index])

In [None]:
# Entry-point cell: module-level settings read by the functions above,
# followed by a prompt that selects training or prediction.

# Passed to model.fit() in train().
EPOCHS = 15
BATCH_SIZE = 32

# NUM_CLASSES only appears in `global` statements and is never assigned
# elsewhere; DROPOUT is not referenced by any visible cell.
NUM_CLASSES = None
DROPOUT = 0.25
# Forwarded to lstm() by train(); lstm() compiles with optimizer='adam' and
# does not read it.
LEARN_RATE = 0.01
# Passed as the second positional argument when the VocabularyProcessor is built.
# NOTE(review): that argument appears to be min_frequency, not a feature cap - confirm.
MAX_FEATURES =8



# `data` is a label passed through the pipeline; `model_type` is printed by
# train() and forwarded to save_model(); `embed_size` sets the embedding
# width and the LSTM unit count in lstm().
data = "multi"
model_type = "lstm"
embed_size = 128

# Type "train" to train and save a new model; any other input loads the
# saved model and starts the interactive prediction loop.
n = input()
if n == "train":
    run_model(data, embed_size)
else:
    predict()

predict
Loaded model from disk
Loading data from file: Class Labelled dataset.csv.txt
65759
Counter({6: 24256, 3: 13735, 7: 11018, 0: 7024, 2: 6685, 5: 2052, 1: 544, 4: 445})
Length 46031
Post Length [ 6 10  4 ...  7  5 11]
Document length : 24
Shape (46031, 8)
Precision: [0.80907604 0.78651685 0.83879493 0.83470885 0.80508475 0.82743363
 0.88488613 0.90708402]

Recall: [0.93614002 0.90909091 0.79909366 0.87418968 0.71969697 0.64930556
 0.87522184 0.84035409]

f1_score: [0.86798246 0.84337349 0.81846313 0.8539932  0.76       0.72762646
 0.88002745 0.87244494]

[[1979    0   15   34    1   10   50   25]
 [   1  140    1    5    0    0    3    4]
 [  48    4 1587   93    4    3  178   69]
 [ 128    6   33 3641    2   23  303   29]
 [   3    1    6    7   95    1    4   15]
 [  10    0   21   26    0  374  110   35]
 [ 160   12  128  463   13   33 6411  105]
 [ 117   15  101   93    3    8  186 2753]]
:: Classification Report
              precision    recall  f1-score   support

        