### Import necessary libraries

In [1]:
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import classification_report
import torch
from models import InferSent
import pandas as pd
from spellchecker import SpellChecker
import nltk
import re

In [2]:
# Shorthand -> expanded form, consumed by replace(), which looks tokens up one
# at a time. Written as a single literal so each pair is visible side by side
# (the parallel-list form hid duplicate keys such as 'md' and 'recd').
replacements = {
    'rel': 'released',
    'md': 'mailed',
    'reqd': 'required',
    'reld': 'released',
    # Was 'ofcourse', which is not a word and would be altered by the
    # spell-correction step that runs after replace().
    'ofc': 'of course',
    'immed': 'immediately',
    'fwdd': 'forwarded',
    'wk': 'week',
    'agn': 'again',
    'cust': 'customer',
    'eowk': 'date',
    'eow': 'date',
    'eod': 'time',
    'inv': 'invoice',
    'emd': 'mailed',
    'ck': 'check',
    'chk': 'check',
    'po': 'purchase order',
    # NOTE(review): replace() matches single tokens, so a key containing a
    # space presumably never matches; kept for backward compatibility.
    'p o': 'purchase order',
    'sched': 'scheduled',
    'pd': 'paid',
    'pmt': 'payment',
    'pymt': 'payment',
    'pymnt': 'payment',
    'amt': 'amount',
    'rcvd': 'received',
    'recieved': 'received',
    'rec': 'received',
    'recd': 'received',
}

def replace(text):
    """Expand known shorthand tokens in ``text``.

    Apostrophes are stripped first, the result is split with
    ``nltk.word_tokenize``, and every token that is a key of the module-level
    ``replacements`` mapping is swapped for its mapped value. The tokens are
    returned joined by single spaces.
    """
    without_apostrophes = re.sub(r'\'', '', text)
    expanded = []
    for token in nltk.word_tokenize(without_apostrophes):
        # Unknown tokens pass through unchanged.
        expanded.append(replacements.get(token, token))
    return ' '.join(expanded)

In [3]:
# Sanity check: the apostrophe is stripped, then the resulting token is expanded via `replacements`.
replace("rec'd")

'received'

In [4]:
import nltk
# Shared checker instance; correct_spell() reads it as a module-level global.
spell = SpellChecker()

def correct_spell(text):
    """Return ``text`` with tokens the spell checker does not know replaced.

    The text is split with ``nltk.word_tokenize``. Each token is passed to
    ``spell.unknown``; when the checker reports it as unknown, the token is
    replaced by ``spell.correction`` of the reported word. Tokens are
    re-joined with single spaces.

    ``spell.correction`` can return ``None`` when it has no suggestion (seen
    in newer pyspellchecker releases). In that case the original token is
    kept, because joining a ``None`` would raise ``TypeError``.
    """
    corrected = []
    for token in nltk.word_tokenize(text):
        fixed = token
        # unknown() returns a set that is empty for recognised tokens.
        for unknown_word in spell.unknown([token]):
            suggestion = spell.correction(unknown_word)
            if suggestion is not None:
                fixed = suggestion
        corrected.append(fixed)
    return ' '.join(corrected)
    

In [8]:
# End-to-end check of the two helpers: expand shorthand first, then spell-correct the result.
rep=replace("My ndme rec'd S")
print(rep)
print(correct_spell(rep))

My ndme received S
My name received S


In [9]:
def clean_text(text):
    """Normalize one raw sentence into lowercase, letters-only text.

    Steps, in order:

    1. collapse whitespace and lowercase;
    2. replace e-mail addresses with a MAIL placeholder;
    3. replace numeric dates, ordinals and day/month names with DATE;
    4. replace monetary and comma/decimal figures with AMOUNT;
    5. replace remaining digit runs (optionally ``#``-prefixed) with NUMBER;
    6. delete parentheses, turn every other non-letter into a space and
       lowercase again, so the placeholders end up as ``mail``, ``date``,
       ``amount`` and ``number``;
    7. expand shorthand with ``replace`` and fix typos with ``correct_spell``.

    Returns the cleaned string.
    """
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    # E-mail addresses are masked before any digit or day-name substitution:
    # those insert spaces into the address, after which this pattern (which
    # needs non-space characters around '@') could no longer match.
    text = re.sub(r'[^\s]+@[^\s]+\.[^\s]+', ' MAIL ', text)
    text = re.sub(r'(\d+/\d+/\d+)|(\d+\.\d+\.\d+)|(\d+\-\d+\-\d+)|(\d+\/\d+)|(\d+th)|(\d+nd)|(\d+rd)|(\d+st)', ' DATE ', text)
    text = re.sub(r'\b(mon|tue|wed|thurs|fri|sat|sun|monday|tuesday|wednesday|thursday|friday|saturday|sunday|jan|feb|mar|apr|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|june|july|august|september|october|november|december)\b', ' DATE ', text)
    # '$' amounts with an optional single space after the sign, optional
    # thousands group and optional decimals; then bare comma/decimal figures.
    # The previous pattern contained '\$\ d+', which matched a space followed
    # by literal 'd' characters, so "$ 1,000.50" was only partially masked.
    text = re.sub(r'(\$ ?\d+(,\d+)?(\.\d+)?)|(\d+,\d+\.\d+)|(\d+,\d+)|(\d+\.\d+)', ' AMOUNT ', text)
    # Every remaining digit run becomes NUMBER, so no digits survive this line
    # (a second AMOUNT pass that used to follow could never match).
    text = re.sub(r'(#\d+)|(# \d+)|(\d+)', ' NUMBER ', text)
    # Parentheses are deleted rather than spaced, which joins the neighbours.
    text = re.sub(r'(\()|(\))', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # Second lowercase pass folds the upper-case placeholders.
    text = text.lower()
    text = replace(text)
    text = correct_spell(text)
    return text

In [10]:
def preprocess(lines):
    """Apply ``clean_text`` to every item of ``lines``.

    Returns a new list holding the cleaned strings in input order.
    """
    return [clean_text(raw_line) for raw_line in lines]

In [11]:
# Load the labelled sentences (the 'sentence' and 'label' columns are used below).
# NOTE(review): absolute, machine-specific path; the summary file is later read from D:/ rather than C:/ — confirm both locations.
train = pd.read_csv("C:/NLP/Data/labelled_transcripts.csv")

In [12]:
# Inspect the column names of the labelled data.
train.columns

Index(['Unnamed: 0', 'sentence', 'label'], dtype='object')

In [13]:
# (rows, columns) of the labelled data.
train.shape

(864, 3)

In [14]:
# Sentences whose label is 0, selected with a single .loc lookup.
train.loc[train['label'] == 0, "sentence"]

0                             Hello, this is John Smith.
1                                    Hi, this is Amanda.
2                                           How are you?
3                                              I'm fine.
4                                           How are you?
5                                             I am good.
6           Actually I'm able to see transcript as well.
7                                           Okay, great.
8                                     Either timing out.
9      Like what time is over but I don't know at lea...
10                                                 Okay.
11                          Can you check that our dogs?
13       Just going to call and and make the call again.
14                Now you won't be able to see anything.
15                                                 Kesha
16     Hello, this is Faith here if Isaac David from ...
17                                    How are you doing?
18                             

In [15]:
# Load the summaries to be filtered and list the available columns.
# NOTE(review): absolute path on drive D:, whereas the training data is read from C: — confirm both locations.
summary = pd.read_csv("D:/NLP/Data/Test_summary_5cluster_2300data.csv")
summary.columns

Index(['Unnamed: 0', 'text', 'summary', 'Clustering Layer',
       'Supervised Layer'],
      dtype='object')

In [16]:
# Keep only the "Clustering Layer" column; each entry is one summary paragraph.
op_summary = summary["Clustering Layer"]

In [17]:
# Confirm the training columns and which summary column was selected.
train.columns, op_summary.name

(Index(['Unnamed: 0', 'sentence', 'label'], dtype='object'),
 'Clustering Layer')

In [18]:
# Peek at the first summary paragraph.
op_summary[0]

"Google it just wanted to kind of ask you regarding field invoices sound like to make payment for a long time it is give me an update on the same. Okay, I see give me a minute. I have 10 invoices pending for payment getting tell me which one was how you talking about. I'm actually talking about the legal services if there was an old invoices. We're open from 3 til reason. Let me check the different days out. So I have 14 voices in the month off.Tuba call back relations. We just need a payment today in the morning. So I think I should be able to expect you like to receive it in and maybe a couple of days. Have a nice day."

In [19]:
# Target vector: the 'label' column of the labelled data.
y = train['label']

In [20]:
# Class balance of the labels.
y.value_counts()

0    666
1    198
Name: label, dtype: int64

In [21]:
# InferSent sentence-encoder setup.
# NOTE(review): GLOVE_PATH is relative to the working directory while
# MODEL_PATH is absolute — confirm both resolve on the target machine.
GLOVE_PATH = 'glove.840B.300d.txt'
MODEL_PATH = '/NLP/InferSent/encoder/infersent1.pickle'
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0}
model = InferSent(params_model)
# torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; the model is never moved off the CPU in this notebook.
model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
model.set_w2v_path(GLOVE_PATH)

In [22]:
# Work on a copy of the sentence column rather than the column object itself.
sentences=train['sentence'].copy()

In [23]:
# First raw sentence, before cleaning.
sentences[0]

'Hello, this is John Smith.'

In [24]:
# Clean every training sentence (placeholder masking, shorthand expansion, spell correction).
clean_sentences = preprocess(sentences)

In [25]:
# preprocess() returns a plain list, not a pandas object.
type(clean_sentences)

list

In [26]:
# One cleaned sentence per input sentence.
len(clean_sentences)

864

In [27]:
# First sentence after cleaning, for comparison with the raw version.
clean_sentences[0]

'hello this is john smith'

In [28]:
# Register the words of the cleaned training sentences with the encoder before encoding.
model.build_vocab(clean_sentences, tokenize=True)

Found 779(/781) words with w2v vectors
Vocab size : 779


In [29]:
# Encode the cleaned sentences one at a time; each entry of `embeddings` is
# whatever model.encode returns for a single-sentence batch.
embeddings = [
    model.encode([cleaned_sentence], tokenize=True)
    for cleaned_sentence in clean_sentences
]

In [30]:
# Embedding vector of the first sentence (first row of its single-sentence batch).
embeddings[0][0]

array([ 0.07358228,  0.05932116, -0.01329149, ..., -0.03116866,
       -0.03814263,  0.10867259], dtype=float32)

In [31]:
# One row per sentence, one column per embedding dimension. The frame is built
# in a single call: growing it with DataFrame.append inside a loop copies the
# whole frame on every iteration, and that method no longer exists in pandas 2.x.
embedding_features = pd.DataFrame(
    [sentence_embedding[0] for sentence_embedding in embeddings]
)

In [32]:
# Persist the sentence embeddings to disk.
# NOTE(review): with pandas defaults the index is written as an extra leading column — confirm readers of this file expect that.
embedding_features.to_csv("C:/NLP/Data/embedding_features_transcripts.csv")

In [31]:
# NOTE(review): this replaces the in-memory features with the contents of Embeddings1.csv,
# which is not the file written by the preceding to_csv call (embedding_features_transcripts.csv),
# and its execution count precedes that cell's — confirm this cell belongs in a top-to-bottom run.
embedding_features=pd.read_csv('C:/NLP/Data/Embeddings1.csv')

In [33]:
# Inspect the feature column labels (one per embedding dimension).
embedding_features.columns

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            4086, 4087, 4088, 4089, 4090, 4091, 4092, 4093, 4094, 4095],
           dtype='int64', length=4096)

In [34]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split so both parts keep the label proportions. The seed is
# fixed so that the split, and every metric reported below, is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    embedding_features, y, test_size=0.2, stratify=y, random_state=42
)

In [35]:
# (rows, features) of the training split.
X_train.shape

(691, 4096)

In [36]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier with default settings on the training embeddings.
s_model = LogisticRegression()
s_model.fit(X_train, y_train)
# Keep only the first probability column for the held-out rows.
# NOTE(review): presumably P(label == 0), since the labels are 0/1 — verify against s_model.classes_.
test_probabilities = s_model.predict_proba(X_test)
proba = test_probabilities[:, 0]



In [37]:
import numpy as np
# Predict 0 where the first-column probability exceeds 0.5, otherwise 1
# (a probability of exactly 0.5 therefore maps to 1).
exceeds_half = proba > 0.5
y_pred = np.where(exceeds_half, 0, 1)

In [38]:
# How many test rows were predicted as each class.
pd.Series(y_pred).value_counts()

0    131
1     42
dtype: int64

In [39]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.92      0.92       133
           1       0.74      0.78      0.76        40

   micro avg       0.88      0.88      0.88       173
   macro avg       0.83      0.85      0.84       173
weighted avg       0.89      0.88      0.89       173



In [40]:
from sklearn.metrics import confusion_matrix
# Confusion matrix on the held-out split, called as (true labels, predicted labels).
print(confusion_matrix(y_test, y_pred))

[[122  11]
 [  9  31]]


# Testing

In [41]:
from textblob import TextBlob
# First summary paragraph, shown again as the reference input for the filtering below.
op_summary[0]

"Google it just wanted to kind of ask you regarding field invoices sound like to make payment for a long time it is give me an update on the same. Okay, I see give me a minute. I have 10 invoices pending for payment getting tell me which one was how you talking about. I'm actually talking about the legal services if there was an old invoices. We're open from 3 til reason. Let me check the different days out. So I have 14 voices in the month off.Tuba call back relations. We just need a payment today in the morning. So I think I should be able to expect you like to receive it in and maybe a couple of days. Have a nice day."

In [49]:
def _select_sentences(para, threshold=0.95):
    """Classify the sentences of one summary paragraph and keep those labelled 1.

    The paragraph is split into sentences with TextBlob, each sentence is
    cleaned with ``preprocess`` and encoded with ``model``, and ``s_model``
    scores the embeddings. A sentence is labelled 0 only when the first
    probability column exceeds ``threshold``; every other sentence is
    labelled 1 and kept.

    Parameters
    ----------
    para : the paragraph text. Anything that is not a non-blank string
        (for example a missing value read from the CSV) yields empty results
        instead of an exception.
    threshold : cut-off applied to the first probability column.

    Returns
    -------
    (kept_sentences, labels, probabilities) : the raw sentences labelled 1,
    the per-sentence label array and the per-sentence probability array.
    """
    no_result = ([], np.array([], dtype=int), np.array([], dtype=float))
    if not isinstance(para, str) or not para.strip():
        return no_result

    para_sentences = [item.raw for item in TextBlob(para).sentences]
    if not para_sentences:
        return no_result

    cleaned = preprocess(para_sentences)
    # One row per sentence, built in a single DataFrame call rather than by
    # appending to a frame inside a loop.
    features = pd.DataFrame(
        [model.encode([sentence], tokenize=True)[0] for sentence in cleaned]
    )
    print(features.shape)

    probabilities = s_model.predict_proba(features)[:, 0]
    labels = np.where(probabilities > threshold, 0, 1)
    print(labels)
    print(probabilities)

    kept = []
    for sentence, label in zip(para_sentences, labels):
        print(label)
        if label == 1:
            kept.append(sentence)
    return kept, labels, probabilities


# Per-paragraph results, index-aligned with op_summary. The work happens in a
# function so its temporaries do not overwrite the training-section variables
# (sentences, clean_sentences, embeddings, embedding_features, proba, y_pred).
cleaned_summaries = []
y_pred_list = []
y_prob_list = []
for para in op_summary:
    summ, para_labels, para_probabilities = _select_sentences(para)
    cleaned_summaries.append(summ)
    y_pred_list.append(para_labels)
    y_prob_list.append(para_probabilities)

(10, 4096)
[1 0 1 1 1 1 1 1 1 0]
[0.46235407 0.95919385 0.12660614 0.6524021  0.7012074  0.83115668
 0.06495345 0.285025   0.3437336  0.97697969]
1
0
1
1
1
1
1
1
1
0
(6, 4096)
[0 1 0 1 0 1]
[0.98468111 0.65739082 0.98795823 0.65156582 0.98524245 0.04170872]
0
1
0
1
0
1
(10, 4096)
[1 1 0 1 1 1 1 1 1 0]
[0.06016042 0.75360741 0.98838518 0.94588685 0.02180775 0.92645726
 0.11483505 0.07092553 0.02234676 0.97697969]
1
1
0
1
1
1
1
1
1
0
(13, 4096)
[1 1 1 1 1 1 1 1 1 1 1 1 1]
[0.01806722 0.5495103  0.05640715 0.66280786 0.51488472 0.70946572
 0.63529669 0.42342071 0.42012622 0.396481   0.02683006 0.80922385
 0.25661954]
1
1
1
1
1
1
1
1
1
1
1
1
1
(9, 4096)
[1 1 1 1 1 1 1 1 0]
[0.02826041 0.18087699 0.37022611 0.69368866 0.04821991 0.80981912
 0.77855417 0.39789763 0.98569878]
1
1
1
1
1
1
1
1
0
(15, 4096)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[0.01979671 0.17328189 0.03289811 0.52932827 0.60248961 0.28592634
 0.01236165 0.93729825 0.12876548 0.15301161 0.05448464 0.10224862
 0.10233614 0.06830175 0.

In [61]:
# Sentences kept (label 1) from the first summary.
print(cleaned_summaries[0])

['Google it just wanted to kind of ask you regarding field invoices sound like to make payment for a long time it is give me an update on the same.', 'I have 10 invoices pending for payment getting tell me which one was how you talking about.', "I'm actually talking about the legal services if there was an old invoices.", "We're open from 3 til reason.", 'Let me check the different days out.', 'So I have 14 voices in the month off.Tuba call back relations.', 'We just need a payment today in the morning.', 'So I think I should be able to expect you like to receive it in and maybe a couple of days.']


In [62]:
# Original first summary, for comparison with the filtered sentences.
print(op_summary[0])

Google it just wanted to kind of ask you regarding field invoices sound like to make payment for a long time it is give me an update on the same. Okay, I see give me a minute. I have 10 invoices pending for payment getting tell me which one was how you talking about. I'm actually talking about the legal services if there was an old invoices. We're open from 3 til reason. Let me check the different days out. So I have 14 voices in the month off.Tuba call back relations. We just need a payment today in the morning. So I think I should be able to expect you like to receive it in and maybe a couple of days. Have a nice day.
