# Libraries

In [1]:
import os
import re
import string
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

import seaborn as sns
import nltk 
nltk.download('punkt')
nltk.download('stopwords')
import string

from nltk.corpus import stopwords

from tqdm.notebook import tqdm

[nltk_data] Downloading package punkt to /Users/kagenlim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kagenlim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Overview

Types: 
- Abstractive: Generative Models; generally involves deep learning models. BERT Models can contribute to this. 
- Extractive: Extracting the 'most important sentences' from a document. 

We tried an Extractive Model, using Sentence Embeddings, based on pre-Trained GloVe vectors. 

Source: https://appliedmachinelearning.blog/2019/12/31/extractive-text-summarization-using-glove-vectors/ 

Paper: https://nlp.stanford.edu/pubs/glove.pdf 

# Testing Extractive Model: GloVe Embeddings

## Word Embeddings

In [2]:
import os
import re
import string
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

import seaborn as sns
import gc
import nltk 
nltk.download('punkt')
nltk.download('stopwords')
import sklearn 
import collections
import sys
import itertools
import string

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from tqdm.notebook import tqdm

[nltk_data] Downloading package punkt to /Users/kagenlim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kagenlim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def loadGloveModel(gloveFile):
    word_embeddings = {}
    f = open(gloveFile, encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs
    f.close()
    return word_embeddings

gloveFile = '/Users/kagenlim/Documents/data_files/glove.42B.300d.txt'
dim = 300

In [4]:
word_embeddings = loadGloveModel(gloveFile)
print("Vocab Size = ",len(word_embeddings))

Vocab Size =  1917494


## Loading Data in

In [5]:
df_narr = pd.read_csv("/Users/kagenlim/Documents/data_files/narrs_cleaned_sent_tokenized.csv")

## Test

In [8]:
df_narr['narrs_cleaned_sent_tokenized'][657718]

"['made payment citi credit card use citi prepaid debit card issu us furnaceair condit install', 'payment made phone repres citi simplic given conform number complet call', 'never say payment post account call citi back said record payment made', 'spoke sever depart get answer never took place', 'file disput citi also call citi rebat card compani immedi confirm payment made receiv citi credit card date', 'email us proof transact sent citi go forward dispute', 'receiv email show day later card balanc', 'payment remain balanc card thought disput resolved', 'yesterday receiv email state disput deni', 'put back onto balanc owe citi', 'call yesterday reopen disput refax proof along letter explan disput disput fax number citi also mail copi presidenti offic south dakota', 'speak furnac instal citi help even numer attempt resolv matter contact help case', 'sure els payment made receiv rebat card citi', 'balanc money received', 'pleas help us']"

In [9]:
df_narr['Consumer complaint narrative'][657718]

"['I made a payment to CITI XXXX Credit Card on XXXX XXXX, 2016 using a CITI prepaid Debit Card issued to us from a furnace/air conditioning install.', 'The payment was made over the phone with a representative from CITI Simplicity and I was given a conformation number to complete the call.', 'I never say the payment post to my account so I called CITI back and they said there was no record of a payment being made on the XXXX of XXXX.', 'I then spoke to several departments only to get the same answer that this never took place.', 'I then filed a dispute with CITI XXXX XXXX and also called the CITI rebate card company and they immediately confirmed that the payment had been made and received on XXXX XXXX by CITI Credit Card on that date.', 'We then had them email us proof of this transaction and sent it to CITI XXXX to go forward with the dispute.', 'I received an email showing a few days later that my card balance was at XXXX {$.00} as this payment was for the remaining balance on the 

In [None]:
test = df_narr['Consumer complaint narrative'][657718].strip('][').split(', ') 


In [None]:
test

["'I made a payment to CITI XXXX Credit Card on XXXX XXXX",
 "2016 using a CITI prepaid Debit Card issued to us from a furnace/air conditioning install.'",
 "'The payment was made over the phone with a representative from CITI Simplicity and I was given a conformation number to complete the call.'",
 "'I never say the payment post to my account so I called CITI back and they said there was no record of a payment being made on the XXXX of XXXX.'",
 "'I then spoke to several departments only to get the same answer that this never took place.'",
 "'I then filed a dispute with CITI XXXX XXXX and also called the CITI rebate card company and they immediately confirmed that the payment had been made and received on XXXX XXXX by CITI Credit Card on that date.'",
 "'We then had them email us proof of this transaction and sent it to CITI XXXX to go forward with the dispute.'",
 "'I received an email showing a few days later that my card balance was at XXXX {$.00} as this payment was for the rema

In [None]:
for sentence in range(len(test)):
      cleaned_sentences = [clean(sentence) for sentence in test]

In [None]:
cleaned_sentences

['i made payment citi  credit card  ',
 'using citi prepaid debit card issued us furnaceair conditioning install',
 'the payment made phone representative citi simplicity given conformation number complete call',
 'i never say payment post account called citi back said record payment made  ',
 'i spoke several departments get answer never took place',
 'i filed dispute citi   also called citi rebate card company immediately confirmed payment made received   citi credit card date',
 'we email us proof transaction sent citi  go forward dispute',
 'i received email showing days later card balance    payment remaining balance card thought dispute resolved',
 'then yesterday   received email stating dispute denied    put back onto balance owed citi  ',
 'we called yesterday   reopen dispute refaxed proof along letter explanation dispute dispute fax number citi  also mailed copy presidential office south dakota',
 'then speaking furnace installer citi  helping even numerous attempts resolve 

In [None]:
sentence_vectors = []
for i in cleaned_sentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((dim,))) for w in i.split()])/(len(i.split())+0.001)
    else:
        v = np.zeros((dim,))
    sentence_vectors.append(v)

In [None]:
sim_mat = np.zeros([len(test), len(test)])
for i in range(len(cleaned_sentences)):
    for j in range(len(cleaned_sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,dim),sentence_vectors[j].reshape(1,dim))[0,0]
sim_mat = np.round(sim_mat,3)
print(sim_mat)

[[0.    0.887 0.853 0.907 0.777 0.957 0.852 0.903 0.853 0.817 0.779 0.868
  0.904 0.701]
 [0.887 0.    0.827 0.804 0.708 0.885 0.811 0.836 0.792 0.788 0.78  0.752
  0.86  0.708]
 [0.853 0.827 0.    0.896 0.839 0.885 0.883 0.893 0.865 0.883 0.854 0.854
  0.821 0.789]
 [0.907 0.804 0.896 0.    0.904 0.927 0.918 0.923 0.931 0.892 0.863 0.943
  0.829 0.783]
 [0.777 0.708 0.839 0.904 0.    0.815 0.862 0.865 0.881 0.868 0.889 0.908
  0.718 0.792]
 [0.957 0.885 0.885 0.927 0.815 0.    0.894 0.926 0.907 0.89  0.826 0.86
  0.918 0.721]
 [0.852 0.811 0.883 0.918 0.862 0.894 0.    0.903 0.921 0.912 0.874 0.878
  0.801 0.849]
 [0.903 0.836 0.893 0.923 0.865 0.926 0.903 0.    0.931 0.898 0.855 0.9
  0.879 0.758]
 [0.853 0.792 0.865 0.931 0.881 0.907 0.921 0.931 0.    0.912 0.867 0.879
  0.846 0.765]
 [0.817 0.788 0.883 0.892 0.868 0.89  0.912 0.898 0.912 0.    0.866 0.841
  0.776 0.77 ]
 [0.779 0.78  0.854 0.863 0.889 0.826 0.874 0.855 0.867 0.866 0.    0.857
  0.728 0.837]
 [0.868 0.752 0.854 0.94

In [None]:
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)
print(scores)

{0: 0.07172409632177608, 1: 0.06827189302728183, 2: 0.07219690377346338, 3: 0.07429960383148114, 4: 0.07042954981629197, 5: 0.07369301940708346, 6: 0.07340256788163618, 7: 0.07402184142871525, 8: 0.07335074279805234, 9: 0.07202935096168626, 10: 0.07071139055047017, 11: 0.07193535977600128, 12: 0.06859241393839, 13: 0.06534126648767077}


In [None]:
#Original

test

["'I made a payment to CITI XXXX Credit Card on XXXX XXXX",
 "2016 using a CITI prepaid Debit Card issued to us from a furnace/air conditioning install.'",
 "'The payment was made over the phone with a representative from CITI Simplicity and I was given a conformation number to complete the call.'",
 "'I never say the payment post to my account so I called CITI back and they said there was no record of a payment being made on the XXXX of XXXX.'",
 "'I then spoke to several departments only to get the same answer that this never took place.'",
 "'I then filed a dispute with CITI XXXX XXXX and also called the CITI rebate card company and they immediately confirmed that the payment had been made and received on XXXX XXXX by CITI Credit Card on that date.'",
 "'We then had them email us proof of this transaction and sent it to CITI XXXX to go forward with the dispute.'",
 "'I received an email showing a few days later that my card balance was at XXXX {$.00} as this payment was for the rema

In [None]:
#Summary -- these are the original sentences

ranked_sentences = sorted(((scores[i],i) for i,s in enumerate(test)), reverse=True)
arranged_sentences = sorted(ranked_sentences[0:int(len(test)*0.5)], key=lambda x:x[1])
print("\n".join([test[x[1]] for x in arranged_sentences]))

'The payment was made over the phone with a representative from CITI Simplicity and I was given a conformation number to complete the call.'
'I never say the payment post to my account so I called CITI back and they said there was no record of a payment being made on the XXXX of XXXX.'
'I then filed a dispute with CITI XXXX XXXX and also called the CITI rebate card company and they immediately confirmed that the payment had been made and received on XXXX XXXX by CITI Credit Card on that date.'
'We then had them email us proof of this transaction and sent it to CITI XXXX to go forward with the dispute.'
'I received an email showing a few days later that my card balance was at XXXX {$.00} as this payment was for the remaining balance on the card and thought that the dispute had been resolved.'
'Then yesterday on XXXX XXXX I received the email stating the dispute had been denied and the {$370.00} was put back onto my balance owed on the CITI XXXX XXXX.'
'We Then called yesterday the XXXX 

In [None]:
def getting_extract_summaries(complaint):
  for sentence in range(len(complaint)):
      cleaned_sentences = [clean(sentence) for sentence in complaint] 
  sentence_vectors = []
  for i in cleaned_sentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((dim,))) for w in i.split()])/(len(i.split())+0.001)
    else:
        v = np.zeros((dim,))
    sentence_vectors.append(v)
  sim_mat = np.zeros([len(complaint), len(complaint)])
  for i in range(len(cleaned_sentences)):
    for j in range(len(cleaned_sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,dim),sentence_vectors[j].reshape(1,dim))[0,0]
  nx_graph = nx.from_numpy_array(sim_mat)
  scores = nx.pagerank(nx_graph)
  ranked_sentences = sorted(((scores[i],i) for i,s in enumerate(complaint)), reverse=True)
  arranged_sentences = sorted(ranked_sentences[0:int(len(complaint)*0.5)], key=lambda x:x[1])
  summary = " ".join([complaint[x[1]] for x in arranged_sentences])

  return summary

In [None]:
#Original

original = df_narr['Consumer complaint narrative'][20000]
original

'[\'Company responded stating that the prepayment fee structure they imposed on me " is standard for a 5 year business loan \\\'\\\'.\', \'This statement is not true as I have the paperwork from when we took the first note with Associated bank.\', \'Initially we were promised a 5 year note on the first note back in XXXX.\', \'On that note it says that the pre-payment penalty is equal to 2 % of the outstanding principal amount.\', \'As a consumer we do not know their " practices \\\'\\\'.\', \'If I did not have the paper from XXXX I wold have not known that they are misleading me now on their response from XX/XX/XXXX XXXX\']'

In [None]:
len(original)

610

In [None]:
#Summary -- these are the original sentences

summary = getting_extract_summaries(df_narr['Consumer complaint narrative'][20000])
summary

'\'Company responded stating that the prepayment fee structure they imposed on me " is standard for a 5 year business loan \\\'\\\'.\' \'This statement is not true as I have the paperwork from when we took the first note with Associated bank.\' \'Initially we were promised a 5 year note on the first note back in XXXX.\''

In [None]:
#Original

original = df_narr['Consumer complaint narrative'][15000]
original

"['I am current on my payments so far.', 'The problem is I am late on the month of XX/XX/XXXX and the month of XX/XX/XXXX the XX/XX/XXXX payment has already been paid and yes maybe the other payment I have paid them a bit late but I am current.', 'I am experiencing a fraud situation with the bank that I currently make the payments with.', 'There was fraudulent activity going on and as I have disputed the transaction and open a claim complaint with the bank.', 'They said it would take up to 90 days to solve this matter and because it was more than 2000 dollar they need to further investigate.', 'My account was hacked as some hacker took my debit card numbers and hack them and used them in the state of Ohio and California I live in Illinois and have not shopped in those states.', 'I have informed me.cooper about my situation and they have denied to help me by not reporting it to the credit bureau as late payment and also to wave the late fees for XX/XX/XXXX and XX/XX/XXXX but they refuse

In [None]:
#Summary -- these are the original sentences

summary = getting_extract_summaries(df_narr['Consumer complaint narrative'][15000])
summary

"'The problem is I am late on the month of XX/XX/XXXX and the month of XX/XX/XXXX the XX/XX/XXXX payment has already been paid and yes maybe the other payment I have paid them a bit late but I am current.' 'I have informed me.cooper about my situation and they have denied to help me by not reporting it to the credit bureau as late payment and also to wave the late fees for XX/XX/XXXX and XX/XX/XXXX but they refused to as I explained the reasons of why I need there help its just the late fees.' 'They have already issue a new late fee this month but I sent them a pdf letter explaining the reasons of what was happening with the bank account and they still refused to not wave the late fees.' 'I am very happy that they have gave me all this much help but I need help now too this late fees were not my fault as I intend to pay of time before the XX/XX/XXXX or XX/XX/XXXX of echa month to avoid the late fee charge.'"

In [None]:
#Original

original = df_narr['Consumer complaint narrative'][20057]
original

'[\'I sent 4 dispute letters to all 3 credit bureaus ( XXXX, XXXX, and TransUnion ).\', "In the letter, I asked them to remove or verify 6 charge off \'s that I do not believe are mine.", \'Every time I got a response, they came back as verified.\', "How did they verify these charge off \'s belong to me and with who?", "They provided no proof ( original signed contracts ) that these charge off \'s accounts belong to me.", "They would only send me a copy of my credit report with the same charge off \'s still remaining.", \'This is a well-known tactic they use to keep people subprime.\', \'They violated my rights under the Fair Credit Reporting Act ( FCRA ) many times and I am fed up.\', \'I do understand that they sell my information to the banks.\', \'And that the banks make more money off me because I have a low credit score.\', \'And that way the bank can charge me a high-interest rate.\', \'Thus them making more money and me losing more money.\', "The charge off \'s on my account ar

In [None]:
#Summary -- these are the original sentences

summary = getting_extract_summaries(df_narr['Consumer complaint narrative'][15000])
summary

"'The problem is I am late on the month of XX/XX/XXXX and the month of XX/XX/XXXX the XX/XX/XXXX payment has already been paid and yes maybe the other payment I have paid them a bit late but I am current.' 'I have informed me.cooper about my situation and they have denied to help me by not reporting it to the credit bureau as late payment and also to wave the late fees for XX/XX/XXXX and XX/XX/XXXX but they refused to as I explained the reasons of why I need there help its just the late fees.' 'They have already issue a new late fee this month but I sent them a pdf letter explaining the reasons of what was happening with the bank account and they still refused to not wave the late fees.' 'I am very happy that they have gave me all this much help but I need help now too this late fees were not my fault as I intend to pay of time before the XX/XX/XXXX or XX/XX/XXXX of echa month to avoid the late fee charge.'"

In [None]:
#Original

original = df_narr['Consumer complaint narrative'][30015]
original

'[\'Over the past week, I have received 4 phone calls from Transworld Systems Incorporated ( TSI ).\', "When answering the phone, they have asked for my wife \'s name but do not identify themselves until asked, the reason that they are calling, or leave a voicemail.", \'Upon looking up their information on XXXX, they are a debt collection agency based out of XXXX XXXX, California.\', \'In addition, we have no debts that are owed.\', \'They have used phone numbers ( XXXX ) XXXX & ( XXXX ) XXXX.\']'

In [None]:
#Summary -- these are the original sentences

summary = getting_extract_summaries(df_narr['Consumer complaint narrative'][30015])
summary

'I have received 4 phone calls from Transworld Systems Incorporated ( TSI ).\' "When answering the phone they have asked for my wife \'s name but do not identify themselves until asked the reason that they are calling \'Upon looking up their information on XXXX \'They have used phone numbers ( XXXX ) XXXX & ( XXXX ) XXXX.\''

# Getting all

In [6]:
df = pd.DataFrame()

In [7]:
df['original'] = df_narr['Consumer complaint narrative']

df['cleaned'] = df_narr['narrs_cleaned_sent_tokenized']

In [8]:
df.head()

Unnamed: 0,original,cleaned
0,['transworld systems inc. is trying to collect...,['transworld system inc tri collect debt mine ...
1,"['Over the past 2 weeks, I have been receiving...",['past weeks receiv excess amount telephon cal...
2,['Pioneer has committed several federal violat...,['pioneer commit sever feder violat me privat ...
3,"['Previously, on XX/XX/XXXX, XX/XX/XXXX, and X...",['previously request experian send copi verifi...
4,['Hello This complaint is against the three cr...,['hello complaint three credit report companie...


In [9]:
df.original[0]

"['transworld systems inc. is trying to collect a debt that is not mine, not owed and is inaccurate.']"

In [10]:
df.cleaned[0]

"['transworld system inc tri collect debt mine owe inaccurate']"

In [11]:
def getting_extract_summaries(complaint_index=1):
  import ast
  cleaned_sentences = ast.literal_eval(df.cleaned[complaint_index])
  cleaned_original = ast.literal_eval(df.original[complaint_index])
  sentence_vectors = []
  for i in cleaned_sentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((dim,))) for w in i.split()])/(len(i.split())+0.001)
    else:
        v = np.zeros((dim,))
    sentence_vectors.append(v)
  sim_mat = np.zeros([len(cleaned_original), len(cleaned_original)])
  for i in range(len(cleaned_sentences)):
    for j in range(len(cleaned_sentences)):
        if i != j:
          try:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,dim),sentence_vectors[j].reshape(1,dim))[0,0]
          except:
            pass
  nx_graph = nx.from_numpy_array(sim_mat)
  scores = nx.pagerank_numpy(nx_graph)
  try:
    ranked_sentences = sorted(((scores[i],i) for i,s in enumerate(cleaned_original)), reverse=True)
    arranged_sentences = sorted(ranked_sentences[0:int(len(cleaned_original)*0.5)], key=lambda x:x[1])
    summary = "".join([cleaned_original[x[1]] for x in arranged_sentences])
    return summary
  except:
    pass

In [12]:
getting_extract_summaries(1)

'The company does not have the right to harass me at work and I want this to stop.It is extremely distracting to be told 5 times a day that I have a call from this collection agency while at work.'

In [13]:
test = df_narr['narrs_cleaned_sent_tokenized'][0:15]

test[14]

"['would like credit bureau correct balance', 'correct balanc']"

In [14]:
tested = [getting_extract_summaries(i) for i in tqdm(range(len(test)))]

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [15]:
tested

['',
 'The company does not have the right to harass me at work and I want this to stop.It is extremely distracting to be told 5 times a day that I have a call from this collection agency while at work.',
 'Violations committed against me include but not limited to : ( 1 ) Violated 15 USC 1692c ( a ) ; Communication without prior consent, expressed permission.( 2 ) Violated 15 USC 1692d ; Harass and oppressive use of intercourse about an alleged debt.( 4 ) Violated 15 USC 1692e ( 9 ) ; Use/distribution of communication with authorization or approval.',
 'Previously, on XX/XX/XXXX, XX/XX/XXXX, and XX/XX/XXXX I requested that Experian send me a copy of the verifiable proof they have on file showing that the XXXX account they have listed on my credit report is actually mine.On XX/XX/XXXX and XX/XX/XXXX, instead of sending me a copy of the verifiable proof that I requested, Experian sent me a statement which reads, " The information you disputed has been verified as accurate. \'\'I have ag

In [16]:
all = [getting_extract_summaries(i) for i in tqdm(range(len(df)))]

HBox(children=(FloatProgress(value=0.0, max=657719.0), HTML(value='')))

  return dict(zip(G, map(float, largest / norm)))
  return dict(zip(G, map(float, largest / norm)))





In [17]:
backup = all.copy()

In [18]:
backup2 = all.copy()

In [20]:
all[0:10]

['',
 'The company does not have the right to harass me at work and I want this to stop.It is extremely distracting to be told 5 times a day that I have a call from this collection agency while at work.',
 'Violations committed against me include but not limited to : ( 1 ) Violated 15 USC 1692c ( a ) ; Communication without prior consent, expressed permission.( 2 ) Violated 15 USC 1692d ; Harass and oppressive use of intercourse about an alleged debt.( 4 ) Violated 15 USC 1692e ( 9 ) ; Use/distribution of communication with authorization or approval.',
 'Previously, on XX/XX/XXXX, XX/XX/XXXX, and XX/XX/XXXX I requested that Experian send me a copy of the verifiable proof they have on file showing that the XXXX account they have listed on my credit report is actually mine.On XX/XX/XXXX and XX/XX/XXXX, instead of sending me a copy of the verifiable proof that I requested, Experian sent me a statement which reads, " The information you disputed has been verified as accurate. \'\'I have ag

In [25]:
import pickle

with open('outfile.pkl', 'wb') as fp:
    pickle.dump(all, fp)

In [2]:
import pickle

with open ('extractive_summaries.pkl', 'rb') as fp:
    extractive_summaries = pickle.load(fp)

In [3]:
extractive_summaries[-1]

'The payment was made over the phone with a representative from CITI Simplicity and I was given a conformation number to complete the call.I never say the payment post to my account so I called CITI back and they said there was no record of a payment being made on the XXXX of XXXX.I then filed a dispute with CITI XXXX XXXX and also called the CITI rebate card company and they immediately confirmed that the payment had been made and received on XXXX XXXX by CITI Credit Card on that date.We then had them email us proof of this transaction and sent it to CITI XXXX to go forward with the dispute.I received an email showing a few days later that my card balance was at XXXX {$.00} as this payment was for the remaining balance on the card and thought that the dispute had been resolved.I am not sure what else to do as this payment was made, it was received and our Rebate Card with CITI is at a {$0.00} balance so the money was received.'