In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import pandas as pd

In [4]:
df =  pd.read_csv('/gdrive/MyDrive/NLP/complaints.csv')

In [5]:
df.head()

Unnamed: 0,Consumer complaint narrative,Product
0,I have outdated information on my credit repor...,Credit reporting
1,I purchased a new car on XXXX XXXX. The car de...,Consumer Loan
2,An account on my credit report has a mistaken ...,Credit reporting
3,This company refuses to provide me verificatio...,Debt collection
4,This complaint is in regards to Square Two Fin...,Debt collection


In [6]:
df['Consumer complaint narrative'][0]

'I have outdated information on my credit report that I have previously disputed that has yet to be removed this information is more then seven years old and does not meet credit reporting requirements'

In [10]:
print(df.Product.unique())

['Credit reporting' 'Consumer Loan' 'Debt collection' 'Mortgage'
 'Credit card' 'Other financial service' 'Bank account or service'
 'Student loan' 'Money transfers' 'Payday loan' 'Prepaid card'
 'Virtual currency'
 'Credit reporting, credit repair services, or other personal consumer reports'
 'Credit card or prepaid card' 'Checking or savings account'
 'Payday loan, title loan, or personal loan'
 'Money transfer, virtual currency, or money service'
 'Vehicle loan or lease']


In [11]:
X_train, X_test, y_train, y_test = train_test_split(df['Consumer complaint narrative'].values, df['Product'].values, test_size=0.15, random_state=101)

In [12]:
print('Training utterances: {}'.format(X_train.shape[0]))
print('Validation utterances: {}'.format(X_test.shape[0]))

Training utterances: 152809
Validation utterances: 26967


Calculating tf-idf scores
Calculating tf-idf scores for each unique token in the dataset and creating frequency chart for each utterance in the dataset.

In [13]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)

TfidfVectorizer()

In [14]:
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)
X_train, X_test

(<152809x76451 sparse matrix of type '<class 'numpy.float64'>'
 	with 13867648 stored elements in Compressed Sparse Row format>,
 <26967x76451 sparse matrix of type '<class 'numpy.float64'>'
 	with 2445087 stored elements in Compressed Sparse Row format>)

Naive Bayes

In [15]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
clf = MultinomialNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(accuracy_score(y_test, pred))

0.6850224348277525


In [18]:
from sklearn.feature_selection import SelectKBest, chi2

ch2 = SelectKBest(chi2, k=5000)

X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)

X_train, X_test

(<152809x5000 sparse matrix of type '<class 'numpy.float64'>'
 	with 10756825 stored elements in Compressed Sparse Row format>,
 <26967x5000 sparse matrix of type '<class 'numpy.float64'>'
 	with 1899873 stored elements in Compressed Sparse Row format>)

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

clf=MultinomialNB()
clf.fit(X_train,y_train)
pred=clf.predict(X_test)

print(accuracy_score(y_test,pred))

0.7674936032929136


Pre-processing on the dataset:

In [21]:
from nltk import word_tokenize

In [22]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Split the words

In [28]:
# We use word_tokenize to make sure words are split properly. Eg : don't , 3.14 etc
from tqdm import tqdm
data = df['Consumer complaint narrative'].values[0:1000]
tokenized = [word_tokenize(value) for value in data]
del data
tokenized[0]

['I',
 'have',
 'outdated',
 'information',
 'on',
 'my',
 'credit',
 'report',
 'that',
 'I',
 'have',
 'previously',
 'disputed',
 'that',
 'has',
 'yet',
 'to',
 'be',
 'removed',
 'this',
 'information',
 'is',
 'more',
 'then',
 'seven',
 'years',
 'old',
 'and',
 'does',
 'not',
 'meet',
 'credit',
 'reporting',
 'requirements']

In [27]:
#squares = []
# squares = [x**2 for x in range(10)]
    
# print(squares)
# [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


In [29]:
lower = [[word.lower() for word in tokens]for tokens in tokenized]
lower[0]

['i',
 'have',
 'outdated',
 'information',
 'on',
 'my',
 'credit',
 'report',
 'that',
 'i',
 'have',
 'previously',
 'disputed',
 'that',
 'has',
 'yet',
 'to',
 'be',
 'removed',
 'this',
 'information',
 'is',
 'more',
 'then',
 'seven',
 'years',
 'old',
 'and',
 'does',
 'not',
 'meet',
 'credit',
 'reporting',
 'requirements']

In [30]:
lower=[' '.join(text) for text in lower]
lower[0]


'i have outdated information on my credit report that i have previously disputed that has yet to be removed this information is more then seven years old and does not meet credit reporting requirements'

In [31]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [32]:
# Remove punctuations
import string
punctuations_removed=[text.translate(str.maketrans('','',string.punctuation)) for text in lower]
punctuations_removed[0]

'i have outdated information on my credit report that i have previously disputed that has yet to be removed this information is more then seven years old and does not meet credit reporting requirements'

In [33]:
'i have outdated information on my credit report that i have previously disputed that has yet to be removed this information is more then seven years old and does not meet credit reporting requirements'
# removing stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords=stopwords.words('english')
cleaned=[[word for word in sentence.split(' ') if word not in stopwords] for sentence in punctuations_removed]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [34]:
cleaned[0]

['outdated',
 'information',
 'credit',
 'report',
 'previously',
 'disputed',
 'yet',
 'removed',
 'information',
 'seven',
 'years',
 'old',
 'meet',
 'credit',
 'reporting',
 'requirements']

In [35]:
# Stemming
from nltk.stem import PorterStemmer
stemmer=PorterStemmer()
stemmer.stem('basically')

'basic'

In [36]:
stemmed=[[stemmer.stem(word) for word in sentence] for sentence in cleaned]

In [37]:
stemmed[0]

['outdat',
 'inform',
 'credit',
 'report',
 'previous',
 'disput',
 'yet',
 'remov',
 'inform',
 'seven',
 'year',
 'old',
 'meet',
 'credit',
 'report',
 'requir']

In [38]:
l=['a','b','c','a']
list(set(l))

['b', 'c', 'a']

In [39]:
# Create a BOW
vocab=[]
for sentence in stemmed:
  temp=list(set(sentence))
  vocab+=temp
  vocab=list(set(vocab))

In [40]:
len(vocab)

5122

In [41]:
# Create Index Map
map={word:idx for idx,word in enumerate(vocab)}

In [42]:
freq={word:0 for word in vocab}

In [43]:
map['placement']

2653

In [44]:
# BOW feature

for sentence in stemmed:
  for word in sentence:
    try:
      freq[word]+=1
    except:
      pass

Feature Selection
Chi-square test measures dependence between stochastic variables, so using this function “weeds out” the features that are the most likely to be independent of class and therefore irrelevant for classification.

In [45]:
from sklearn.feature_selection import SelectKBest, chi2

ch2 = SelectKBest(chi2, k=5000)
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)

X_train, X_test

(<152809x5000 sparse matrix of type '<class 'numpy.float64'>'
 	with 10756825 stored elements in Compressed Sparse Row format>,
 <26967x5000 sparse matrix of type '<class 'numpy.float64'>'
 	with 1899873 stored elements in Compressed Sparse Row format>)