# Analysing performance of word2vec, tf-idf and GloVe on text classification

In [0]:
#Get data from US CFB website for consumer complaints
!wget https://data.consumerfinance.gov/api/views/s6ew-h6mp/rows.csv?accessType=DOWNLOAD
# rename file
!mv /content/rows.csv?accessType=DOWNLOAD /content/rows.csv

#downlowad word2vec pretrained word embeddings from Spacy
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.0/en_core_web_md-2.2.0.tar.gz

In [0]:
import pandas as pd
import numpy as np
from numpy import random
import gensim
from gensim.models.doc2vec import TaggedDocument
import spacy
import nltk
import os
import pickle

pd.set_option('max_colwidth',1000)

In [0]:
#Read from file and remove NA/Null values from dataframe
dir_path = '/content/'

df = pd.read_csv(os.path.join(dir_path, 'rows.csv'))
df = df[pd.notnull(df['Consumer complaint narrative'])]

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 481948 entries, 5 to 1474449
Data columns (total 18 columns):
Date received                   481948 non-null object
Product                         481948 non-null object
Sub-product                     429776 non-null object
Issue                           481948 non-null object
Sub-issue                       355148 non-null object
Consumer complaint narrative    481948 non-null object
Company public response         234297 non-null object
Company                         481948 non-null object
State                           480095 non-null object
ZIP code                        373611 non-null object
Tags                            82657 non-null object
Consumer consent provided?      481948 non-null object
Submitted via                   481948 non-null object
Date sent to company            481948 non-null object
Company response to consumer    481947 non-null object
Timely response?                481948 non-null object
Consumer 

In [0]:
#The analysis is on text classification, so the columns that are important are 
#Product(labels) and Consumer Complaint Narrative (complaints from consumer)
df = df[['Product','Consumer complaint narrative']]

In [0]:
#Working on a subset of the problem for faster processing
df = df[:10000]

In [0]:
#Load the pretrained model for English langauge from Spacy and stopwords
import en_core_web_md

nlp = en_core_web_md.load()
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [0]:
#compaints are docs/text and categories are tags
complaints = df['Consumer complaint narrative']
categories = df['Product']

In [0]:
#Preprocessing for lowercase, tokenization and removing accents from characters
#complaint_ids = np.arange(len(complaints))
complaints_tokens = [gensim.utils.simple_preprocess(comp, deacc=True) for comp in complaints]

In [0]:
#Converting back the tokenized complaints into sentence because the pretrained English word model 
#takes input as string (sentences) and not in tokenzied form
simple_complaints = []
for tokens in complaints_tokens:
			simple = " ".join(tokens)  # concatenate back to a sentence
			simple_complaints.append(simple)

In [0]:
allowed_postags=['ADV', 'VERB', 'ADJ', 'NOUN', 'PROPN', 'NUM']

In [0]:
#Lemmataizing text using English word model and removing POS not allowed and stopwords
complaints_words = [] #lemmatized
count=0
for each_complaint in simple_complaints:
  each_complaint_nlp = nlp(each_complaint)
  tokens = [token.lemma_ for token in each_complaint_nlp if (token.pos_ in allowed_postags) and (token.text not in stop_words)]
  complaints_words.append(tokens)

In [0]:
#Saving the lemmatized form as pickle because it takes a lot of time for processing
import pickle

# Save all_docs as pickle.
with open(os.path.join(dir_path, 'complaints_words_2.pickle'), 'wb') as f:
    pickle.dump(complaints_words, f, pickle.HIGHEST_PROTOCOL)

In [0]:
# Read pickle.
with open(os.path.join(dir_path, 'complaints_words_2.pickle'), 'rb') as f:
    complaints_words = pickle.load(f)

In [0]:
#used for doc2word so not important right now
#complaint_ids = np.arange(len(complaints))
#all_complaints = [TaggedDocument(words=words, tags=[tag]) for words, tag in zip(complaints_words, complaint_ids)]

In [27]:
#word2vec
import multiprocessing
import sys
from gensim.models.word2vec import Word2Vec

workers = multiprocessing.cpu_count()
print('number of cpu: {}'.format(workers))
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise."

number of cpu: 2


In [0]:
#Creating word2vec model with 100 dimensions
word_model = Word2Vec(complaints_words,
                      min_count=2,
                      size=100,
                      window=5,
                      workers=workers,
                      iter=100)


In [0]:
#word averaging for each sentence for weighted word2vec
# Returns the weighted mean of the sentence from word vectors in the sentence, else zero
def word_average(sentence):
  mean_doc = []

  for word in sentence:
    if word in word_model.wv.vocab:
      mean_doc.append(word_model.wv.get_vector(word))
      pass
    pass
    
  if not mean_doc:
    return np.zeros(word_model.wv.vector_size)
  else:
    mean_doc = np.array(mean_doc).mean(axis=0)
    pass

  return mean_doc

In [0]:
#Averaging word2vec embedding for each document/complaint
weighted_word2vec_complaint_vector = np.vstack([word_average(sent) for sent in complaints_words])

In [0]:
#TF_IDF

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import namedtuple, defaultdict

In [0]:
#Calculate idf weights for each word
word_idf_weight = []
#Converting the lemmatized words into sentences for TFidf vectorizer input
complaints_setences = [" ".join(doc) for doc in complaints_words]

tfidf = TfidfVectorizer()
tfidf.fit(complaints_setences) #accepts sentence strings as an array

#For words with idf as zero, maximum IDF value is assigned for equal consideration (no bias) as most known word
max_idf = max(tfidf.idf_)
word_idf_weight = defaultdict(lambda: max_idf,
										   [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()])

In [0]:
#utility function to combine the weighted word2vec (term frequency) and idf weight for each sentence
def word_average_with_idf(sentence):
  mean_doc = []

  for word in sentence:
    if word in word_model.wv.vocab:
      mean_doc.append(word_model.wv.get_vector(word) * word_idf_weight[word])
      pass
    pass
    
  if not mean_doc:
    return np.zeros(word_model.wv.vector_size)
  else:
    mean_doc = np.array(mean_doc).mean(axis=0)
    pass

  return mean_doc

In [0]:
#Averaging tfidf weighted embedding for each document/complaint
weighted_tfidf_complaint_vector = np.vstack([word_average_with_idf(sent) for sent in complaints_words])

In [0]:
#GloVe

from gensim.test.utils import get_tmpfile, datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim.downloader as api

In [151]:
#Download glove pretrained embedding from the GloVe website
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip
!unzip '/content/glove.twitter.27B.zip'
!mv /content/glove.twitter.27B/glove.6B.100d.txt ../glove.6B.100d.txt

Archive:  /content/glove.twitter.27B.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of /content/glove.twitter.27B.zip or
        /content/glove.twitter.27B.zip.zip, and cannot find /content/glove.twitter.27B.zip.ZIP, period.


In [0]:
# Load in GloVe vector
glove_vec_fi = datapath('/content/glove.6B.100d.txt')
tmp_word2vec_fi = get_tmpfile('tmp_glove2word2vec.txt')

#Convert glove representation into word2vec format and store it in tmp_word2vec_fi
glove2word2vec(glove_vec_fi, tmp_word2vec_fi)

#Read the stored tmp_word2vec_fi file to load glove in word2vec format
glove_word_model = KeyedVectors.load_word2vec_format(tmp_word2vec_fi)

In [0]:
#word averaging for each sentence based upon GloVe embeddings:

def word_average_glove(sentence):
  mean_doc = []

  for word in sentence:
    if word in glove_word_model.wv.vocab:
      mean_doc.append(glove_word_model.wv.get_vector(word))
      pass
    pass
    
  if not mean_doc:
    return np.zeros(glove_word_model.wv.vector_size)
  else:
    mean_doc = np.array(mean_doc).mean(axis=0)
    pass

  return mean_doc

In [0]:
#Averaging word2vec embeddings for each document/complaint
weighted_glove_complaint_vector = np.vstack([word_average_glove(sent) for sent in complaints_words])

In [114]:
#weighted_idf_word2vec_complaint_vector.shape
#weighted_word2vec_complaint_vector.shape
#weighted_glove_complaint_vector.shape

(10000, 100)

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# Classification via Logistic Model
logistic = LogisticRegression(random_state=1, multi_class='multinomial', solver='saga')

# Stochastic Gradient Descent classifier
sgd = SGDClassifier(loss='hinge',
                    verbose=1,
                    random_state=1,
                    learning_rate='invscaling',
                    eta0=1)

In [0]:
df = weighted_word2vec_complaint_vector
#df = weighted_idf_word2vec_complaint_vector
#df = weighted_glove_complaint_vector

In [142]:
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix

model = logisitic

train_size = math.floor(len(df) * 0.8)
test_size = len(df) - train_size

train_X, test_X, train_y, test_y = train_test_split(df,
                                                    categories,
                                                     test_size=test_size,
                                                    random_state=1,
                                                    stratify=categories)

print('Shape of train_X: {}'.format(train_X.shape))
print('Shape of text_X: {}'.format(test_X.shape))

model.fit(train_X, train_y)

pred = model.predict(train_X)
true = np.array(train_y)

print('word2vec Score on Training dataset...\n')
print('Confusion Matrix:\n', confusion_matrix(true, pred))
print('\nClassification Report:\n', classification_report(true, pred, target_names=None))
print('\naccuracy: {:.3f}'.format(accuracy_score(true, pred)))
print('f1 score: {:.3f}'.format(f1_score(true, pred, average='weighted')))

pred_test = model.predict(test_X)
true_test = np.array(test_y)

print('word2vec Score on testing dataset...\n')
print('\naccuracy: {:.3f}'.format(accuracy_score(true_test, pred_test)))
print('f1 score: {:.3f}'.format(f1_score(true_test, pred_test, average='weighted')))

Shape of train_X: (8000, 100)
Shape of text_X: (2000, 100)
-- Epoch 1
Norm: 4.62, NNZs: 100, Bias: -2.972238, T: 8000, Avg. loss: 0.204910
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 3.89, NNZs: 100, Bias: -2.795986, T: 16000, Avg. loss: 0.119723
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 3.53, NNZs: 100, Bias: -2.676746, T: 24000, Avg. loss: 0.102552
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 3.28, NNZs: 100, Bias: -2.560855, T: 32000, Avg. loss: 0.097128
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 3.12, NNZs: 100, Bias: -2.400735, T: 40000, Avg. loss: 0.093413
Total training time: 0.02 seconds.
-- Epoch 6
Norm: 2.98, NNZs: 100, Bias: -2.303200, T: 48000, Avg. loss: 0.090969
Total training time: 0.02 seconds.
-- Epoch 7
Norm: 2.84, NNZs: 100, Bias: -2.246695, T: 56000, Avg. loss: 0.088850
Total training time: 0.02 seconds.
-- Epoch 8
Norm: 2.76, NNZs: 100, Bias: -2.136931, T: 64000, Avg. loss: 0.087167
Total training time: 0.02 seconds.
-- Epoch 9
Nor

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Norm: 0.80, NNZs: 100, Bias: -0.970464, T: 104000, Avg. loss: 0.102639
Total training time: 0.05 seconds.
-- Epoch 14
Norm: 0.75, NNZs: 100, Bias: -0.951973, T: 112000, Avg. loss: 0.101967
Total training time: 0.05 seconds.
-- Epoch 15
Norm: 0.72, NNZs: 100, Bias: -0.928730, T: 120000, Avg. loss: 0.101540
Total training time: 0.06 seconds.
Convergence after 15 epochs took 0.06 seconds
-- Epoch 1
Norm: 2.53, NNZs: 100, Bias: -1.550156, T: 8000, Avg. loss: 0.185460
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 2.27, NNZs: 100, Bias: -1.530823, T: 16000, Avg. loss: 0.146517
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 2.13, NNZs: 100, Bias: -1.479567, T: 24000, Avg. loss: 0.139781
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 2.03, NNZs: 100, Bias: -1.467901, T: 32000, Avg. loss: 0.137907
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 1.97, NNZs: 100, Bias: -1.438451, T: 40000, Avg. loss: 0.134821
Total training time: 0.02 seconds.
-- Epoch 6
Norm: 1.92, NNZs: 100,

[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:    0.7s finished
