# Assignment 3 - Vectorizers
## Applying GloVe & Word2Vec Embeddings to a Text Classification Dataset

### Import the necessary libraries

In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from gensim.models import KeyedVectors

### Download the data
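Dataset: 
https://raw.githubusercontent.com/subashgandyer/datasets/main/deepnlp_classification_data.zip

Download and unzip the archive next to this notebook so that the files are available under `deepnlp_classification_data/`.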

### Load the train and test data

In [2]:
# Both splits are header-less, tab-separated files: <label>\t<document text>.
train, test = (
    pd.read_csv(split_path, header=None, sep='\t')
    for split_path in (
        'deepnlp_classification_data/r8-train-all-terms.txt',
        'deepnlp_classification_data/r8-test-all-terms.txt',
    )
)
# Name the two positional columns identically on both frames.
train.columns = test.columns = ['label', 'content']

# 1. Glove Vectorizer

### Create a GloveVectorizer Class
- `__init__`
- `fit`
- `transform`
- `fit_transform`

In [3]:
class GloveVectorizer:
  """Embed documents as the mean of their pre-trained GloVe word vectors.

  Attributes set by ``__init__``:
    word2vec:  dict mapping each word to its float32 vector.
    embedding: 2-D array holding one row per word, in file order.
    word2idx:  dict mapping each word to its row index in ``embedding``.
    V, D:      number of rows and number of columns of ``embedding``.
  """

  def __init__(self, path='glove.6B.50d.txt'):
    """Load word vectors from a GloVe text file.

    Args:
      path: file to read; every non-blank line is ``word v0 v1 v2 ...``
        separated by whitespace. Defaults to the file name this notebook
        has always used.

    Raises:
      ValueError: if the file yields no vectors or rows of differing length.
    """
    # load in pre-trained word vectors
    print('Loading word vectors from Glove...')
    word2vec = {}
    embedding = []
    idx2word = []
    # The vocabulary contains non-ASCII tokens, so read the file as UTF-8
    # explicitly instead of relying on the platform default encoding
    # (which is not UTF-8 on every system).
    with open(path, encoding='utf-8') as f:
      for line in f:
        values = line.split()
        if not values:
          # tolerate blank lines, e.g. a trailing newline at end of file
          continue
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)
    print('Found %s word vectors.' % len(word2vec))

    # save for later
    self.word2vec = word2vec
    self.embedding = np.array(embedding)
    if self.embedding.ndim != 2:
      # an empty or ragged file would otherwise fail below with a cryptic
      # tuple-unpacking error
      raise ValueError(
        'Expected a non-empty file of equal-length word vectors in %r' % (path,)
      )
    self.word2idx = {word: idx for idx, word in enumerate(idx2word)}
    self.V, self.D = self.embedding.shape

  def fit(self, data):
    """No-op: the vectors are pre-trained, so nothing is learned from ``data``."""
    pass

  def transform(self, data):
    """Return one averaged word vector per document.

    Each document is lower-cased and split on whitespace; words missing
    from the loaded vocabulary are ignored. A document with no known word
    keeps an all-zero row and is counted in the printed summary.

    Args:
      data: sized iterable of strings (supports ``len`` and iteration).

    Returns:
      Array of shape ``(len(data), self.D)``.
    """
    X = np.zeros((len(data), self.D))
    emptycount = 0
    for n, sentence in enumerate(data):
      vecs = [
        self.word2vec[word]
        for word in sentence.lower().split()
        if word in self.word2vec
      ]
      if vecs:
        X[n] = np.array(vecs).mean(axis=0)
      else:
        emptycount += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X

  def fit_transform(self, data):
    """Call ``fit`` and then return ``transform(data)``."""
    self.fit(data)
    return self.transform(data)

### Create a Glove Vectorizer object

In [4]:
# Constructing the vectorizer reads 'glove.6B.50d.txt' from the working
# directory and keeps every word vector in memory.
vectorizer = GloveVectorizer()

Loading word vectors from Glove...
Found 400000 word vectors.


### Apply vectorization on Training and Test data

In [5]:
# Turn every document into the mean of its word vectors; labels pass through unchanged.
Xtrain, Ytrain = vectorizer.fit_transform(train['content']), train['label']
Xtest, Ytest = vectorizer.transform(test['content']), test['label']

Numer of samples with no words found: 0 / 5485
Numer of samples with no words found: 0 / 2189


### Create the model and train it

In [6]:
# A random forest is stochastic (bootstrap sampling and per-split feature
# sampling); pin the seed so the scores are reproducible on a fresh run.
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(Xtrain, Ytrain)

RandomForestClassifier(n_estimators=200)

### Evaluate the model

In [7]:
# Score the fitted model on the data it was trained on, then on the test split.
train_score = model.score(Xtrain, Ytrain)
print("train score:", train_score)
test_score = model.score(Xtest, Ytest)
print("test score:", test_score)

train score: 0.9992707383773929
test score: 0.9346733668341709


# 2. Word2Vec Vectorizer

### Google News Vector Model
https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing

The code below expects the file `GoogleNews-vectors-negative300.bin` in the working directory.

### Create a Word2VecVectorizer Class
- `__init__`
- `fit`
- `transform`
- `fit_transform`

In [8]:
class Word2VecVectorizer:
  """Embed documents as the mean of their pre-trained Word2Vec word vectors.

  Attributes:
    word_vectors: the gensim ``KeyedVectors`` loaded in ``__init__``.
    D:            dimensionality of the word vectors.
  """

  def __init__(self, path='GoogleNews-vectors-negative300.bin', binary=True):
    """Load vectors stored in the word2vec format.

    Args:
      path:   model file to load; defaults to the file name this notebook
        has always used.
      binary: forwarded to ``KeyedVectors.load_word2vec_format``.
    """
    print("Loading word vectors from Word2Vec...")
    self.word_vectors = KeyedVectors.load_word2vec_format(path, binary=binary)
    self.D = self.word_vectors.vector_size
    print("Finished loading in word vectors")

  def fit(self, data):
    """No-op: the vectors are pre-trained, so nothing is learned from ``data``."""
    pass

  def transform(self, data):
    """Return one averaged word vector per document.

    Each document is split on whitespace without changing case; words the
    model does not know are ignored. A document with no known word keeps
    an all-zero row and is counted in the printed summary.

    Args:
      data: sized iterable of strings (supports ``len`` and iteration).

    Returns:
      Array of shape ``(len(data), self.D)``.
    """
    # Ask the model for its dimensionality instead of probing a specific
    # word, which would raise KeyError on any vocabulary lacking that word.
    self.D = self.word_vectors.vector_size

    X = np.zeros((len(data), self.D))
    emptycount = 0
    for n, sentence in enumerate(data):
      vecs = []
      for word in sentence.split():
        try:
          vecs.append(self.word_vectors.get_vector(word))
        except KeyError:
          # out-of-vocabulary word: leave it out of the average
          continue
      if vecs:
        X[n] = np.array(vecs).mean(axis=0)
      else:
        emptycount += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X


  def fit_transform(self, data):
    """Call ``fit`` and then return ``transform(data)``."""
    self.fit(data)
    return self.transform(data)

### Create a Word2Vec Vectorizer object

In [9]:
# Constructing the vectorizer loads 'GoogleNews-vectors-negative300.bin' through
# gensim's KeyedVectors. This rebinds `vectorizer`, replacing the GloVe instance.
vectorizer = Word2VecVectorizer()

Loading word vectors from Word2Vec...
Finished loading in word vectors


### Apply vectorization of training and test data

In [10]:
# Turn every document into the mean of its word vectors; labels pass through unchanged.
Xtrain, Ytrain = vectorizer.fit_transform(train['content']), train['label']
Xtest, Ytest = vectorizer.transform(test['content']), test['label']

Numer of samples with no words found: 0 / 5485
Numer of samples with no words found: 0 / 2189


### Create a model

In [11]:
# A random forest is stochastic (bootstrap sampling and per-split feature
# sampling); pin the seed so the scores are reproducible on a fresh run.
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(Xtrain, Ytrain)

RandomForestClassifier(n_estimators=200)

### Evaluate the model

In [12]:
# Score the fitted model on the data it was trained on, then on the test split.
train_score = model.score(Xtrain, Ytrain)
print("train score:", train_score)
test_score = model.score(Xtest, Ytest)
print("test score:", test_score)

train score: 0.9992707383773929
test score: 0.9406121516674281
