In [0]:
# Make the folder below the working directory; later cells open their data and
# embedding files by bare relative name, so they resolve against this folder.
# NOTE(review): looks like a Google Drive folder mounted in Colab -- confirm and
# adjust when running elsewhere.
import os; os.chdir('/content/drive/My Drive/Colab Notebooks/Lazy courses/NLP2/pretrained_data')

In [0]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from gensim.models import KeyedVectors 

In [0]:
# Dataset source:
# https://www.cs.umb.edu/~smimarog/textmining/datasets/
#
# Both splits are read the same way: tab-separated, no header row.
train, test = (
    pd.read_csv(path, header=None, sep='\t')
    for path in ('r8-train-all-terms.txt', 'r8-test-all-terms.txt')
)

In [0]:
# The files carry no header row, so give both frames the same column names.
train.columns = test.columns = ['label', 'content']

In [5]:
# Preview the first rows of the training frame (label + text content).
train.head()

Unnamed: 0,label,content
0,earn,champion products ch approves stock split cham...
1,acq,computer terminal systems cpml completes sale ...
2,earn,cobanco inc cbco year net shr cts vs dlrs net ...
3,earn,am international inc am nd qtr jan oper shr lo...
4,earn,brown forman inc bfd th qtr net shr one dlr vs...


In [6]:
# Preview the first rows of the test frame (label + text content).
test.head()

Unnamed: 0,label,content
0,trade,asian exporters fear damage from u s japan rif...
1,grain,china daily says vermin eat pct grain stocks a...
2,ship,australian foreign ship ban ends but nsw ports...
3,acq,sumitomo bank aims at quick recovery from merg...
4,earn,amatil proposes two for five bonus share issue...


In [0]:
class GloveVectorizer():
    """Turn sentences into fixed-size features by averaging GloVe word vectors.

    Attributes:
        word2vec:  dict mapping each word to its float32 vector.
        embedding: array of shape (V, D) holding all vectors in file order.
        word2idx:  dict mapping each word to its row index in ``embedding``.
        V, D:      number of vectors read and their dimensionality.
    """

    def __init__(self, path='glove.6B.50d.txt'):
        """Load the pretrained vectors from a GloVe-format text file.

        Args:
            path: file with one entry per line, space separated, in the form
                ``word vec[0] vec[1] vec[2] ...``. Defaults to the file name
                this notebook has always used.
        """
        #load in pretrained word vectors
        print("Loading GloVe pre-trained word vectors")
        word2vec = {}
        embedding = []
        idx2word = []
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default text encoding, which differs between systems.
        with open(path, encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if not values:
                    # Tolerate blank lines (e.g. a trailing newline) instead
                    # of failing on values[0].
                    continue
                word = values[0]
                vec = np.asarray(values[1:], dtype='float32')
                word2vec[word] = vec
                embedding.append(vec)
                idx2word.append(word)
        print(f'Found {len(word2vec)} word vectors')

        #Save for later
        self.word2vec = word2vec
        self.embedding = np.array(embedding)
        self.word2idx = {word: idx for idx, word in enumerate(idx2word)}
        self.V, self.D = self.embedding.shape

    def fit(self, data):
        """No-op: the vectors are pretrained, so there is nothing to learn."""
        pass

    def transform(self, data):
        """Return one mean word vector per sentence.

        Each sentence is lower-cased and split on whitespace; words missing
        from the vocabulary are ignored. A sentence with no known word keeps
        an all-zero row and is counted in the printed summary.

        Args:
            data: sized iterable of strings.

        Returns:
            Array of shape ``(len(data), self.D)``.
        """
        X = np.zeros((len(data), self.D))
        emptycount = 0
        for n, sentence in enumerate(data):
            vecs = [
                self.word2vec[word]
                for word in sentence.lower().split()
                if word in self.word2vec
            ]
            if vecs:
                X[n] = np.array(vecs).mean(axis=0)
            else:
                emptycount += 1
        print(f"Number of samples with no words found: {emptycount} / {len(data)}")
        return X

    def fit_transform(self, data):
        """Call ``fit`` then ``transform`` on the same data."""
        self.fit(data)
        return self.transform(data)


class Word2VecVectorizer:
    def __init__(self):
        print("Loading in word vectors..")
        self.word_vectors = KeyedVectors.load_word2vec_format(
            'GoogleNews-vectors-negative300.bin', binary=True
        )
        print("Word vectors loaded...")

    def fit(self, data):
        pass

    def transform(self, data):
        #determine the dimensionality of the vectors
        v = self.word_vectors.get_vector('king')
        self.D = v.shape[0]

        X = np.zeros((len(data), self.D))
        n = 0
        emptycount = 0
        for sentence in data:
            tokens = sentence.split()
            vecs = []
            m = 0
            for word in tokens:
                try:
                    #throw key error if word is not found
                    vec = self.word_vectors.get_vector(word)
                    vecs.append(vec)
                    m += 1
                except KeyError:
                    pass
            if len(vecs) > 0:
                vecs = np.array(vecs)
                X[n] = vecs.mean(axis=0)
            else:
                emptycount += 1
            n += 1

            print(f"Number of samples with no words: {emptycount} / {len(data)} ")
            return X

    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data) 

In [8]:
#GloVe
# Labels are taken straight from the frames; features are the averaged GloVe
# vectors of each document's text.
Ytrain1, Ytest1 = train.label, test.label

vectorizer = GloveVectorizer()
Xtrain1 = vectorizer.fit_transform(train['content'])
Xtest1 = vectorizer.transform(test['content'])

Loading GloVe pre-trained word vectors
Found 400000 word vectors
Number of samples with no words found: 0 / 5485
Number of samples with no words found: 0 / 2189


In [9]:
#create the model, train it, print scores
# A random forest is stochastic; pin the seed so the scores printed by this
# cell are the same on every run.
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(Xtrain1, Ytrain1)
print("train score:", model.score(Xtrain1, Ytrain1))
print("test score:", model.score(Xtest1, Ytest1))

train score: 0.9992707383773929
test score: 0.9323892188213796


In [12]:
#Word2vec

# Re-read both splits and reset their column names before building features.
train = pd.read_csv('r8-train-all-terms.txt', header = None, sep = '\t')
test = pd.read_csv('r8-test-all-terms.txt', header = None, sep = '\t')
train.columns = ['label', 'content']
test.columns = ['label', 'content']


vectorizer = Word2VecVectorizer()
Xtrain2 = vectorizer.fit_transform(train.content)
Ytrain2 = train.label

Xtest2 = vectorizer.transform(test.content)
Ytest2 = test.label

#create the model, train it, print scores
# A random forest is stochastic; pin the seed so the scores printed by this
# cell are the same on every run.
# NOTE(review): 2000 trees here vs 200 for the GloVe model -- confirm the
# difference is intended, since it makes the two scores harder to compare.
model = RandomForestClassifier(n_estimators=2000, random_state=42)
model.fit(Xtrain2, Ytrain2)
print("train score:", model.score(Xtrain2, Ytrain2))
print("test score:", model.score(Xtest2, Ytest2))

Loading in word vectors..


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Word vectors loaded...
Number of samples with no words: 0 / 5485 
Number of samples with no words: 0 / 2189 
train score: 0.5177757520510483
test score: 0.49474645957058017
