# Word Embeddings for Text Classification

In [1]:
import pandas as pd
import os
import numpy as np
import nltk
from nltk.corpus import stopwords
import gensim
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

In [2]:
def read_data(file_path, output_path, random_state=None):
    """Collect labelled reviews from ``pos``/``neg`` folders into one shuffled CSV.

    Every entry listed in ``<file_path>/pos`` and ``<file_path>/neg`` is read
    as UTF-8 text and becomes one row with the columns ``review`` (the file
    contents) and ``sentiment`` (1 for ``pos``, 0 for ``neg``). The rows are
    shuffled and written to ``output_path`` without the index column.

    Parameters
    ----------
    file_path : str
        Directory containing the ``pos`` and ``neg`` sub-directories.
    output_path : str
        Destination of the comma-separated, UTF-8 encoded CSV file.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` for the shuffle. ``None``
        (the default) gives a different order on every call.

    Returns
    -------
    None
        The result is written to ``output_path``.
    """
    labels = {'pos': 1, 'neg': 0}

    # Gather plain (text, label) tuples and build the frame once at the end:
    # DataFrame.append no longer exists in pandas 2.x and copied the whole
    # frame on every call.
    records = []
    for label_name, label_value in labels.items():
        folder = os.path.join(file_path, label_name)
        # os.listdir order is arbitrary; sorting makes a seeded shuffle
        # reproducible.
        for file_name in sorted(os.listdir(folder)):
            with open(os.path.join(folder, file_name), 'r', encoding='utf-8') as infile:
                records.append((infile.read(), label_value))

    data = pd.DataFrame(records, columns=['review', 'sentiment'])
    # frac=1 returns every row exactly once, in random order.
    data = data.sample(frac=1, random_state=random_state)
    data.to_csv(output_path, sep=',', encoding='utf-8', index=False)

In [3]:
# Build the shuffled training CSV from the raw review folders.
read_data(
    file_path='data/aclImdb/train/',
    output_path='data/train_data.csv',
)

In [4]:
# Build the shuffled test CSV from the raw review folders.
read_data(
    file_path='data/aclImdb/test/',
    output_path='data/test_data.csv',
)

In [5]:
# Reload both splits from the CSV files on disk (the first row is the header).
train_data, test_data = (
    pd.read_csv(csv_path, sep=',', encoding='utf-8', header=0)
    for csv_path in ('data/train_data.csv', 'data/test_data.csv')
)

# Peek at the first few reviews of each split.
print(train_data['review'].head())
print(test_data['review'].head())

0    <br /><br />The movie starts out as an ordinar...
1    This is the kind of movie that wants to be goo...
2    This movie is a perfect example of a film that...
3    Trying to compare or represent this "swill" as...
4    On the basis of the preview I'd seen, I went t...
Name: review, dtype: object
0    What can be said of this independent effort be...
1    Alright, how someone can actually think this m...
2    An extremely dark and brooding show with an ex...
3    It's the 1980's and the teenagers are ready to...
4    Jack Frost 2. THE worst "horror film" I have e...
Name: review, dtype: object


In [6]:
# Take a smaller random subset of each split for faster runtime.
# - Sampling WITHOUT replacement: the goal is a subset, and replace=True would
#   draw the same review several times.
# - A fixed seed makes the subset, and every number computed from it,
#   reproducible across runs.
# - min(...) keeps the call valid if a split has fewer rows than requested.
SAMPLE_SEED = 42
train_data = train_data.sample(n=min(5000, len(train_data)), random_state=SAMPLE_SEED)
test_data = test_data.sample(n=min(1000, len(test_data)), random_state=SAMPLE_SEED)

## Pre-trained word embeddings

In [7]:
# Load the pre-trained vectors stored in binary word2vec format.
w2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [8]:
# English stop-word list used by preprocessing().
# NOTE: this rebinds the name `stopwords`, shadowing the corpus module imported
# at the top of the notebook; the fully qualified path keeps the cell re-runnable.
stopwords = nltk.corpus.stopwords.words('english')


def preprocessing(dataframe):
    """Embed each review as the mean word vector of its known, non-stop words.

    The ``review`` column is lower-cased and every character other than
    ``a``-``z`` and the space is removed. The remaining text is split on
    spaces; tokens found in ``stopwords`` or missing from ``w2vec_model`` are
    skipped, and the vectors of the rest are averaged.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a text column named ``review``.

    Returns
    -------
    pandas.DataFrame
        One row per review, in input order, with a fresh 0..n-1 index and
        ``w2vec_model.vector_size`` columns. A review with no usable token
        gives a row of NaN, so callers can decide how to handle it.
    """
    stop_set = set(stopwords)  # constant-time membership instead of a list scan
    dim = w2vec_model.vector_size

    cleaned = (
        dataframe['review']
        .fillna('')  # missing reviews are treated as empty text
        .str.lower()
        # regex=True is required: the pandas 2.x default treats the pattern as
        # a literal string and would strip nothing.
        .str.replace('[^a-z ]', '', regex=True)
    )

    # Collect plain arrays and build the frame once; DataFrame.append no longer
    # exists in pandas 2.x and copied the frame on every call.
    rows = []
    for doc in cleaned:
        word_vecs = []
        for word in doc.split(' '):
            if word in stop_set:
                continue
            try:
                word_vecs.append(w2vec_model[word])
            except KeyError:
                # Out-of-vocabulary token (including '' from repeated spaces).
                continue
        if word_vecs:
            rows.append(np.mean(word_vecs, axis=0))
        else:
            rows.append(np.full(dim, np.nan))

    matrix = np.vstack(rows) if rows else np.empty((0, dim))
    return pd.DataFrame(matrix, columns=range(dim))

In [9]:
# Embed every training review, then show the shape of the feature matrix.
train_vectors = preprocessing(dataframe=train_data)
train_vectors.shape

(5000, 300)

In [10]:
# Embed every test review, then show the shape of the feature matrix.
test_vectors = preprocessing(dataframe=test_data)
test_vectors.shape

(1000, 300)

In [11]:
# Sanity check: shape of the label column in each split, one per line.
print(train_data['sentiment'].shape, test_data['sentiment'].shape, sep='\n')

(5000,)
(1000,)


In [12]:
# Fit a 500-estimator AdaBoost ensemble on the training vectors (fit returns
# the estimator itself), then score its predictions on the test vectors.
model = AdaBoostClassifier(n_estimators=500).fit(
    train_vectors,
    train_data['sentiment'],
)
test_pred = model.predict(test_vectors)
accuracy_score(y_true=test_data['sentiment'], y_pred=test_pred)

0.811