# Word Embeddings for Text Classification

In [15]:
import pandas as pd
import os
import numpy as np
from nltk.corpus import stopwords
#Read Reviews

labels = {'pos': 1, 'neg': 0}
train_data = pd.DataFrame()
train_folder = 'data/aclImdb/train/'
for l in ('pos', 'neg'):
    path = os.path.join(train_folder, l)
    for file in os.listdir (path) :
        with open(os.path.join(path, file),'r', encoding='utf-8') as infile:
            txt = infile.read()
        train_data = train_data.append([[txt, labels[l]]],ignore_index=True)
train_data.columns = ['review', 'sentiment']
train_data.to_csv('data/train_data.csv', sep=',', encoding='utf-8', index=False)

In [16]:
# Get test data
test_folder = 'data/aclImdb/test/'
test_data = pd.DataFrame()  
for l in ('pos', 'neg'):
    path = os.path.join(test_folder, l)
    for file in os.listdir (path) :
        with open(os.path.join(path, file),'r', encoding='utf-8') as infile:
            txt = infile.read()
        test_data = test_data.append([[txt, labels[l]]],ignore_index=True)
test_data.columns =  ['review', 'sentiment']
test_data.to_csv('data/test_data.csv', sep=',', encoding='utf-8', index=False)

In [17]:
train_data = pd.read_csv('data/train_data.csv', sep=',', encoding='utf-8', header=0)
test_data = pd.read_csv('data/test_data.csv', sep=',', encoding='utf-8', header=0)

print(train_data['review'].head())
print(test_data['review'].head())

0    For a movie that gets no respect there sure ar...
1    Bizarre horror movie filled with famous faces ...
2    A solid, if unremarkable film. Matthau, as Ein...
3    It's a strange feeling to sit alone in a theat...
4    You probably all already know this by now, but...
Name: review, dtype: object
0    Based on an actual story, John Boorman shows t...
1    This is a gem. As a Film Four production - the...
2    I really like this show. It has drama, romance...
3    This is the best 3-D experience Disney has at ...
4    Of the Korean movies I've seen, only three had...
Name: review, dtype: object


In [18]:
# take smaller sample for faster runtime
train_data = train_data.sample(n=5000,replace=True)
test_data = test_data.sample(n=1000,replace=True)

## Pre Trained word embeddings

In [19]:
import gensim
w2vec_model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [20]:
import nltk
train_vectors = pd.DataFrame() 
stopwords = nltk.corpus.stopwords.words('english') 
for doc in train_data['review'].str.lower().str.replace('[^a-z ]', ''): 
    temp = pd.DataFrame()  
    for word in doc.split(' '):
        if word not in stopwords: 
            try:
                word_vec = w2vec_model[word] 
                temp = temp.append(pd.Series(word_vec), ignore_index = True) 
            except:
                pass
    train_vector = temp.mean()
    train_vectors = train_vectors.append(train_vector, ignore_index = True) 
train_vectors.shape

(5000, 300)

In [21]:
test_vectors = pd.DataFrame()

for doc in test_data['review'].str.lower().str.replace('[^a-z ]', ''): 
    temp = pd.DataFrame()  
    for word in doc.split(' '): 
        if word not in stopwords: 
            try:
                word_vec = w2vec_model[word]
                temp = temp.append(pd.Series(word_vec), ignore_index = True)
            except:
                pass
    test_vector = temp.mean()
    test_vectors = test_vectors.append(test_vector, ignore_index = True)
test_vectors.shape

(1000, 300)

In [22]:
print(train_data['sentiment'].shape)
print(test_data['sentiment'].shape)

(5000,)
(1000,)


In [23]:
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(n_estimators=500, random_state = 1)
model.fit(train_vectors, train_data['sentiment'])
test_pred = model.predict(test_vectors)
from sklearn.metrics import accuracy_score
accuracy_score(test_data['sentiment'], test_pred)

0.788