In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import numpy as np
from gensim.models.word2vec import Word2Vec

In [5]:
def load_data(fname):
    """Read one of the known review TSV files into a DataFrame.

    Parameters
    ----------
    fname : str
        Dataset key: 'labeledTrainData', 'unlabeledTrainData' or 'testData'.

    Returns
    -------
    pandas.DataFrame
        The parsed file. The number of rows is also printed.

    Raises
    ------
    ValueError
        If ``fname`` is not one of the known dataset keys.
    """
    tsv_by_name = {
        'labeledTrainData': 'labeledTrainData.tsv',
        'unlabeledTrainData': 'unlabeledTrainData.tsv',
        'testData': 'testData.tsv',
    }
    tsv_path = tsv_by_name.get(fname)
    if tsv_path is None:
        raise ValueError(fname)

    # Tab-separated input; a backslash escapes the character that follows it.
    data_df = pd.read_csv(tsv_path, delimiter='\t', escapechar='\\')
    report = 'number of {}\'s reviews: {}\n'
    print(report.format(fname, len(data_df)))
    return data_df

In [3]:
# English stopword set, built on first use and then shared by every call.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def review2words(review, remove_stopwords=False):
    """Turn one raw review into a list of lowercase alphabetic tokens.

    Parameters
    ----------
    review : str
        Raw review text; HTML markup is stripped with BeautifulSoup.
    remove_stopwords : bool, optional
        When True, tokens found in NLTK's English stopword list are dropped.

    Returns
    -------
    list of str
        Lowercased tokens in their original order.
    """
    review_text = BeautifulSoup(review, 'html.parser').get_text()
    # Every non-letter becomes a space, so digits and punctuation act as separators.
    words = re.sub('[^a-zA-Z]', ' ', review_text).lower().split()
    if remove_stopwords:
        # The set is cached: this function runs once per review, and re-reading
        # the word list and rebuilding the set each time is wasted work.
        english_stopwords = _english_stopwords()
        words = [w for w in words if w not in english_stopwords]
    return words

In [4]:
# Load a previously trained Word2Vec model from disk. The file name suggests
# 300 features, context window 10, min word count 40 -- TODO confirm against
# the notebook/script that trained it.
model = Word2Vec.load('300features-10context-40minwords')

In [6]:
# Labeled training reviews; the preview below shows columns id, sentiment, review.
train_df = load_data('labeledTrainData')
train_df.head()

number of labeledTrainData's reviews: 25000



Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [7]:
def review2vec(review):
    """Embed a review as the mean Word2Vec vector of its in-vocabulary words.

    The review is tokenised with ``review2words`` (stopwords removed), each
    token known to the global ``model`` is looked up, and the vectors are
    averaged component-wise.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    pandas.Series
        One value per embedding dimension. If no token of the review is in
        the model vocabulary, an all-zero vector of that length is returned.
    """
    words = review2words(review, remove_stopwords=True)
    # gensim >= 4 removed Word2Vec.__getitem__/__contains__; lookups live on
    # `model.wv`. Fall back to the model itself for releases without `.wv`.
    keyed_vectors = getattr(model, 'wv', model)
    known_vecs = [keyed_vectors[w] for w in words if w in keyed_vectors]
    if not known_vecs:
        # np.mean over an empty array yields a lone NaN (plus a RuntimeWarning),
        # which would give this review a malformed, NaN-filled feature row.
        return pd.Series(np.zeros(model.vector_size))
    return pd.Series(np.mean(np.array(known_vecs), axis=0))

In [10]:
# Embed every labeled review: one row per review, one column per vector component.
train_vecs_df = train_df['review'].apply(review2vec)
train_vecs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.765362,0.169056,-0.119254,-0.255037,0.068678,0.139467,0.075863,-0.127629,-0.020057,-0.148682,...,0.493227,0.053989,-0.289202,-0.189775,0.096707,-0.079156,-0.454016,-0.175191,0.060883,0.142735
1,0.639315,0.162649,-0.090817,-0.440241,-0.225265,0.38664,-0.183629,0.335586,-0.292449,-0.273786,...,0.268585,0.416167,0.39838,-0.207033,-0.164725,-0.13003,-0.541667,0.022986,0.083346,-0.027541
2,-0.149412,0.028493,-0.187868,0.149881,-0.04422,0.17272,-0.20514,0.016858,0.067282,-0.209686,...,0.35265,-0.001417,-0.055466,-0.213512,-0.006312,0.158624,-0.259476,0.131019,0.068329,-0.097889
3,0.471528,-0.111306,-0.063871,-0.19594,-0.155415,0.09713,-0.090134,0.032448,-0.05255,-0.167758,...,0.216325,0.270397,-0.212374,-0.455067,-0.064654,-0.025269,-0.217929,-0.102075,0.094986,0.193019
4,0.120104,-0.036678,-0.228993,-0.033221,0.037108,0.247308,0.004479,-0.119206,0.166998,-0.2236,...,0.260528,-0.012126,-0.097311,-0.087746,0.147706,-0.025683,-0.0308,0.03668,0.081897,0.046309


In [11]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
# Pin the seed: without it the forest is built from different random draws on
# every run, so the predictions written to disk are not reproducible.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_vecs_df, train_df.sentiment)

In [13]:
# Test reviews to be scored; the preview below shows columns id and review (no sentiment).
test_df = load_data('testData')
test_df.head()

number of testData's reviews: 25000



Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [14]:
# Embed every test review the same way as the training reviews.
test_vecs_df = test_df['review'].apply(review2vec)
test_vecs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.394929,-0.066407,-0.292188,-0.300689,-0.068416,0.167103,-0.174729,-0.131822,0.269697,-0.191407,...,0.064424,0.190904,-0.281705,-0.518753,0.085965,-0.210084,-0.338349,0.033936,0.016524,-0.104246
1,0.871469,-0.048788,-0.1363,-0.437894,0.033967,0.011201,-0.092705,0.105032,0.065724,-0.082746,...,0.276626,0.211623,-0.407444,0.013311,0.287256,-0.018219,-0.474055,0.004771,-0.07449,0.020558
2,0.777464,0.103418,0.090207,-0.30557,-0.119563,0.242119,-0.122845,-0.095536,-0.045884,-0.063626,...,0.289112,0.089045,-0.570924,-0.353143,-0.024407,-0.162625,-0.335202,0.046614,-0.135819,0.169821
3,0.787207,-0.00714,-0.31073,-0.155438,0.130239,0.206179,-0.363051,-0.072915,0.009724,-0.29106,...,0.371374,0.23961,-0.340428,-0.260182,0.26023,0.00784,-0.305682,0.127472,0.040071,-0.121198
4,0.199518,-0.160906,-0.101723,-0.378352,-0.350101,0.277474,-0.201799,-0.003923,0.061855,-0.232506,...,0.315938,0.364808,-0.08731,-0.012514,-0.15478,-0.228036,-0.288455,0.060302,-0.108066,0.091404


In [15]:
# Predict a sentiment label for every test review with the fitted random forest.
test_predic = forest.predict(test_vecs_df)

In [18]:
# Random-forest submission: one (id, sentiment) row per test review.
output = pd.DataFrame({
    'id': test_df['id'],
    'sentiment': test_predic,
})
output.to_csv('word2vec-model-output.csv', index=False)

In [19]:
# NumPy-array versions of the two feature frames.
# NOTE(review): neither name is used by the cells visible in this section --
# the classifiers below are fitted on the DataFrames directly.
train_vecs_np = train_vecs_df.values
test_vecs_np = test_vecs_df.values

In [22]:
from sklearn.neural_network import MLPClassifier

In [26]:
mlp_clf1 = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='lbfgs', alpha=1e-4, verbose=True)
%time mlp_clf1 = mlp_clf1.fit(train_vecs_df, train_df.sentiment)

CPU times: user 43.5 s, sys: 5.32 s, total: 48.8 s
Wall time: 1min


In [27]:
# Predict test-set sentiment with the one-hidden-layer MLP (mlp_clf1).
test_predic1 = mlp_clf1.predict(test_vecs_df)

In [28]:
# Submission for the one-hidden-layer MLP. This must use test_predic1 (the
# mlp_clf1 predictions); test_predic holds the random-forest predictions and
# would make this file a copy of the forest submission.
output1 = pd.DataFrame({'id': test_df.id, 'sentiment': test_predic1})
output1.to_csv('word2vec-mlp-output.csv', index=False)

In [29]:
mlp_clf2 = MLPClassifier(hidden_layer_sizes=(100,50), activation='relu', solver='lbfgs', alpha=1e-4, verbose=True)
%time mlp_clf2 = mlp_clf2.fit(train_vecs_df, train_df.sentiment)


CPU times: user 53.5 s, sys: 5.24 s, total: 58.7 s
Wall time: 1min 11s


NameError: name 'mlpmlp_clf2_clf1' is not defined

In [31]:
# Predict test-set sentiment with the two-hidden-layer MLP (mlp_clf2).
test_predic2 = mlp_clf2.predict(test_vecs_df)

In [32]:
# Submission for the two-hidden-layer MLP. This must use test_predic2 (the
# mlp_clf2 predictions); test_predic holds the random-forest predictions.
# The name `output1` is rebound here, as in the original cell.
output1 = pd.DataFrame({'id': test_df.id, 'sentiment': test_predic2})
output1.to_csv('word2vec-mlp-output2.csv', index=False)