In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip
/kaggle/input/word2vec-nlp-tutorial/sampleSubmission.csv
/kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip
/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip


In [2]:
!pip install BeautifulSoup4

Collecting BeautifulSoup4
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 894 kB/s 
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.2-py3-none-any.whl (33 kB)
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.9.3 soupsieve-2.2


In [3]:
# Load the labeled, unlabeled and test reviews. Each file is a zipped,
# tab-separated table with a header row; quoting=3 is csv.QUOTE_NONE, so
# quote characters in the reviews are kept as literal text.
train_df = pd.read_csv(
    '../input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip',
    header=0,
    delimiter='\t',
    quoting=3,
)
untrain_df = pd.read_csv(
    '../input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip',
    header=0,
    delimiter='\t',
    quoting=3,
)
test_df = pd.read_csv(
    '../input/word2vec-nlp-tutorial/testData.tsv.zip',
    header=0,
    delimiter='\t',
    quoting=3,
)

In [4]:
# Peek at the first three rows of the labeled training data.
train_df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [5]:
from bs4 import BeautifulSoup
import re

import nltk
from nltk.corpus import stopwords

In [6]:
# Keep the English stop words in a set: pre_process_textdata tests every
# token of every sentence for membership, and a set lookup is O(1) whereas
# scanning the original list is linear in the number of stop words.
stopwords_en = set(stopwords.words('english'))

In [7]:
def pre_process_textdata(text):
    """Turn one raw piece of review text into a list of lowercase word tokens.

    Processing steps, in order: strip HTML markup with BeautifulSoup, replace
    every character that is not an ASCII letter with a space, lowercase,
    split on whitespace, and drop the English stop words in ``stopwords_en``.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML tags such as ``<br />``.

    Returns
    -------
    list of str
        The remaining tokens in their original order (may be empty).
    """
    # Name the parser explicitly. Without it bs4 picks whichever parser
    # happens to be installed (and warns about it), so the extracted text
    # could differ between environments. 'html.parser' ships with Python.
    plain_text = BeautifulSoup(text.strip(), 'html.parser').get_text()

    # Digits and punctuation become spaces so they act as token separators.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', plain_text)

    return [word for word in letters_only.lower().split() if word not in stopwords_en]

In [8]:
# Load NLTK's English Punkt tokenizer resource; it is used below to split
# each review into sentences before word-level preprocessing.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
# Use the first labeled review as a worked example and show its raw text.
sample_text = train_df['review'][0]
print(sample_text)

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [10]:
# Split the sample review into sentences with the Punkt tokenizer.
sentences = tokenizer.tokenize(sample_text)
print(sentences)

['"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again.', 'Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent.', 'Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released.', "Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring.", 'Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when

In [11]:
# Clean each sample sentence into its list of word tokens.
preprocessed_sentences = [pre_process_textdata(sentence) for sentence in sentences]
print(preprocessed_sentences)

[['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker'], ['maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent'], ['moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released'], ['subtle', 'messages', 'mj', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'kay', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring'], ['may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psychopathic', 'powerful', 'drug',

In [12]:
def get_tokenized_sentences(input_text):
    """Split a review into sentences and clean each one into word tokens.

    The text is split with the module-level ``tokenizer`` and every
    resulting sentence is passed through ``pre_process_textdata``.

    Returns a list with one token list per sentence.
    """
    return [
        pre_process_textdata(sentence)
        for sentence in tokenizer.tokenize(input_text)
    ]

In [13]:
# Gather the tokenized sentences of every labeled and unlabeled review
# (labeled first) into one flat list; this is the Word2Vec training corpus.
sentences = []

for frame in (train_df, untrain_df):
    for review in frame['review']:
        sentences.extend(get_tokenized_sentences(review))

print(len(sentences))



795538


In [14]:
from gensim.models import word2vec

# Train Word2Vec on the tokenized sentences. Per the gensim 3.x API:
#   size      - dimensionality of the word vectors
#   min_count - words seen fewer than this many times are ignored
#   window    - maximum distance between the current and predicted word
#   sample    - down-sampling threshold for very frequent words
model = word2vec.Word2Vec(sentences, size=300, min_count=40, window=10, sample=1e-3)
# Per the gensim docs, init_sims(replace=True) L2-normalizes the vectors and
# discards the originals to save memory; the model cannot be trained further.
model.init_sims(replace=True)

In [15]:
# Query through model.wv (the KeyedVectors): calling doesnt_match on the
# model object itself is deprecated and triggers a DeprecationWarning.
model.wv.doesnt_match(['man', 'woman', 'dog', 'kid'])

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'dog'

In [16]:
# Query through model.wv (the KeyedVectors): calling most_similar on the
# model object itself is deprecated and triggers a DeprecationWarning.
model.wv.most_similar('germany')

  """Entry point for launching an IPython kernel.


[('soviet', 0.862610399723053),
 ('europe', 0.8514181971549988),
 ('poland', 0.8375749588012695),
 ('russia', 0.8347918391227722),
 ('wwi', 0.8184590339660645),
 ('occupation', 0.8145135641098022),
 ('wwii', 0.8134337067604065),
 ('spain', 0.8059555888175964),
 ('revolution', 0.80049729347229),
 ('france', 0.7972924709320068)]

In [17]:
# Look the vector up through model.wv: indexing the model object directly
# is deprecated and triggers a DeprecationWarning.
model.wv['king'].shape

  """Entry point for launching an IPython kernel.


(300,)

In [18]:
import numpy as np

In [19]:
# Words known to the trained Word2Vec model, stored as a set so that
# makefeaturevector can test membership quickly.
word_index_word2vecmodel = set(model.wv.index2word)

In [20]:
def makefeaturevector(words, word_vectors=None, vocabulary=None, num_features=300):
    """Average the word vectors of the in-vocabulary tokens of one review.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single review.
    word_vectors : mapping from str to array, optional
        Anything indexable by word that yields a vector of length
        ``num_features``. Defaults to ``model.wv`` of the notebook's
        Word2Vec model, looked up at call time.
    vocabulary : container of str, optional
        Words that have a vector. Defaults to the notebook-level
        ``word_index_word2vecmodel`` set.
    num_features : int, optional
        Length of each word vector (300 for the model trained here).

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``num_features``: the mean of the matched
        word vectors, or all zeros when no token is in the vocabulary.
    """
    if word_vectors is None:
        # model.wv instead of the deprecated model[word] lookup.
        word_vectors = model.wv
    if vocabulary is None:
        vocabulary = word_index_word2vecmodel

    feature_vector = np.zeros(num_features, dtype='float32')
    nwords = 0

    for word in words:
        if word in vocabulary:
            nwords += 1
            feature_vector += word_vectors[word]

    # Guard against 0/0: a review with no known words would otherwise become
    # a vector of NaNs, which the downstream classifier cannot handle.
    if nwords == 0:
        return feature_vector

    return np.divide(feature_vector, nwords)

In [21]:
def makefeaturevec_reviews(reviews):
    """Build a feature matrix with one averaged word vector per review.

    ``reviews`` is a sized sequence of token lists. Row ``i`` of the returned
    ``(len(reviews), 300)`` float32 array holds ``makefeaturevector`` applied
    to the i-th review.
    """
    matrix = np.zeros((len(reviews), 300), dtype='float32')

    # Iterating a 2-D array yields row views, so writing into `row` fills
    # the matrix in place.
    for row, tokens in zip(matrix, reviews):
        row[:] = makefeaturevector(tokens)

    return matrix

In [22]:
# Clean every labeled and test review into its list of word tokens
# (whole review at once, no sentence splitting).
preprocessed_traindata = [pre_process_textdata(review) for review in train_df['review']]
preprocessed_testdata = [pre_process_textdata(review) for review in test_df['review']]

In [23]:
# Convert the tokenized reviews into (n_reviews, 300) float32 matrices of
# averaged word vectors, one row per review.
traindata_features = makefeaturevec_reviews(preprocessed_traindata)
testdata_features = makefeaturevec_reviews(preprocessed_testdata)

  if __name__ == '__main__':


In [24]:
from sklearn.ensemble import RandomForestClassifier
# Fit the sentiment classifier on the averaged word-vector features.
# random_state is fixed so that "Restart & Run All" reproduces the same forest.
# NOTE: this rebinds `model`, which until here was the Word2Vec model that
# makefeaturevector reads; the feature cells must be run before this one.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(traindata_features, train_df['sentiment'])

In [25]:
# Predict a sentiment label for every test review.
result = model.predict(testdata_features)

# Write the test results: one row per test id with its predicted sentiment.
# quoting=3 (csv.QUOTE_NONE) writes the values without adding any quotes.
output = pd.DataFrame({"id": test_df["id"], "sentiment": result})
output.to_csv("submission.csv", quoting=3, index=False)