In [None]:
## This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**This tutorial uses the Word2Vec NLP tutorial dataset and builds a bag-of-words baseline model for sentiment analysis.**

In [None]:
# Read the labelled training reviews: tab-separated, header on the first row.
# quoting=3 (csv.QUOTE_NONE) makes pandas treat quote characters as plain text.
train = pd.read_csv(
    "/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip",
    sep="\t",
    header=0,
    quoting=3,
)
train.head()

In [None]:
# Dimensions of the training frame as (rows, columns).
train.shape

In [None]:
# Column names of the training frame, returned as a NumPy array.
train.columns.values

In [None]:
# Print the first review exactly as loaded, before any cleaning is applied.
print(train["review"][0])

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Strip the HTML markup from the first review and show the remaining text.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser is installed (results can differ between machines) and emits a warning.
example = BeautifulSoup(train['review'][0], "html.parser")
print(example.get_text())

In [None]:
import re

# Replace every character that is not an ASCII letter with a space, so that
# punctuation and digits disappear while word boundaries are kept.
letters_only = re.sub(pattern="[^a-zA-Z]", repl=" ", string=example.get_text())
print(letters_only)

In [None]:
# Normalise case, then split on whitespace to obtain a list of word tokens.
lower_case = letters_only.lower()
words = lower_case.split()

In [None]:
import nltk
# nltk.download("stopwords")  # uncomment if the NLTK stopwords corpus is not already installed

In [None]:
from nltk.corpus import stopwords
# Display NLTK's English stop-word list.
print(stopwords.words("english"))

In [None]:
# Build the stop-word set once. Previously stopwords.words("english") was
# re-evaluated for every token and membership was tested against a list.
english_stops = set(stopwords.words("english"))
words = [w for w in words if w not in english_stops]
print(words)

In [None]:
def review_to_words(raw_review):
    """Convert one raw review into a cleaned, space-separated string of words.

    Steps: strip HTML markup, replace every non-letter character with a space,
    lower-case the text, split it on whitespace and drop English stop words.

    Args:
        raw_review: Review text, possibly containing HTML markup.

    Returns:
        The remaining words joined by single spaces ("" if nothing is left).
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (and to avoid bs4's parser warning).
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Substitute a SPACE, not an empty string: removing the characters outright
    # also removes the whitespace and glues the whole review into one token.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    # A set gives constant-time membership tests for the filter below.
    stops = set(stopwords.words("english"))
    return " ".join(w for w in words if w not in stops)

In [None]:
# Clean every training review, keeping the results in row order.
num_reviews = train["review"].size
clean_train_reviews = [
    review_to_words(train["review"][idx]) for idx in range(num_reviews)
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Word-level count vectorizer with the vocabulary capped at 5000 entries.
# No extra tokenizer, preprocessor or stop-word list is configured here.
vectorizer = CountVectorizer(
    max_features=5000,
    analyzer="word",
    stop_words=None,
    tokenizer=None,
    preprocessor=None,
)

In [None]:
# Learn the vocabulary from the cleaned training reviews and turn them into a
# dense array of word counts.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()
print(train_data_features.shape)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()
# print(vocab)

In [None]:
# Total count of each vocabulary word across all training reviews (column sums).
dist = train_data_features.sum(axis=0)
# To list each word with its count:
# for tag, count in zip(vocab, dist):
#     print(count, tag)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the bag-of-words features.
# random_state is fixed so that a re-run reproduces the same model and therefore
# the same predictions; without it every run produces a different submission.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, train["sentiment"])

In [None]:
# Read the unlabelled test reviews with the same parsing options as the
# training file (tab-separated, header row, quote characters kept as text).
test = pd.read_csv(
    "/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip",
    sep="\t",
    header=0,
    quoting=3,
)
print(test.shape)

In [None]:
# Start with an empty result list and record how many test reviews there are.
clean_test_reviews = []
num_reviews = test["review"].size

In [None]:
# Clean every test review, reporting progress every 1000 reviews.
# The count and the result list are (re)initialised here so the cell is
# idempotent: re-running it no longer appends a second copy of every review,
# and it no longer depends on `num_reviews`, a name the training cell also sets.
num_reviews = len(test["review"])
clean_test_reviews = []
for i in range(num_reviews):
    if (i + 1) % 1000 == 0:
        print("Review %d of %d\n" % (i+1, num_reviews))

    clean_review = review_to_words(test["review"][i])
    clean_test_reviews.append(clean_review)

In [None]:
# Encode the cleaned test reviews with the already-fitted vectorizer (transform
# only, no refit) and convert the result to a dense array.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

In [None]:
# Predict a sentiment label for each test review and write the id/sentiment
# pairs to a CSV file without an index column and without quoting (quoting=3).
result = forest.predict(test_data_features)
output = pd.DataFrame({"id": test["id"], "sentiment": result})
output.to_csv("Bag_of_words_model.csv", quoting=3, index=False)