In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under /kaggle/input.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [None]:
# Read the training reviews from the zipped, tab-separated file.
# header=0 takes the column names from the first row; quoting=3 is
# csv.QUOTE_NONE, so double quotes in the text are kept as ordinary characters.
train = pd.read_csv(
    "../input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip",
    sep="\t",
    header=0,
    quoting=3,
)
print(train.shape)
train.head()

In [None]:
# Show the raw text of the review stored at index label 1.
print(train.loc[1, 'review'])

In [None]:
from bs4 import BeautifulSoup
# Parse the raw review as HTML so that the markup can be stripped from it.
# The parser is named explicitly: when it is omitted BeautifulSoup picks
# whichever parser happens to be installed and emits GuessedAtParserWarning,
# so the result could differ between environments.
example1 = BeautifulSoup(train['review'][1], "html.parser")
example1.get_text()

In [None]:
import re
# Replace every character that is not an ASCII letter with a single space.
non_letter = re.compile("[^a-zA-Z]")
letters_only = non_letter.sub(" ", example1.get_text())
print(letters_only)

In [None]:
# Lower-case the letters-only text, then split it on whitespace into a list
# of individual word tokens.
lower_case = letters_only.lower()
words = lower_case.split()

In [None]:
import nltk
from nltk.corpus import stopwords
# Display the English stop-word list provided by NLTK.
stop_word_list = stopwords.words("english")
print(stop_word_list)

In [None]:
# Build the stop-word collection once, as a set. Calling
# stopwords.words("english") inside the comprehension would rebuild the list
# for every token and then search it linearly.
english_stop_words = set(stopwords.words("english"))
words = [w for w in words if w not in english_stop_words]
print(words)

In [None]:
# Join the remaining tokens back into one space-separated string and show it.
meaningful_words = " ".join(words)
meaningful_words

In [None]:
def review_to_words(review):
    """Convert one raw review into a cleaned, space-separated string of words.

    The text is stripped of HTML markup, every non-letter character is
    replaced by a space, the result is lower-cased and split on whitespace,
    and English stop words are removed.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The surviving words joined by single spaces; an empty string when no
        word survives the filtering.
    """
    # Name the parser explicitly: without it BeautifulSoup guesses one from
    # whatever is installed and emits GuessedAtParserWarning on every call.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    # A set gives constant-time membership tests for the filter below.
    stop_words = set(stopwords.words("english"))
    final_words = [w for w in words if w not in stop_words]
    return " ".join(final_words)
        

In [None]:
# Try the cleaning helper on the review stored at index label 1.
clean_review = review_to_words(train.loc[1, "review"])
clean_review

In [None]:
# Clean every training review, reporting progress after each 1000 reviews.
num_reviews = len(train["review"])
clean_train_reviews = []

for position, raw_review in enumerate(train["review"], start=1):
    clean_train_reviews.append(review_to_words(raw_review))
    if position % 1000 == 0:
        print("Review %d of %d" % (position, num_reviews))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer whose vocabulary is capped at 5000 terms.
vectorizer = CountVectorizer(max_features=5000)

# fit_transform learns the vocabulary and returns a sparse count matrix;
# toarray() turns it into a dense NumPy array.
train_features = vectorizer.fit_transform(clean_train_reviews).toarray()

In [None]:
# Vocabulary learned by the vectorizer, as a plain list of strings.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
#print(vocab)

In [None]:
# Total number of occurrences of each vocabulary term across all reviews.
dist = np.sum(train_features, axis=0)

# Preview the first 20 (term, count) pairs. Slicing avoids a manual counter;
# naming that counter `sum` would shadow the builtin for the rest of the
# kernel session.
for tag, count in zip(vocab[:20], dist[:20]):
    print(tag, count)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed so that repeated runs give the same model.
forest = RandomForestClassifier(random_state=0, verbose=0)

# Fit on the bag-of-words features against the sentiment labels.
train_labels = train['sentiment']
forest.fit(train_features, train_labels)

In [None]:
# Read the test reviews with the same parsing options as the training file:
# tab-separated, column names in the first row, quotes left untouched.
test = pd.read_csv(
    "../input/word2vec-nlp-tutorial/testData.tsv.zip",
    sep="\t",
    header=0,
    quoting=3,
)
print(test.shape)
test.head()

In [None]:
# Clean every test review, reporting progress after each 5000 reviews.
num_test_reviews = len(test)
clean_test_reviews = []

for position, raw_review in enumerate(test["review"], start=1):
    clean_test_reviews.append(review_to_words(raw_review))
    if position % 5000 == 0:
        print("Review %d of %d" % (position, num_test_reviews))


In [None]:
# Use transform (not fit_transform) so the test reviews are encoded with the
# vocabulary already learned from the training reviews, then densify.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

In [None]:
# Predict a sentiment label for every test review with the fitted forest.
result = forest.predict(X=test_data_features)

In [None]:
# Pair each test id with its predicted sentiment and write the submission
# file. quoting=3 is csv.QUOTE_NONE, so the writer adds no quote characters.
submission_columns = {'id': test['id'], 'sentiment': result}
output = pd.DataFrame(data=submission_columns)
output.to_csv("submission.csv", index=False, quoting=3)