In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords


# Load the dataset

In [None]:
# Labelled training reviews: tab-separated, first row is the header, and
# quoting=3 (csv.QUOTE_NONE) leaves double quotes in the text untouched.
train = pd.read_csv(
    '../input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip',
    header=0,
    delimiter="\t",
    quoting=3,
)
train

In [None]:
# Look at the raw text of the first review (index label 0) before any cleaning.
train['review'][0]

# Data cleaning and preprocessing

In [None]:
# Cache for the NLTK stopword set so it is built once, not once per review.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a set, loading them only on first use."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = set(stopwords.words('english'))
    return _STOPWORDS_CACHE["english"]


def review_to_words(raw_reviews, stops=None, parser="html.parser"):
    """Turn one raw review into a space-joined string of lower-case words.

    Parameters
    ----------
    raw_reviews : str
        Raw text of a single review; may contain HTML markup.
    stops : collection of str, optional
        Words to drop. Defaults to NLTK's English stopword list, which is
        loaded once and reused across calls.
    parser : str, optional
        Parser name handed to BeautifulSoup. Naming it explicitly keeps the
        result independent of which optional parsers are installed and avoids
        BeautifulSoup's "no parser was explicitly specified" warning.

    Returns
    -------
    str
        The remaining words joined by single spaces (empty string if none).
    """
    # Strip HTML markup, keeping only the text content.
    review_text = BeautifulSoup(raw_reviews, parser).get_text()
    # Replace every character that is not an ASCII letter with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # Lower-case, then split on whitespace into individual words.
    words = letters_only.lower().split()
    if stops is None:
        stops = _english_stopwords()
    meaningful_words = [word for word in words if word not in stops]
    return " ".join(meaningful_words)

In [None]:
# Clean every training review, keeping the original row order.
clean_train_reviews = [review_to_words(raw_review) for raw_review in train['review']]

In [None]:
# Preview the first three cleaned reviews.
clean_train_reviews[:3]

# Create a bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Word-level bag of words limited to 5000 vocabulary terms. The reviews are
# already cleaned, so the remaining CountVectorizer options stay at their defaults.
vectorizer = CountVectorizer(max_features=5000)

# Learn the vocabulary from the training reviews and encode them as a dense
# array of per-review word counts.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

In [None]:
# Dimensions of the dense count matrix: (number of reviews, vocabulary size).
train_data_features.shape

In [None]:
# Display the dense word-count matrix.
train_data_features

In [None]:
# Column-wise totals: how often each vocabulary term occurs across all
# training reviews.
dist = train_data_features.sum(axis=0)
print(dist)
# NOTE(review): to list each word next to its count, pair `dist` with the
# vectorizer's vocabulary (presumably vectorizer.get_feature_names_out() on
# recent scikit-learn - confirm for the installed version), e.g.
#     for tag, count in zip(vocab, dist):
#         print(count, tag)

# Split the Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Features are the bag-of-words counts; the target is the `sentiment` column.
X = train_data_features
y = train['sentiment']

# Hold out 20% of the labelled reviews for evaluation; the fixed seed keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Train the Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree random forest. random_state is fixed (matching the seed used for
# the train/test split) so the fitted model and the scores reported below are
# reproducible on a fresh run; n_jobs=-1 builds the trees on all available cores.
rf = RandomForestClassifier(n_estimators=100, random_state=1, n_jobs=-1)
rf = rf.fit(X_train, y_train)

# Test the model

In [None]:
# Predict sentiment for the held-out evaluation split.
y_pred_test = rf.predict(X_test)

In [None]:
# Fraction of held-out reviews whose predicted label equals the true label.
accuracy_score(y_test, y_pred_test)

We get around 84% accuracy on the held-out split (the exact figure can vary slightly between runs).

In [None]:
# Confusion matrix on the held-out split: rows are true labels, columns are predictions.
print(confusion_matrix(y_test, y_pred_test))

In [None]:
# Per-class precision, recall, F1-score and support on the held-out split.
print(classification_report(y_test, y_pred_test))

Verdict: our model is doing great, considering it is just a naive bag-of-words approach.

# Submit the prediction

In [None]:
# Test reviews: tab-separated, with quoting=3 (csv.QUOTE_NONE) so double
# quotes in the text are left untouched.
test = pd.read_csv(
    '../input/word2vec-nlp-tutorial/testData.tsv.zip',
    delimiter="\t",
    quoting=3,
)
test.shape

In [None]:
# Run every test review through the same cleaning function used for training.
clean_test_reviews = [review_to_words(raw_review) for raw_review in test["review"]]

In [None]:
# Encode the test reviews with the vocabulary already fitted on the training
# reviews (transform only, no refit), then convert to a dense NumPy array.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

In [None]:
# Predict sentiment for every test-set review with the fitted forest.
result = rf.predict(test_data_features)

In [None]:
# Submission table: the test review ids alongside their predicted sentiment.
output = pd.DataFrame(data={"id": test['id'], "sentiment":result})

In [None]:
# Write the submission file without the index column. quoting=3 is
# csv.QUOTE_NONE, so no quote characters are added around fields.
# NOTE(review): the ids were read with quoting=3 as well, so they presumably
# still carry their original double quotes - confirm against the expected format.
output.to_csv('submission.csv',index=False, quoting=3)

That's it! This tutorial is aimed at beginners; I will write one more notebook using deep learning.