In [None]:
#importing pandas and os
import pandas as pd
import os
#show which files are available in the input directory
input_files = os.listdir("../input")
print(input_files)

In [None]:
#load the labeled training reviews: tab-separated, first row is the header,
#quoting=3 (csv.QUOTE_NONE) so quote characters are not treated specially
train = pd.read_csv(
    "../input/labeledTrainData.tsv",
    header=0,
    sep="\t",
    quoting=3,
)

In [None]:
#first look at the data: preview the first five rows
train.head(n=5)

In [None]:
#dimensions of the training frame as (rows, columns)
train_dimensions = train.shape
print(train_dimensions)

In [None]:
#names of the columns in the training frame
column_names = train.columns.values
print(column_names)

In [None]:
#look at one raw review to see what kind of text has to be cleaned
print(train["review"][0])

In [None]:
#using BeautifulSoup to clean data initially
from bs4 import BeautifulSoup
##parse the raw text of the first review with the "html.parser" backend and keep the parsed document as example1
example1 = BeautifulSoup(train["review"][0], features="html.parser")

In [None]:
##.get_text() gives back only the text of the parsed document, without the html markup
#which reads better than the raw html review
review_plain_text = example1.get_text()
print(review_plain_text)

In [None]:
#removing numbers and punctuation
import re
# a leading '^' inside square brackets negates the character class,
# so this pattern matches every character that is NOT a letter (digits, punctuation, ...)
non_letter = re.compile("[^a-zA-Z]")
# each matched character is replaced by a space, leaving only the words
letters_only = non_letter.sub(" ", example1.get_text())
print(letters_only)

In [None]:
## lowercase the whole text so that a bag of words can be built later
lower_case = str.lower(letters_only)
# split on whitespace into a list of words; the most common words ("stop words") are removed from it afterwards
words = lower_case.split(None)

In [None]:
#importing stopwords from nltk
from nltk.corpus import stopwords
#the english stopword list provided by nltk
english_stopwords = stopwords.words("english")
print(english_stopwords)

In [None]:
##removing most common words from doc
# stopwords.words() is evaluated once here instead of once per word in the filter below,
# and a set gives constant-time membership checks instead of scanning a list
english_stop_set = set(stopwords.words("english"))
words = [w for w in words if w not in english_stop_set]
print(words)

In [None]:
# the above code cleans only one review , let's make a function from above code that can clean all the reviews
def review_to_words(raw_review, stop_words=None):
    """Convert one raw review into a string of lowercase, stopword-free words.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing html markup.
    stop_words : collection of str, optional
        Words to drop from the review. When omitted, nltk's english stopword
        list is used; it is loaded on the first call and cached on the function
        so that cleaning many reviews does not rebuild it for every review.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = getattr(review_to_words, "_english_stop_words", None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words("english"))
            review_to_words._english_stop_words = stop_words
    #remove html using BeautifulSoup
    review_text = BeautifulSoup(raw_review,"html.parser").get_text()
    #replace everything that is not a letter (numbers, punctuation, ...) with a space
    letters_only = re.sub("[^a-zA-Z]"," ",review_text)
    #lowercase and split on whitespace into a list of words
    words = letters_only.lower().split()
    #keep only the words that are not stopwords
    meaningful_words = [w for w in words if w not in stop_words]
    #return a string with only the words that are important
    return " ".join(meaningful_words)

In [None]:
#sanity check: run the cleaning function on the first review
trial_review = review_to_words(train["review"][0])
print(trial_review)

In [None]:
#how many reviews the training set holds
num_reviews = len(train["review"])
print("the number of reviews>>>>>>> :",num_reviews)

In [None]:
#clean every training review and collect the results in one list
clean_train_reviews = []
# iterate over the review values directly: train.review[i] is a label lookup, which only
# matches position i while the frame keeps its default 0..n-1 index
for i, raw_review in enumerate(train["review"]):
    clean_train_reviews.append(review_to_words(raw_review))
    # report progress every 1000 reviews instead of printing one line per review
    if (i + 1) % 1000 == 0:
        print("cleaned review number> ",i + 1,"Done")
print("cleaning is completed")

In [None]:
print("we are Creating a bag of words . . . . ")
#CountVectorizer turns a collection of text documents into token counts
from sklearn.feature_extraction.text import CountVectorizer
#tokenizer, preprocessor and stop_words are passed as None explicitly because the reviews
#were already cleaned above; max_features is set to 5000
vectorizer_options = {
    "analyzer": "word",
    "tokenizer": None,
    "preprocessor": None,
    "stop_words": None,
    "max_features": 5000,
}
vectorizer = CountVectorizer(**vectorizer_options)
#fit the vectorizer on the cleaned training reviews, compute their token counts,
#and convert the result to an array with .toarray()
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

In [None]:
#shape of the training feature array
feature_array_shape = train_data_features.shape
print(feature_array_shape)

In [None]:
#see all the features names
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back on older versions
if hasattr(vectorizer, "get_feature_names_out"):
    # the new method returns an array; keep vocab a plain list as before
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()
print(" , ".join(vocab[0:10])," . . . . "," , ".join(vocab[-10:]))

In [None]:
import numpy as np
#total count of each vocabulary word: sum the feature array along axis 0
dist = train_data_features.sum(axis=0)
#ct is not used by any of the visible cells; kept as in the original
ct = 0
#print every word together with its total, all on one line
for word, total in zip(vocab,dist):
    print(word,":",total,end=" ")

In [None]:
#collect, in order of first appearance, the distinct first characters of the vocabulary words
startswith = []
for word in vocab:
    first_char = word[0]
    if first_char in startswith:
        continue
    startswith.append(first_char)
print(startswith)

In [None]:
#count how many vocabulary words start with each character listed in startswith
# np.int was only an alias for the builtin int; it was deprecated in NumPy 1.20 and
# removed in 1.24, so the builtin is used directly (same resulting dtype)
counts = np.zeros((len(startswith)),dtype=int)
for val in vocab:
    index = startswith.index(val[0])
    counts[index] += 1
print(counts)

In [None]:
import matplotlib.pyplot as plt
#plot how many vocabulary words start with each first character
plt.figure(1,figsize=(15,5))
plt.plot(counts)
# one tick per entry of startswith: a fixed range(26) no longer lines up with the labels
# whenever the vocabulary does not have exactly 26 distinct first characters
nums = list(range(len(startswith)))
plt.xticks(nums,startswith)
plt.grid()
plt.xlabel("first letter")
plt.ylabel("frequency")
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
#random forest with 100 trees; random_state is fixed so repeated runs of the notebook
#train the same forest and give the same predictions
forest = RandomForestClassifier(n_estimators = 100, random_state = 42)
print("fitting RandomForest . . . ")
forest = forest.fit(train_data_features,train["sentiment"])

In [None]:
from sklearn.naive_bayes import MultinomialNB
print("fitting NaiveBayes . . . ")
#multinomial Naive Bayes fitted on the same features and sentiment labels
naive = MultinomialNB()
naive.fit(X=train_data_features, y=train["sentiment"])

In [None]:
from sklearn.ensemble import AdaBoostClassifier
#AdaBoost with 100 estimators; random_state is fixed so repeated runs build the same ensemble
adaboost = AdaBoostClassifier(n_estimators = 100, random_state = 42)
print("fitting AdaBoost . . . ")
adaboost.fit(train_data_features,train["sentiment"])
print("fitting complete.")

In [None]:
#load the test reviews with the same parsing options as the training file
test = pd.read_csv("../input/testData.tsv",header=0,delimiter="\t",quoting=3)
print("shape :",test.shape)
# DataFrame.info() prints its summary itself and returns None,
# so wrapping it in print() only added a stray "None" line
test.info()

In [None]:
num_reviews = len(test["review"])
print("Cleaning and parsing . . . . ")
#clean each test review with review_to_words and collect the results in a list
clean_test_reviews = [
    review_to_words(test["review"][position])
    for position in range(0,num_reviews)
]
print("processing complete.")

In [None]:
# use transform(), not fit_transform(): the classifiers were trained on columns defined by
# the vocabulary the vectorizer learned from the training reviews. Refitting on the test
# reviews would build a different vocabulary, so the columns would stand for different words
# and the predictions would be meaningless.
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()
print("predicting using RandomForest . . . ..")
result1 = forest.predict(test_data_features)
print("predicting using Naive Bayes . . ... ")
result2 = naive.predict(test_data_features)
print("predicting using AdaBoost . . ... ")
result3 = adaboost.predict(test_data_features)
print("process completed :) ")

In [None]:
# element-wise sum of the three prediction arrays
# assumes every classifier predicts 0 or 1, so the sum is the number of positive votes (0..3)
votes = result1+result2+result3
# majority vote: 1 when at least two of the three classifiers voted 1, otherwise 0.
# Vectorised, so it works for any number of test reviews instead of a hard-coded 25000.
result = (votes >= 2).astype(int)
#write the submission file with one row per test review
output = pd.DataFrame(data = {"id":test["id"],"sentiment":result})
output.to_csv("Submit_output.csv", index=False, quoting=3)