In [150]:
# Importing important libraries and functions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords

In [151]:
# Load the labelled training reviews, the unlabelled reviews to be predicted,
# and the sample submission file.
# quoting=3 is csv.QUOTE_NONE: quote characters in the tab-separated files are
# kept as literal text instead of being interpreted as field delimiters.
tsv_options = {"delimiter": "\t", "quoting": 3}
train_data = pd.read_csv('labeledTrainData.tsv', **tsv_options)
test_data = pd.read_csv('testData.tsv', **tsv_options)
submission1 = pd.read_csv("sampleSubmission.csv")

In [152]:
# Preview the first five rows of the training data
train_data.head(n=5)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [153]:
# Checking for any missing values in the training data:
# total count of null cells across every column
train_data.isna().sum().sum()     # Zero missing values

0

In [154]:
# The training data has three columns: "sentiment" is the target variable and
# "id" is not needed for training, so only the "review" text is pre-processed.
# Each review is a paragraph containing HTML tags, punctuation, numbers and
# stop words (is, are, am, ...), all of which have to be cleaned out.
# The cleaning function defined in this cell uses BeautifulSoup, regular
# expressions and the Natural Language Toolkit (nltk) to do that.

# Target variable as a NumPy array
train_y = train_data["sentiment"].to_numpy(copy=True)

# The stop-word vocabulary has to be downloaded from nltk before it can be used
nltk.download("stopwords")

# English stop words, loaded from the NLTK corpus on first use and then reused,
# so the corpus is read once instead of once per review.
_STOPWORD_CACHE = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        # A (frozen)set gives constant-time membership tests.
        _STOPWORD_CACHE = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE


def process(review):
    """Clean one raw review and return it as a single space-separated string.

    Steps, in order:
      1. strip HTML markup, keeping only the text;
      2. replace every character that is not an ASCII letter with a space
         (this removes punctuation and digits);
      3. lowercase the text and split it on whitespace;
      4. drop English stop words;
      5. join the remaining words with single spaces.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    str
        The cleaned review; an empty string if no words remain.
    """
    # Naming the parser explicitly keeps the result independent of which
    # optional parsers (lxml, html5lib) happen to be installed and avoids
    # BeautifulSoup's "no parser was explicitly specified" warning.
    text = BeautifulSoup(review, "html.parser").get_text()
    # review without punctuation and numbers
    text = re.sub("[^a-zA-Z]", ' ', text)
    # lowercase, then split into words so stop words can be filtered out
    words = text.lower().split()
    swords = _english_stopwords()
    # rejoin the remaining words with single spaces
    return " ".join(w for w in words if w not in swords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [155]:
# Clean every training review with process(), printing a progress message
# after each 1000 reviews.

train_x = []
for count, raw_review in enumerate(train_data["review"], start=1):
    if count % 1000 == 0:
        print("No of reviews processed =", count)
    train_x.append(process(raw_review))


No of reviews processed = 1000
No of reviews processed = 2000
No of reviews processed = 3000
No of reviews processed = 4000
No of reviews processed = 5000
No of reviews processed = 6000
No of reviews processed = 7000
No of reviews processed = 8000
No of reviews processed = 9000
No of reviews processed = 10000
No of reviews processed = 11000
No of reviews processed = 12000
No of reviews processed = 13000
No of reviews processed = 14000
No of reviews processed = 15000
No of reviews processed = 16000
No of reviews processed = 17000
No of reviews processed = 18000
No of reviews processed = 19000
No of reviews processed = 20000
No of reviews processed = 21000
No of reviews processed = 22000
No of reviews processed = 23000
No of reviews processed = 24000
No of reviews processed = 25000


In [156]:
# The cleaned training set is still text; a model needs numerical features.
# A bag-of-words representation counts how often each vocabulary word occurs
# in a review; the vocabulary is limited to the 5000 most frequent words.
# Note: train_x enters this cell as a list of cleaned strings and leaves it as
# a dense count array, so the previous cell must be re-run before this one.

# CountVectorizer is scikit-learn's bag-of-words tool
vectorizer = CountVectorizer(max_features=5000)
# fit_transform learns the vocabulary and turns the reviews into a sparse
# count matrix; toarray() converts that matrix to a dense NumPy array
train_x = vectorizer.fit_transform(train_x).toarray()


In [157]:
# Shapes of the final feature matrix and target vector
(train_x.shape, train_y.shape)

((25000, 5000), (25000,))

In [158]:
# Preview the first five rows of the data to be predicted
test_data.head(n=5)

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [159]:
# Checking for any missing values in the data to be predicted:
# total count of null cells across every column
test_data.isna().sum().sum()   # No missing values

0

In [160]:
# Clean every review in the data to be predicted with process(), printing a
# progress message after each 1000 reviews.

test = []
for count, raw_review in enumerate(test_data["review"], start=1):
    if count % 1000 == 0:
        print("No of reviews processed =", count)
    test.append(process(raw_review))


No of reviews processed = 1000
No of reviews processed = 2000
No of reviews processed = 3000
No of reviews processed = 4000
No of reviews processed = 5000
No of reviews processed = 6000
No of reviews processed = 7000
No of reviews processed = 8000
No of reviews processed = 9000
No of reviews processed = 10000
No of reviews processed = 11000
No of reviews processed = 12000
No of reviews processed = 13000
No of reviews processed = 14000
No of reviews processed = 15000
No of reviews processed = 16000
No of reviews processed = 17000
No of reviews processed = 18000
No of reviews processed = 19000
No of reviews processed = 20000
No of reviews processed = 21000
No of reviews processed = 22000
No of reviews processed = 23000
No of reviews processed = 24000
No of reviews processed = 25000


In [161]:
# Turn the cleaned test reviews into bag-of-words count vectors using the
# vocabulary already learned from the training data (transform only, no
# re-fitting), then convert the sparse result to a dense array.
test = vectorizer.transform(test).toarray()

In [162]:
# Shape of the final feature matrix for the data to be predicted
# (rows = reviews, columns = vocabulary words kept by the vectorizer)
test.shape

(25000, 5000)

In [163]:
# Create the model, fit it, and measure how well it separates the classes.

# random_state makes the forest (and therefore the submission) reproducible.
# oob_score=True keeps each sample's out-of-bag votes so an estimate on data
# the individual trees did not see can be computed without a separate split.
# n_jobs=-1 builds the trees on all available cores.
model = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
model.fit(train_x, train_y)

# Hard 0/1 predictions on the training data
train_predict = model.predict(train_x)

# ROC AUC is a ranking metric, so it is scored with the predicted probability
# of the positive class (column 1 corresponds to label 1) rather than with
# hard 0/1 labels.
train_proba = model.predict_proba(train_x)[:, 1]
AUC = roc_auc_score(train_y, train_proba)

# AUC above is measured on the very data the forest was fit on, so it is
# optimistic. The out-of-bag AUC scores every review using only the trees that
# did not see it during fitting and is the more honest estimate.
OOB_AUC = roc_auc_score(train_y, model.oob_decision_function_[:, 1])
print("Out-of-bag AUC =", OOB_AUC)

In [164]:
# Display the ROC AUC computed in the previous cell.
# NOTE: this score is measured on the same data the model was fit on, so it
# is not an estimate of performance on unseen reviews.
AUC   

1.0

In [171]:
# Predict the sentiment label of every review in the data to be predicted
submission2 = model.predict(X=test)

In [166]:
# Convert the predicted array to a DataFrame (its single column is named 0)
# and place it side by side with the sample submission
submission2 = pd.DataFrame(submission2)
submission = pd.concat([submission1, submission2], axis="columns")

In [167]:
# Build the final two-column submission: the ids from the sample submission
# (its placeholder "sentiment" column removed) next to the predictions, whose
# column is renamed from 0 to "sentiment".
# The frame is rebuilt from submission1 and submission2 instead of mutating
# `submission` in place, so re-running this cell always gives the same result
# (an in-place drop followed by a rename would delete the predicted column on
# a second run).
submission = pd.concat(
    [
        submission1.drop(columns=["sentiment"]),
        pd.DataFrame(submission2).rename(columns={0: "sentiment"}),
    ],
    axis=1,
)

In [168]:
# Display the final submission frame (review id and predicted sentiment)
submission

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,1
4,12128_7,1
...,...,...
24995,2155_10,1
24996,59_10,1
24997,2531_1,0
24998,7772_8,1


In [169]:
# Saving the final submission file in csv format.
# index=False keeps the DataFrame's row index out of the file, so the csv
# contains exactly the two columns "id" and "sentiment".
submission.to_csv("Submission.csv", index=False)