In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
labeled_train_df = pd.read_csv('../Bags_of_words_meets_Bags_of_popcorn/datasets/labeled_train.tsv',sep="\t")
test_df = pd.read_csv('../Bags_of_words_meets_Bags_of_popcorn/datasets/testData.tsv',sep="\t")

## Simple dataframe exploration.

In [16]:
print(labeled_train_df.shape)
print(test_df.shape)

(25000, 3)
(25000, 2)


In [18]:
labeled_train_df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [20]:
test_df.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [21]:
labeled_train_df.isnull().sum()

id           0
sentiment    0
review       0
dtype: int64

In [23]:
test_df.isnull().sum()

id        0
review    0
dtype: int64

## Preprocessing the text 
including removal of stopwords, converting to lower case and removing unwanted characters

In [29]:
from nltk.corpus import stopwords
import re
import string

In [53]:
def preprocessing(text):
    text = text.lower().split()
    stopword = set(stopwords.words('english'))
    preprocessed_text = " ".join([word for word in text if word not in stopword])
    preprocessed_text = re.sub(r"<.*?>", "", preprocessed_text)
    preprocessed_text = re.sub(r"http\S+|www\S+|https\S+", "", preprocessed_text)
    preprocessed_text = re.sub(r"\d+", "", preprocessed_text)
    preprocessed_text = re.sub(f"[{re.escape(string.punctuation)}]", "", preprocessed_text)
    return(preprocessed_text)
    

In [54]:
labeled_train_df_clean = []
test_df_clean = []
for review in labeled_train_df.review:
    labeled_train_df_clean.append(preprocessing(review))
for review in test_df.review:
    test_df_clean.append(preprocessing(review))

In [59]:
print(labeled_train_df_clean[0])
print(labeled_train_df.review[0])

stuff going moment mj ive started listening music watching odd documentary there watched wiz watched moonwalker again maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mjs feeling towards press also obvious message drugs bad mkayvisually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice himthe actual feature film bit finally starts  minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond me mj overheard plans nah joe pescis character ranted wanted people know supplying drugs etc dunno maybe hates mjs musiclots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence u

In [60]:
labeled_train_df['Processed'] = labeled_train_df_clean
test_df['Processed'] = test_df_clean

In [62]:
labeled_train_df.head(5)

Unnamed: 0,id,sentiment,review,Processed
0,5814_8,1,With all this stuff going down at the moment w...,stuff going moment mj ive started listening mu...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",the classic war worlds timothy hines entertain...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,film starts manager nicholas bell giving welco...
3,3630_4,0,It must be assumed that those who praised this...,must assumed praised film the greatest filmed ...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious s exp...


In [63]:
test_df.head(5)

Unnamed: 0,id,review,Processed
0,12311_10,Naturally in a film who's main themes are of m...,naturally film whos main themes mortality nost...
1,8348_2,This movie is a disaster within a disaster fil...,movie disaster within disaster film full great...
2,5828_4,"All in all, this is a movie for kids. We saw i...",all movie kids saw tonight child loved it one ...
3,7186_2,Afraid of the Dark left me with the impression...,afraid dark left impression several different ...
4,12128_7,A very accurate depiction of small time mob li...,accurate depiction small time mob life filmed ...


## Model building and Evaluation

In [65]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [66]:
X_train, X_val, y_train, y_val = train_test_split(
    labeled_train_df['Processed'], labeled_train_df['sentiment'], test_size=0.2, random_state=42)

In [67]:
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_val_bow = vectorizer.transform(X_val)

In [85]:
model_1 = LogisticRegression()
model_1.fit(X_train_bow, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [81]:
val_predictions_lo = model_1.predict(X_val_bow)
val_accuracy_LogisticRegression = accuracy_score(y_val, val_predictions_lo)
print("Validation Accuracy Logistic Regression:", val_accuracy_LogisticRegression)

Validation Accuracy Logistic Regression: 0.8558


In [74]:
from sklearn.naive_bayes import MultinomialNB

In [75]:
model = MultinomialNB()
model.fit(X_train_bow, y_train)

In [79]:
val_predictions_naive = model.predict(X_val_bow)

In [80]:
val_accuracy_naive_bayes = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy_naive_bayes)

Validation Accuracy: 0.8558


In [82]:
sample_sub = pd.read_csv('../Bags_of_words_meets_Bags_of_popcorn/datasets/sampleSubmission.csv')

In [87]:
output = vectorizer.transform(test_df['Processed'])

In [88]:
sub = model_1.predict(output)

In [89]:
submission = pd.DataFrame( data={"id":sample_sub['id'], "sentiment":sub} )
submission.to_csv('../Bags_of_words_meets_Bags_of_popcorn/datasets/sampleSubmission.csv', index=False)