In [3]:
import numpy as np
import pandas as pd
import os , re
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import spacy
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

In [14]:
# Read the raw reviews, one review per line, stripping surrounding whitespace.
# `with` guarantees the file handles are closed once the lists are built.
with open(r"movie_data/full_train.txt", 'r', encoding='utf-8') as train_file:
    train_review = [line.strip() for line in train_file]

with open(r"movie_data/full_test.txt", 'r', encoding='utf-8') as test_file:
    test_review = [line.strip() for line in test_file]

# Labels by position: the first 12,500 entries are 1 and the remaining 12,500 are 0.
# NOTE(review): assumes each file lists 12,500 positive reviews followed by
# 12,500 negative ones -- confirm against the data files.
target = [1 if i < 12500 else 0 for i in range(25000)]

In [6]:
# Text deleted outright: . ; : ! ? , " ( ) [ ] and any run of digits.
# Raw strings keep the regex escapes from being parsed as (invalid) string escapes.
no_space = re.compile(r'[.;:!?,"()\[\]]|\d+')
# Text replaced by a single space: a doubled <br /> tag, hyphens and slashes.
space = re.compile(r"<br\s*/><br\s*/>|[-/]")
rp1 = ""
rp2 = " "


def preprocess(reviews):
    """Lower-case and strip punctuation/markup from each review.

    Args:
        reviews: iterable of strings.

    Returns:
        A list with one cleaned string per input review. Each review is
        lower-cased, then everything matched by ``no_space`` is removed, then
        everything matched by ``space`` is replaced by a single space.
        Whitespace is not collapsed, so consecutive replacements can leave
        runs of spaces.
    """
    return [space.sub(rp2, no_space.sub(rp1, line.lower())) for line in reviews]

# Apply the same cleaning rules to the training corpus, then to the test corpus.
train_clean, test_clean = (
    preprocess(corpus) for corpus in (train_review, test_review)
)

In [15]:
# Baseline features: binary bag-of-words (a term is either present or absent),
# with the vocabulary learned from the cleaned training reviews only.
cv = CountVectorizer(binary=True)
x = cv.fit_transform(train_clean)
x_test = cv.transform(test_clean)
# Hold out 15% of the training rows for validation. The fixed seed makes the
# split, and therefore the reported accuracy, reproducible across runs.
x_train, x_val, y_train, y_val = train_test_split(
    x, target, test_size=0.15, random_state=42
)

In [16]:
# Baseline classifier: logistic regression with C=0.05, fitted on the training
# split and scored on the held-out validation split.
lg = LogisticRegression(C=0.05).fit(x_train, y_train)
lg_pred = lg.predict(x_val)
print(
    f"Accuracy: {accuracy_score(y_val,lg_pred)}"
)

Accuracy: 0.8938666666666667


# Removing Stop Words

In [None]:
def rm_sw(corpus, stop_words=None):
    """Remove stop words from every document in ``corpus``.

    Args:
        corpus: iterable of strings; each one is split on whitespace.
        stop_words: optional collection of words to drop. When omitted, the
            NLTK English stop-word list is used.

    Returns:
        A list with one string per input document, containing the surviving
        tokens joined by single spaces. Matching is exact (case-sensitive).
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # Build the lookup once: this avoids calling stopwords.words() for every
    # token and makes each membership check a constant-time set lookup.
    stop_set = frozenset(stop_words)
    return [
        " ".join(token for token in doc.split() if token not in stop_set)
        for doc in corpus
    ]

# Stop-word experiment: same binary bag-of-words + logistic regression as the
# baseline, but with stop words stripped from the cleaned reviews first.
sw_train = rm_sw(train_clean)
sw_test = rm_sw(test_clean)
cv = CountVectorizer(binary=True)
cv.fit(sw_train)
# The training matrix must come from the training reviews: `target` is paired
# with it in the split below, so building it from sw_test would fit the model
# on test documents.
x = cv.transform(sw_train)
x_test = cv.transform(sw_test)
# Fixed seed so the 75/25 split and the reported accuracy are reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x, target, train_size=0.75, random_state=42
)
lr = LogisticRegression(C=0.05)
lr.fit(x_train, y_train)
print ("Accuracy: %s" 
       % (accuracy_score(y_val, lr.predict(x_val))))

In [20]:
# Porter stemmer instance kept at module level; the `stemming` helper reads this name.
stemmer = PorterStemmer()

In [24]:
def stemming(corpus):
    """Stem every whitespace-separated token of every document in ``corpus``.

    Uses the module-level ``stemmer`` object and returns a list with one
    string per input document, its stemmed tokens joined by single spaces.
    """
    stemmed_docs = []
    for line in corpus:
        stems = [stemmer.stem(token) for token in line.split()]
        stemmed_docs.append(" ".join(stems))
    return stemmed_docs
# Stemming experiment: binary bag-of-words over stemmed tokens, then the same
# logistic regression settings as the other experiments.
stemmed_train = stemming(train_clean)
stemmed_test = stemming(test_clean)
cv = CountVectorizer(binary=True)
cv.fit(stemmed_train)
X = cv.transform(stemmed_train)
X_test = cv.transform(stemmed_test)
# Fixed seed so the 75/25 split and the reported accuracy are reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)
lr = LogisticRegression(C=0.05)
lr.fit(X_train, y_train)
print ("Accuracy: %s" 
       % (accuracy_score(y_val, lr.predict(X_val))))

Accuracy: 0.8776


In [27]:
# TF-IDF experiment: weight terms by TF-IDF instead of binary presence, with
# the vocabulary and IDF statistics learned from the training reviews only.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(train_clean)
X = tfidf_vectorizer.transform(train_clean)
X_test = tfidf_vectorizer.transform(test_clean)
# Fixed seed so the 75/25 split and the reported accuracy are reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)
lr = LogisticRegression(C=0.05)
lr.fit(X_train, y_train)
print ("Accuracy: %s" 
       % accuracy_score(y_val, lr.predict(X_val)))

Accuracy: 0.81968


In [29]:
# N-gram experiment: binary presence of unigrams and bigrams, classified with
# a linear SVM instead of logistic regression.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(train_clean)
X = ngram_vectorizer.transform(train_clean)
X_test = ngram_vectorizer.transform(test_clean)
# Fixed seed so the 75/25 split and the reported accuracy are reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)
svm = LinearSVC(C=1.0)
svm.fit(X_train, y_train)
print ("Accuracy: %s" 
       % accuracy_score(y_val, svm.predict(X_val)))



Accuracy: 0.89184


