## Module Imports

In [6]:
import numpy as np
import pandas as pd
import os
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import scipy.sparse

# Seed for reproducible results: it is passed as random_state to
# train_test_split so the train/validation split is identical on every run.
seed = 42

## Read, clean and remove stop words from data

In [2]:
# Read one review per line, stripping surrounding whitespace. The `with`
# blocks guarantee the file handles are closed once the lines are consumed
# (iterating over a bare open(...) left them open until garbage collection).
with open('./aclImdb/movie_data/full_train.txt', 'r', encoding="utf8") as train_file:
    reviews_train = [line.strip() for line in train_file]

with open('./aclImdb/movie_data/full_test.txt', 'r', encoding="utf8") as test_file:
    reviews_test = [line.strip() for line in test_file]

# Labels are positional: 1 for the first 12500 lines, 0 for the remaining
# 12500. NOTE(review): assumes the file lists one class first, then the
# other - confirm against how full_train.txt was generated.
target = [1 if i < 12500 else 0 for i in range(25000)]

# Fail early with a clear message; a mismatch would otherwise only surface
# later as an inconsistent-sample-count error inside train_test_split.
assert len(reviews_train) == len(target), (
    f"expected {len(target)} training reviews, found {len(reviews_train)}"
)


import re

# Raw string literals keep the regex escapes (\., \d, \s, ...) intact. The
# non-raw originals relied on Python passing unknown string escapes through
# unchanged, which emits DeprecationWarning/SyntaxWarning on current Python.
# The compiled patterns are unchanged.

# Matches removed outright: . ; : ! ? , " ( ) [ ] and runs of digits.
REPLACE_NO_SPACE = re.compile(r'(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(")|(\()|(\))|(\[)|(\])|(\d+)')
# Matches replaced by a space: a doubled <br /> tag, hyphens and slashes.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    """Lower-case every review and normalise its punctuation.

    Each string in ``reviews`` is lower-cased, then every match of
    ``REPLACE_NO_SPACE`` is replaced with ``NO_SPACE`` and, after that, every
    match of ``REPLACE_WITH_SPACE`` is replaced with ``SPACE``.

    Returns a new list with one cleaned string per input review, in the
    original order.
    """
    cleaned = []
    for raw_review in reviews:
        without_punctuation = REPLACE_NO_SPACE.sub(NO_SPACE, raw_review.lower())
        cleaned.append(REPLACE_WITH_SPACE.sub(SPACE, without_punctuation))
    return cleaned

# Lower-case and strip/replace punctuation in both review sets using the same
# preprocessing function, so train and test text are normalised identically.
reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer and English stop-word list read by
# remove_stop_words_stemmer.
# NOTE(review): stopwords.words('english') presumably requires the NLTK
# 'stopwords' corpus to be available locally - confirm for fresh environments.
stemmer = PorterStemmer()
english_stop_words = stopwords.words('english')

def remove_stop_words_stemmer(corpus, stop_words=None, stem=None):
    """Drop stop words from every review and stem the words that remain.

    Args:
        corpus: iterable of review strings; each one is split on whitespace.
        stop_words: optional collection of words to discard. Defaults to the
            module-level ``english_stop_words``.
        stem: optional callable mapping a word to its stem. Defaults to
            ``stemmer.stem`` of the module-level stemmer.

    Returns:
        A list with one string per review: the surviving words, stemmed and
        joined by single spaces (an empty string when nothing survives).
    """
    # A set gives O(1) membership checks; the stop-word list was previously
    # scanned linearly for every word of every review.
    blocked = set(english_stop_words if stop_words is None else stop_words)
    stem_word = stemmer.stem if stem is None else stem

    # Filter and stem in a single pass. The earlier join-then-split round
    # trip produced exactly the same word sequence.
    return [
        ' '.join(stem_word(word) for word in review.split() if word not in blocked)
        for review in corpus
    ]

# Final text passed to the vectorizers below: stop words removed and the
# remaining words stemmed, for both the training and the test reviews.
cleaned_train = remove_stop_words_stemmer(reviews_train_clean)
cleaned_test = remove_stop_words_stemmer(reviews_test_clean)

## One-hot representation - Binary word occurrence (Bernoulli)

In [None]:
# binary=True records only whether a term occurs in a review (1/0), and
# max_features=3000 caps the vocabulary at the 3000 most frequent terms.
cv = CountVectorizer(binary=True,max_features=3000)
# fit_transform learns the vocabulary and encodes the training reviews in one
# pass; it yields the same matrix as fit() followed by transform().
X = cv.fit_transform(cleaned_train)
X_test = cv.transform(cleaned_test)

# Train - Validation Split (75% train / 25% validation, reproducible via seed)
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75, random_state=seed
)

# One-hot features: save to csv (from scipy sparse matrix representation).
# NOTE: the files named *_test_* hold the validation split (X_val / y_val),
# not the held-out X_test matrix computed above.
x = pd.DataFrame.sparse.from_spmatrix(X_val)
x.to_csv('X_test_onehot.csv',index=False, header=False)

x = pd.DataFrame.sparse.from_spmatrix(X_train)
x.to_csv('X_train_onehot.csv',index=False, header=False)

# Labels are written as a single headerless column, row-aligned with the
# feature files above.
y_train_df = pd.DataFrame(data={"col1": y_train})
y_train_df.to_csv("./y_train_onehot.csv", sep=',',index=False, header=False)

y_test_df = pd.DataFrame(data={"col1": y_val})
y_test_df.to_csv("./y_test_onehot.csv", sep=',',index=False, header=False)

## Bag of words representation - Word occurrence counts (Multinomial)

In [None]:
# binary=False keeps the per-review term counts instead of 0/1 indicators;
# max_features=3000 caps the vocabulary at the 3000 most frequent terms.
cv = CountVectorizer(binary=False, max_features=3000)
# fit_transform learns the vocabulary and encodes the training reviews in one
# pass; it yields the same matrix as fit() followed by transform().
X = cv.fit_transform(cleaned_train)
X_test = cv.transform(cleaned_test)

# Train - Validation Split (75% train / 25% validation, reproducible via seed)
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75, random_state=seed
)

# Bag of Words: Save to csv (from scipy sparse matrix representation).
# NOTE: the files named *_test_* hold the validation split (X_val / y_val),
# not the held-out X_test matrix computed above.
x = pd.DataFrame.sparse.from_spmatrix(X_val)
x.to_csv('X_test_bow.csv',index=False, header=False)

x = pd.DataFrame.sparse.from_spmatrix(X_train)
x.to_csv('X_train_bow.csv',index=False, header=False)

# Labels are written as a single headerless column, row-aligned with the
# feature files above.
y_train_df = pd.DataFrame(data={"col1": y_train})
y_train_df.to_csv("./y_train_bow.csv", sep=',',index=False, header=False)

y_test_df = pd.DataFrame(data={"col1": y_val})
y_test_df.to_csv("./y_test_bow.csv", sep=',',index=False, header=False)

## Remove below this

In [None]:
# Legacy cell (section is marked for removal): repeats the 75/25
# train/validation split of the current X. Uses the shared `seed` rather than
# a hard-coded 42 so it cannot drift from the splits made in the cells above.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75, random_state=seed
)

## Bag of Words: Save to csv (from scipy sparse matrix representation)

In [None]:
import scipy.sparse

# Legacy cell (section is marked for removal): it rewrites the same
# *_bow.csv files as the bag-of-words cell above. header=False keeps the
# output format identical to that cell, so running the whole notebook does
# not silently replace the headerless files with ones carrying a header row.
# NOTE: the files named *_test_* hold the validation split (X_val / y_val).
x = pd.DataFrame.sparse.from_spmatrix(X_val)
x.to_csv('X_test_bow.csv',index=False, header=False)

x = pd.DataFrame.sparse.from_spmatrix(X_train)
x.to_csv('X_train_bow.csv',index=False, header=False)

y_train_df = pd.DataFrame(data={"col1": y_train})
y_train_df.to_csv("./y_train_bow.csv", sep=',',index=False, header=False)

y_test_df = pd.DataFrame(data={"col1": y_val})
y_test_df.to_csv("./y_test_bow.csv", sep=',',index=False, header=False)