In [1]:
import pandas as pd
import json
from glob import glob
from tqdm import tqdm 
import contractions
from langdetect import detect
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
import nltk
import string

In [2]:
path_to_pos_reviews = "../data/raw/imdb/train/pos"
path_to_neg_reviews = "../data/raw/imdb/train/neg"
pos_reviews = glob(path_to_pos_reviews+"/*txt")
neg_reviews = glob(path_to_neg_reviews+"/*txt")

std_reviews_data_path = "../data/processed/imdb_reviews_standardised.csv"
pre_processed_data_path = "../data/processed/imdb_reviews_pre_processed.csv"

In [3]:
all_entries = []
all_files = pos_reviews + neg_reviews
for file in tqdm(all_files):
    f_name = file.split("/")[-1]
    f_name = f_name.replace(".txt", "")
    _id,u_ratings = map(int, f_name.split("_"))
    with open(file) as review:
        u_review = review.read()
    entry = {}
    entry["id"] = _id
    entry["ratings"] = 0 if u_ratings < 5 else 1 # postive: 1, negative: 0
    entry["review"] = u_review
    all_entries.append(entry)
    pass

review_df = pd.DataFrame(all_entries)
review_df.to_csv(std_reviews_data_path, index=False)
review_df.count()

100%|██████████| 25000/25000 [00:08<00:00, 2883.32it/s]


id         25000
ratings    25000
review     25000
dtype: int64

In [4]:
review_df = pd.read_csv(std_reviews_data_path)
review_df.head()

Unnamed: 0,id,ratings,review
0,4715,1,For a movie that gets no respect there sure ar...
1,12390,1,Bizarre horror movie filled with famous faces ...
2,8329,1,"A solid, if unremarkable film. Matthau, as Ein..."
3,9063,1,It's a strange feeling to sit alone in a theat...
4,3092,1,"You probably all already know this by now, but..."


## Data Preprocessing: Cleaning the reviews

 - Expanding Contractions
 - Language Detection
 - Tokenization
 - Converting all Characters to Lowercase
 - Removing Punctuations
 - Removing Stopwords
 - Parts of Speech Tagging
 - Lemmatization

### Contrations expansion

In [5]:
review_df["c_review"] = review_df["review"].apply(lambda x: [
    contractions.fix(word) for word in x.split()])
review_df["exp_review"] = review_df["c_review"].apply(lambda x: ' '.join(x))
review_df = review_df.drop(columns=["c_review"])
review_df.head()

Unnamed: 0,id,ratings,review,exp_review
0,4715,1,For a movie that gets no respect there sure ar...,For a movie that gets no respect there sure ar...
1,12390,1,Bizarre horror movie filled with famous faces ...,Bizarre horror movie filled with famous faces ...
2,8329,1,"A solid, if unremarkable film. Matthau, as Ein...","A solid, if unremarkable film. Matthau, as Ein..."
3,9063,1,It's a strange feeling to sit alone in a theat...,it is a strange feeling to sit alone in a thea...
4,3092,1,"You probably all already know this by now, but...","You probably all already know this by now, but..."


### Language Detection

In [6]:
langs = []
for sent in tqdm(review_df['exp_review']):
    lang = detect(sent)
    langs.append(lang)
    pass

review_df['langs'] = langs
# Retain english reviews only 
review_df = review_df[review_df['langs'] == "en"]
review_df.count()

100%|██████████| 25000/25000 [02:36<00:00, 160.12it/s]


id            25000
ratings       25000
review        25000
exp_review    25000
langs         25000
dtype: int64

### Tokenisation

In [7]:
review_df['tokenized'] = review_df['exp_review'].apply(word_tokenize)
review_df.head()

Unnamed: 0,id,ratings,review,exp_review,langs,tokenized
0,4715,1,For a movie that gets no respect there sure ar...,For a movie that gets no respect there sure ar...,en,"[For, a, movie, that, gets, no, respect, there..."
1,12390,1,Bizarre horror movie filled with famous faces ...,Bizarre horror movie filled with famous faces ...,en,"[Bizarre, horror, movie, filled, with, famous,..."
2,8329,1,"A solid, if unremarkable film. Matthau, as Ein...","A solid, if unremarkable film. Matthau, as Ein...",en,"[A, solid, ,, if, unremarkable, film, ., Matth..."
3,9063,1,It's a strange feeling to sit alone in a theat...,it is a strange feeling to sit alone in a thea...,en,"[it, is, a, strange, feeling, to, sit, alone, ..."
4,3092,1,"You probably all already know this by now, but...","You probably all already know this by now, but...",en,"[You, probably, all, already, know, this, by, ..."


### Converting all Characters to Lowercase

In [8]:
review_df['lowercase'] = review_df['tokenized'].apply(lambda x: [word.lower() for word in x])
review_df.head()

Unnamed: 0,id,ratings,review,exp_review,langs,tokenized,lowercase
0,4715,1,For a movie that gets no respect there sure ar...,For a movie that gets no respect there sure ar...,en,"[For, a, movie, that, gets, no, respect, there...","[for, a, movie, that, gets, no, respect, there..."
1,12390,1,Bizarre horror movie filled with famous faces ...,Bizarre horror movie filled with famous faces ...,en,"[Bizarre, horror, movie, filled, with, famous,...","[bizarre, horror, movie, filled, with, famous,..."
2,8329,1,"A solid, if unremarkable film. Matthau, as Ein...","A solid, if unremarkable film. Matthau, as Ein...",en,"[A, solid, ,, if, unremarkable, film, ., Matth...","[a, solid, ,, if, unremarkable, film, ., matth..."
3,9063,1,It's a strange feeling to sit alone in a theat...,it is a strange feeling to sit alone in a thea...,en,"[it, is, a, strange, feeling, to, sit, alone, ...","[it, is, a, strange, feeling, to, sit, alone, ..."
4,3092,1,"You probably all already know this by now, but...","You probably all already know this by now, but...",en,"[You, probably, all, already, know, this, by, ...","[you, probably, all, already, know, this, by, ..."


### Removing Punctuations

In [9]:
punc = string.punctuation
review_df['no_punc'] = review_df['lowercase'].apply(lambda x: [word for word in x if word not in punc])
review_df.head()

Unnamed: 0,id,ratings,review,exp_review,langs,tokenized,lowercase,no_punc
0,4715,1,For a movie that gets no respect there sure ar...,For a movie that gets no respect there sure ar...,en,"[For, a, movie, that, gets, no, respect, there...","[for, a, movie, that, gets, no, respect, there...","[for, a, movie, that, gets, no, respect, there..."
1,12390,1,Bizarre horror movie filled with famous faces ...,Bizarre horror movie filled with famous faces ...,en,"[Bizarre, horror, movie, filled, with, famous,...","[bizarre, horror, movie, filled, with, famous,...","[bizarre, horror, movie, filled, with, famous,..."
2,8329,1,"A solid, if unremarkable film. Matthau, as Ein...","A solid, if unremarkable film. Matthau, as Ein...",en,"[A, solid, ,, if, unremarkable, film, ., Matth...","[a, solid, ,, if, unremarkable, film, ., matth...","[a, solid, if, unremarkable, film, matthau, as..."
3,9063,1,It's a strange feeling to sit alone in a theat...,it is a strange feeling to sit alone in a thea...,en,"[it, is, a, strange, feeling, to, sit, alone, ...","[it, is, a, strange, feeling, to, sit, alone, ...","[it, is, a, strange, feeling, to, sit, alone, ..."
4,3092,1,"You probably all already know this by now, but...","You probably all already know this by now, but...",en,"[You, probably, all, already, know, this, by, ...","[you, probably, all, already, know, this, by, ...","[you, probably, all, already, know, this, by, ..."


### Removing StopWords

In [10]:
stopwords = set(stopwords.words('english'))
review_df['no_stopwords'] = review_df['no_punc'].apply(lambda x: [word for word in x if word not in stopwords])
review_df.head()

Unnamed: 0,id,ratings,review,exp_review,langs,tokenized,lowercase,no_punc,no_stopwords
0,4715,1,For a movie that gets no respect there sure ar...,For a movie that gets no respect there sure ar...,en,"[For, a, movie, that, gets, no, respect, there...","[for, a, movie, that, gets, no, respect, there...","[for, a, movie, that, gets, no, respect, there...","[movie, gets, respect, sure, lot, memorable, q..."
1,12390,1,Bizarre horror movie filled with famous faces ...,Bizarre horror movie filled with famous faces ...,en,"[Bizarre, horror, movie, filled, with, famous,...","[bizarre, horror, movie, filled, with, famous,...","[bizarre, horror, movie, filled, with, famous,...","[bizarre, horror, movie, filled, famous, faces..."
2,8329,1,"A solid, if unremarkable film. Matthau, as Ein...","A solid, if unremarkable film. Matthau, as Ein...",en,"[A, solid, ,, if, unremarkable, film, ., Matth...","[a, solid, ,, if, unremarkable, film, ., matth...","[a, solid, if, unremarkable, film, matthau, as...","[solid, unremarkable, film, matthau, einstein,..."
3,9063,1,It's a strange feeling to sit alone in a theat...,it is a strange feeling to sit alone in a thea...,en,"[it, is, a, strange, feeling, to, sit, alone, ...","[it, is, a, strange, feeling, to, sit, alone, ...","[it, is, a, strange, feeling, to, sit, alone, ...","[strange, feeling, sit, alone, theater, occupi..."
4,3092,1,"You probably all already know this by now, but...","You probably all already know this by now, but...",en,"[You, probably, all, already, know, this, by, ...","[you, probably, all, already, know, this, by, ...","[you, probably, all, already, know, this, by, ...","[probably, already, know, 5, additional, episo..."


### Parts of Speech Tagging

In [None]:
review_df["pos_tagging"] = review_df["no_stopwords"].apply(nltk.pos_tag)
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
review_df['wordnet_pos'] = review_df['pos_tagging'].apply(lambda x: [
    (word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
review_df.head()

### Lemmatization

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
wnl = WordNetLemmatizer()
review_df["lemmatized"] = review_df["wordnet_pos"].apply(lambda x: [wnl.lemmatize(word, tag) for word, tag in x])
review_df.head()

In [None]:
review_df.to_csv(pre_processed_data_path, index=False)