In [1]:
import os
import re
import pandas as pd
import seaborn as sns
import nltk
from nltk.corpus import wordnet,stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import string
from bs4 import BeautifulSoup
from textblob import TextBlob
from unidecode import unidecode
import contractions

In [2]:
# Root folder of the review dataset. retrieve_review_data looks for
# <data_path>/<split>/pos and <data_path>/<split>/neg beneath it. The path is
# relative, so it is resolved against the kernel's working directory.
data_path = 'dataset/aclImdb'

In [3]:
def retrieve_review_data(type, dataset_path):
    """Load one split of the review dataset into a DataFrame.

    Reads the review files under ``<dataset_path>/<type>/pos`` and
    ``<dataset_path>/<type>/neg``. The record id and the rating are the first
    two integers found in each file name, and the sentiment is the name of
    the folder the file was found in.

    Directory entries that are not regular files, or whose name does not
    contain two integers (e.g. ``.ipynb_checkpoints``, ``.DS_Store``), are
    skipped instead of aborting the whole load.

    Args:
        type: Name of the split sub-directory (the notebook passes 'train'
            and 'test'). The name shadows the builtin but is kept so that
            existing callers keep working.
        dataset_path: Root directory that contains the split sub-directories.

    Returns:
        pandas.DataFrame with columns ``record_id``, ``review_content``,
        ``sentiment`` and ``review_rating``. 'pos' rows come first, then
        'neg'; within each label rows are ordered by file name.

    Raises:
        OSError: if one of the label directories cannot be listed or a
            review file cannot be read.
    """
    def parse_file(category_label, file_path):
        # File names carry "<id>_<rating>"; pull both numbers out of the name.
        ids_and_rating = re.findall(r'\d+', os.path.basename(file_path))
        if len(ids_and_rating) < 2:
            return None
        with open(file_path, 'r', encoding="utf-8") as review_file:
            review_content = review_file.read()
        return [int(ids_and_rating[0]), review_content, category_label, int(ids_and_rating[1])]

    reviews = []
    for label in ["pos", "neg"]:
        label_dir = os.path.join(dataset_path, type, label)
        # os.listdir returns entries in arbitrary order; sort so the row
        # order is reproducible across machines and runs.
        for filename in sorted(os.listdir(label_dir)):
            file_path = os.path.join(label_dir, filename)
            if not os.path.isfile(file_path):
                continue
            row = parse_file(label, file_path)
            if row is not None:
                reviews.append(row)

    return pd.DataFrame(reviews, columns=["record_id", "review_content", "sentiment", "review_rating"])


In [4]:
# Read the training split from disk into a DataFrame.
train_data_frame = retrieve_review_data(type='train', dataset_path=data_path)

In [5]:
# Peek at the first rows to check that ids, text, sentiment and rating parsed as expected.
train_data_frame.head()

Unnamed: 0,record_id,review_content,sentiment,review_rating
0,0,Bromwell High is a cartoon comedy. It ran at t...,pos,9
1,10000,Homelessness (or Houselessness as George Carli...,pos,8
2,10001,Brilliant over-acting by Lesley Ann Warren. Be...,pos,10
3,10002,This is easily the most underrated film inn th...,pos,7
4,10003,This is not the typical Mel Brooks film. It wa...,pos,8


In [6]:
# Numeric target: +1 where sentiment is 'pos', -1 for every other value.
train_data_frame['label'] = train_data_frame['sentiment'].eq('pos').map({True: 1, False: -1})

In [7]:
# Stop-word list: NLTK's English list plus a '<*>' placeholder token.
# 'not' is taken out of the list, presumably so negation survives filtering.
stop_words = stopwords.words('english') + ['<*>']
stop_words.remove('not')

# Punctuation stripped early in cleaning; '.' is excluded from this set, so
# periods are still present when the text is split into sentences.
remove_punctuations = string.punctuation.replace('.', '')

# First letter of the tag returned by nltk.pos_tag -> WordNet POS constant.
part_of_speech_dict = dict(
    J=wordnet.ADJ,
    N=wordnet.NOUN,
    V=wordnet.VERB,
    R=wordnet.ADV,
)

In [8]:
def lemmatize_pos_tagged_text(input_text, word_lemmatizer, pos_dict):
    """Lower-case and lemmatize a text, using POS tags to pick the lemma.

    The text is split into sentences; each sentence is lower-cased,
    tokenized and POS-tagged, and every token is lemmatized with the WordNet
    part of speech that ``pos_dict`` maps the first letter of its tag to.
    Tokens whose tag has no mapping are lemmatized with the lemmatizer's
    default part of speech.

    Args:
        input_text: Text to lemmatize.
        word_lemmatizer: Object exposing ``lemmatize(word)`` and
            ``lemmatize(word, pos)`` (the notebook passes a WordNetLemmatizer).
        pos_dict: Mapping from the upper-cased first letter of a POS tag to
            the part-of-speech value understood by ``word_lemmatizer``.

    Returns:
        str: All lemmatized tokens of all sentences joined by single spaces.
    """
    lemmatized_sentences = []

    for sentence in nltk.sent_tokenize(input_text):
        # Tokenize once and reuse the result for tagging and lemmatizing;
        # pos_tag returns the tokens paired with their tags.
        tokens = nltk.word_tokenize(sentence.lower())
        lemmatized_words = []

        for word, nltk_word_pos in nltk.pos_tag(tokens):
            wordnet_word_pos = pos_dict.get(nltk_word_pos[:1].upper())

            if wordnet_word_pos is not None:
                lemmatized_words.append(word_lemmatizer.lemmatize(word, wordnet_word_pos))
            else:
                lemmatized_words.append(word_lemmatizer.lemmatize(word))

        lemmatized_sentences.append(" ".join(lemmatized_words))

    return " ".join(lemmatized_sentences)
    

In [9]:
# data cleaning

def data_preprocessing(text):
    """Clean a Series of raw review strings.

    Every review goes through these steps, in order:

    1. transliterate non-ASCII characters with ``unidecode``
       (characters it cannot map are preserved);
    2. expand contractions token by token;
    3. drop runs of digits;
    4. replace punctuation other than '.' with a space (periods are still
       present when the text is split into sentences for lemmatization);
    5. lower-case and lemmatize with POS tags;
    6. replace all remaining punctuation with a space;
    7. drop stop words and re-join the tokens with single spaces.

    HTML-tag stripping (BeautifulSoup) and typo correction (TextBlob) are not
    applied here.
    NOTE(review): with HTML stripping off, markup such as ``<br />`` only
    loses its punctuation, so tag names like 'br' likely survive as tokens;
    confirm whether that is intended.

    Args:
        text: pandas Series of review strings.

    Returns:
        pandas Series with the same index, holding the cleaned strings.
    """
    word_lemmatizer = WordNetLemmatizer()

    # Build the patterns and the stop-word set once instead of once per row;
    # a set makes the per-token membership test constant time.
    digit_run = re.compile(r'\d+')
    punctuation_except_period = re.compile('[%s]' % re.escape(remove_punctuations))
    any_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    stop_word_set = set(stop_words)

    def clean_review(review):
        # remove diacritics
        review = unidecode(review, errors="preserve")
        # expand contractions
        review = " ".join(contractions.fix(token) for token in review.split())
        # remove numbers
        review = digit_run.sub('', review)
        # remove punctuation except period
        review = punctuation_except_period.sub(' ', review)
        # lemmatize (also lower-cases)
        review = lemmatize_pos_tagged_text(review, word_lemmatizer, part_of_speech_dict)
        # remove all punctuation
        review = any_punctuation.sub(' ', review)
        # remove stopwords; split()/join also collapses any repeated
        # whitespace, so no separate space-squeezing step is needed
        return " ".join(word for word in review.split() if word not in stop_word_set)

    return text.apply(clean_review)
    

In [28]:
# First 10,000 raw reviews.
# NOTE(review): no later cell visible here reads sample_data — confirm it is
# still needed, otherwise this cell can be dropped.
sample_data = train_data_frame['review_content'].head(10000)

In [29]:
# First five and last five labels (10 values in total).
# NOTE(review): this does not line up with sample_data (10,000 rows), and no
# later cell visible here reads sample_labels — confirm whether it is needed.
sample_labels = pd.concat([train_data_frame['label'].head(5), train_data_frame['label'].tail(5)])

In [40]:
# Run the full cleaning routine over every training review.
cleaned_data = train_data_frame['review_content'].pipe(data_preprocessing)

In [41]:
# Show the cleaned training reviews (pandas elides the middle of a long Series).
cleaned_data

0        bromwell high cartoon comedy run time program ...
1        homelessness houselessness george carlin state...
2        brilliant act lesley ann warren best dramatic ...
3        easily underrated film inn brook cannon sure f...
4        not typical mel brook film much less slapstick...
                               ...                        
24995    towards end movie felt technical felt like cla...
24996    kind movie enemy content watch time not bloody...
24997    saw descent last night stockholm film festival...
24998    film pick pound turn rather good rd century fi...
24999    one dumb film ever see rip nearly ever type th...
Name: review_content, Length: 25000, dtype: object

In [42]:
# Store the cleaned text next to the raw review; this adds a column to
# train_data_frame in place.
train_data_frame['clean_text'] = cleaned_data

In [43]:
# Show the training frame now that it carries the label and clean_text columns.
train_data_frame

Unnamed: 0,record_id,review_content,sentiment,review_rating,label,clean_text
0,0,Bromwell High is a cartoon comedy. It ran at t...,pos,9,1,bromwell high cartoon comedy run time program ...
1,10000,Homelessness (or Houselessness as George Carli...,pos,8,1,homelessness houselessness george carlin state...
2,10001,Brilliant over-acting by Lesley Ann Warren. Be...,pos,10,1,brilliant act lesley ann warren best dramatic ...
3,10002,This is easily the most underrated film inn th...,pos,7,1,easily underrated film inn brook cannon sure f...
4,10003,This is not the typical Mel Brooks film. It wa...,pos,8,1,not typical mel brook film much less slapstick...
...,...,...,...,...,...,...
24995,9998,"Towards the end of the movie, I felt it was to...",neg,4,-1,towards end movie felt technical felt like cla...
24996,9999,This is the kind of movie that my enemies cont...,neg,3,-1,kind movie enemy content watch time not bloody...
24997,999,I saw 'Descent' last night at the Stockholm Fi...,neg,3,-1,saw descent last night stockholm film festival...
24998,99,Some films that you pick up for a pound turn o...,neg,1,-1,film pick pound turn rather good rd century fi...


In [45]:
# Persist the cleaned training frame. The output folder is created first so
# the write does not fail on a checkout where it does not exist yet.
# The DataFrame index is written as the first (unnamed) CSV column.
os.makedirs("dataset/clean_data", exist_ok=True)
train_data_frame.to_csv("dataset/clean_data/train_clean.csv")

In [46]:
# Preview the first rows of the frame that the previous cell wrote to disk.
train_data_frame.head()

Unnamed: 0,record_id,review_content,sentiment,review_rating,label,clean_text
0,0,Bromwell High is a cartoon comedy. It ran at t...,pos,9,1,bromwell high cartoon comedy run time program ...
1,10000,Homelessness (or Houselessness as George Carli...,pos,8,1,homelessness houselessness george carlin state...
2,10001,Brilliant over-acting by Lesley Ann Warren. Be...,pos,10,1,brilliant act lesley ann warren best dramatic ...
3,10002,This is easily the most underrated film inn th...,pos,7,1,easily underrated film inn brook cannon sure f...
4,10003,This is not the typical Mel Brooks film. It wa...,pos,8,1,not typical mel brook film much less slapstick...


In [47]:
# Read the test split from disk into a DataFrame.
test_data_frame = retrieve_review_data(type='test', dataset_path=data_path)

In [55]:
# Peek at the first rows of the test split to check the parsed columns.
test_data_frame.head()

Unnamed: 0,record_id,review_content,sentiment,review_rating
0,0,I went and saw this movie last night after bei...,pos,10
1,10000,Actor turned director Bill Paxton follows up h...,pos,7
2,10001,As a recreational golfer with some knowledge o...,pos,9
3,10002,"I saw this film in a sneak preview, and it is ...",pos,8
4,10003,Bill Paxton has taken the true story of the 19...,pos,8


In [56]:
# Apply the same cleaning routine to every test review.
test_cleaned_data = test_data_frame['review_content'].pipe(data_preprocessing)

In [59]:
# Show the cleaned test reviews (pandas elides the middle of a long Series).
test_cleaned_data

0        go saw movie last night coax friend mine admit...
1        actor turn director bill paxton follow promisi...
2        recreational golfer knowledge sport history pl...
3        saw film sneak preview delightful cinematograp...
4        bill paxton take true story u golf open make f...
                               ...                        
24995    occasionally let kid watch garbage understand ...
24996    anymore pretty much reality tv show people mak...
24997    basic genre thriller intercut uncomfortable me...
24998    four thing intrigue film firstly star carly po...
24999    david bryce comment nearby exceptionally well ...
Name: review_content, Length: 25000, dtype: object

In [60]:
# Store the cleaned text next to the raw review; this adds a column to
# test_data_frame in place.
# NOTE(review): unlike train_data_frame, no numeric 'label' column is added to
# the test frame in the cells visible here, so test_clean.csv will lack it —
# confirm that downstream code derives the label from 'sentiment'.
test_data_frame['clean_text'] = test_cleaned_data

In [63]:
# Show the test frame now that it carries the clean_text column.
test_data_frame

Unnamed: 0,record_id,review_content,sentiment,review_rating,clean_text
0,0,I went and saw this movie last night after bei...,pos,10,go saw movie last night coax friend mine admit...
1,10000,Actor turned director Bill Paxton follows up h...,pos,7,actor turn director bill paxton follow promisi...
2,10001,As a recreational golfer with some knowledge o...,pos,9,recreational golfer knowledge sport history pl...
3,10002,"I saw this film in a sneak preview, and it is ...",pos,8,saw film sneak preview delightful cinematograp...
4,10003,Bill Paxton has taken the true story of the 19...,pos,8,bill paxton take true story u golf open make f...
...,...,...,...,...,...
24995,9998,I occasionally let my kids watch this garbage ...,neg,1,occasionally let kid watch garbage understand ...
24996,9999,When all we have anymore is pretty much realit...,neg,1,anymore pretty much reality tv show people mak...
24997,999,The basic genre is a thriller intercut with an...,neg,3,basic genre thriller intercut uncomfortable me...
24998,99,Four things intrigued me as to this film - fir...,neg,3,four thing intrigue film firstly star carly po...


In [64]:
# Persist the cleaned test frame. The output folder is created first so the
# write does not fail on a checkout where it does not exist yet.
# The DataFrame index is written as the first (unnamed) CSV column.
os.makedirs("dataset/clean_data", exist_ok=True)
test_data_frame.to_csv("dataset/clean_data/test_clean.csv")