In [1]:
# Import libraries
import os
from pathlib import Path
from collections import Counter
import re
import random

import pandas as pd
import numpy as np

from helper import preprocessing as pre

In [2]:
# Import data and process into usable format
def load_reviews(directory, label):
    """Read every ``*.txt`` review in ``directory`` into a list of rows.

    File names are expected to look like ``<review_id>_<review_score>.txt``.

    Parameters
    ----------
    directory : str or Path
        Folder that holds the review text files.
    label : int
        Binary sentiment label stored alongside every review from this folder.

    Returns
    -------
    list of list
        One ``[review_id, review_score, review_text, label]`` row per file;
        ``review_id`` is kept as a string and ``review_score`` is an int.

    Raises
    ------
    ValueError
        If the second name component of a file is not an integer.
    """
    rows = []
    # NOTE: rows are appended in whatever order Path.glob yields the files.
    for p in Path(directory).glob('*.txt'):
        # Raw string so `\.` is a regex escape rather than an invalid Python string escape.
        file_name = re.split(r'_|\.', p.name)
        review_id = file_name[0]
        review_score = int(file_name[1])
        # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
        rows.append([review_id, review_score, p.read_text(encoding='utf-8'), label])
    return rows


# Binary label: 0 = negative, 1 = positive
data_list = (load_reviews('./data/aclImdb/train/neg', 0)
             + load_reviews('./data/aclImdb/train/pos', 1))

df = pd.DataFrame(data_list, columns = ['review_id', 'review_score', 'review_text', 'label'])

In [3]:
# Quick sanity check: report the frame's dimensions, then preview the first rows
print(f"Shape: {df.shape}")
df.head()

Shape: (25000, 4)


Unnamed: 0,review_id,review_score,review_text,label
0,1821,4,Working with one of the best Shakespeare sourc...,0
1,10402,1,"Well...tremors I, the original started off in ...",0
2,1062,4,Ouch! This one was a bit painful to sit throug...,0
3,9056,1,"I've seen some crappy movies in my life, but t...",0
4,5392,3,"""Carriers"" follows the exploits of two guys an...",0


In [4]:
# Define paths
CWD = os.getcwd()
WORDLISTS = 'wordlist'


def load_wordlist(file_name):
    """Load one word list from the ``WORDLISTS`` folder under ``CWD``.

    The file content is split on whitespace and every underscore inside an
    entry is replaced by a space.

    Parameters
    ----------
    file_name : str
        Name of the word-list file inside the ``WORDLISTS`` folder.

    Returns
    -------
    list of str
        The entries in file order (duplicates are kept).
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(os.path.join(CWD, WORDLISTS, file_name), encoding='utf-8') as file:
        return [w.replace('_', ' ') for w in file.read().split()]


f_vocab = load_wordlist('female_word_file.txt')
m_vocab = load_wordlist('male_word_file.txt')

In [5]:
# Union of both word lists; building a set drops repeated entries
tot_vocab = set(f_vocab).union(m_vocab)

In [6]:
# Note that there are duplicate words for both female and male word lists
# (per the original author's note); tot_vocab is a set, so each distinct term is counted once here.
len(tot_vocab)

409

In [7]:
# Neutralize the reviews: words from the gendered vocabulary become [UNK]
neutral_reviews = df['review_text'].apply(
    lambda review: pre.preprocess_unk(review, vocab=tot_vocab)
)

In [8]:
# Display the untouched review text for comparison with the processed versions
df.loc[:, 'review_text']

0        Working with one of the best Shakespeare sourc...
1        Well...tremors I, the original started off in ...
2        Ouch! This one was a bit painful to sit throug...
3        I've seen some crappy movies in my life, but t...
4        "Carriers" follows the exploits of two guys an...
                               ...                        
24995    About a year ago I finally gave up on American...
24996    When I saw the elaborate DVD box for this and ...
24997    Last November, I had a chance to see this film...
24998    Great movie -I loved it. Great editing and use...
24999    Enchanted April is a tone poem, an impressioni...
Name: review_text, Length: 25000, dtype: object

In [9]:
# Display the neutralized reviews for comparison with the original text
neutral_reviews

0        working with one of the best shakespeare sourc...
1        well...tremors i, the original started off in ...
2        ouch! this one was a bit painful to sit throug...
3        i've seen some crappy movies in my life, but t...
4        "carriers" follows the exploits of two [UNK] a...
                               ...                        
24995    about a year ago i finally gave up on american...
24996    when i saw the elaborate dvd box for this and ...
24997    last november, i had a chance to see this film...
24998    great movie -i loved it. great editing and use...
24999    enchanted april is a tone poem, an impressioni...
Name: review_text, Length: 25000, dtype: object

In [10]:
# Keep only the reviews in which at least one word was replaced by the [UNK] token.
# str.contains interprets its pattern as a regex by default, where '[UNK]' is a character
# class matching any single 'U', 'N' or 'K'; regex=False searches for the literal token.
neutral_reviews[neutral_reviews.str.contains('[UNK]', regex=False)]

1        well...tremors i, the original started off in ...
2        ouch! this one was a bit painful to sit throug...
3        i've seen some crappy movies in my life, but t...
4        "carriers" follows the exploits of two [UNK] a...
5        i had been looking forward to seeing this film...
                               ...                        
24994    eglimata (= crimes) is a story about little cr...
24995    about a year ago i finally gave up on american...
24996    when i saw the elaborate dvd box for this and ...
24997    last november, i had a chance to see this film...
24998    great movie -i loved it. great editing and use...
Name: review_text, Length: 20869, dtype: object

In [11]:
# Create mappings between male and female word counterparts
# NOTE(review): judging by how they are used further down, m_map is paired with the male-word
# regex to produce female text and f_map with the female-word regex to produce male text —
# confirm against helper.preprocessing.gender_mapping.
f_map, m_map = pre.gender_mapping(f_vocab, m_vocab)

In [12]:
# Build one compiled alternation pattern per gendered vocabulary
def _vocab_pattern(vocab):
    """Compile a regex that matches any entry of ``vocab`` between word boundaries."""
    # Every entry is regex-escaped and wrapped in \b ... \b; alternatives keep list order.
    return re.compile('|'.join(r'\b' + re.escape(term) + r'\b' for term in vocab))


m_regex = _vocab_pattern(m_vocab)
f_regex = _vocab_pattern(f_vocab)

In [13]:
# Rewrite every review with female-gendered words (matches of m_regex are swapped via m_map)
f_reviews = df['review_text'].apply(
    lambda review: pre.preprocess_gendered_swap(review, vocab_map=m_map, regex=m_regex)
)

In [14]:
# Original text of the review at index 11, for comparison with its female-swapped version
df.loc[11, 'review_text']

'A young scientist is trying to carry on his dead father\'s work on limb regeneration.His overbearing mother has convinced him that he murdered his own father and is monitoring his progress for her own evil purposes.A young doctor uses reptilian DNA he extracts from a large creature and when his arm is conveniently ripped off a few minutes later,he injects himself with his formula and grows a new murderous arm...Admittedly the special effects in "Severed Ties" are pretty good and grotesque,but the rest of the film is awful.The severed arm is behaving like a snake and kills few people.Big deal.The acting is mediocre and the climax is silly.3 out of 10.'

In [15]:
# Female-swapped version of the review at index 11
f_reviews.loc[11]

'a young scientist is trying to carry on hers dead mother\'s work on limb regeneration.hers overbearing mother has convinced her that she murdered her own mother and is monitoring hers progress for her own evil purposes.a young doctor uses reptilian dna she extracts from a large creature and when hers arm is conveniently ripped off a few minutes later,she injects herself with hers formula and grows a new murderous arm...admittedly the special effects in "severed ties" are pretty good and grotesque,but the rest of the film is awful.the severed arm is behaving like a snake and kills few people.big deal.the acting is mediocre and the climax is silly.3 out of 10.'

In [16]:
# Rewrite every review with male-gendered words (matches of f_regex are swapped via f_map)
m_reviews = df['review_text'].apply(
    lambda review: pre.preprocess_gendered_swap(review, vocab_map=f_map, regex=f_regex)
)

In [17]:
# Original text of the review at index 11, for comparison with its male-swapped version
df.loc[11, 'review_text']

'A young scientist is trying to carry on his dead father\'s work on limb regeneration.His overbearing mother has convinced him that he murdered his own father and is monitoring his progress for her own evil purposes.A young doctor uses reptilian DNA he extracts from a large creature and when his arm is conveniently ripped off a few minutes later,he injects himself with his formula and grows a new murderous arm...Admittedly the special effects in "Severed Ties" are pretty good and grotesque,but the rest of the film is awful.The severed arm is behaving like a snake and kills few people.Big deal.The acting is mediocre and the climax is silly.3 out of 10.'

In [18]:
# Male-swapped version of the review at index 11
m_reviews.loc[11]

'a young scientist is trying to carry on his dead father\'s work on limb regeneration.his overbearing father has convinced him that he murdered his own father and is monitoring his progress for his own evil purposes.a young doctor uses reptilian dna he extracts from a large creature and when his arm is conveniently ripped off a few minutes later,he injects himself with his formula and grows a new murderous arm...admittedly the special effects in "severed ties" are pretty good and grotesque,but the rest of the film is awful.the severed arm is behaving like a snake and kills few people.big deal.the acting is mediocre and the climax is silly.3 out of 10.'

In [19]:
# Attach the three rewritten versions of each review as new columns of df
for column_name, rewritten in (
    ('neutral_review_text', neutral_reviews),
    ('female_review_text', f_reviews),
    ('male_review_text', m_reviews),
):
    df[column_name] = rewritten

In [20]:
# Column order for the export: same as df, but with 'label' moved to the end
cols = df.columns.to_list()
label_index = cols.index('label')
cols.append(cols.pop(label_index))

In [21]:
# New frame with the columns arranged as listed in cols
new_df = df.loc[:, cols]

In [22]:
# Print the review at position 11 in its original, neutral, female and male forms
for text_column in ('review_text', 'neutral_review_text',
                    'female_review_text', 'male_review_text'):
    print(new_df.iloc[11][text_column], '\n')

A young scientist is trying to carry on his dead father's work on limb regeneration.His overbearing mother has convinced him that he murdered his own father and is monitoring his progress for her own evil purposes.A young doctor uses reptilian DNA he extracts from a large creature and when his arm is conveniently ripped off a few minutes later,he injects himself with his formula and grows a new murderous arm...Admittedly the special effects in "Severed Ties" are pretty good and grotesque,but the rest of the film is awful.The severed arm is behaving like a snake and kills few people.Big deal.The acting is mediocre and the climax is silly.3 out of 10. 

a young scientist is trying to carry on [UNK] dead [UNK]'s work on limb regeneration.[UNK] overbearing [UNK] has convinced [UNK] that [UNK] murdered [UNK] own [UNK] and is monitoring [UNK] progress for [UNK] own evil purposes.a young doctor uses reptilian dna [UNK] extracts from a large creature and when [UNK] arm is conveniently ripped o

In [23]:
# Persist the processed training set; the row index is written too (to_csv default)
processed_csv = Path('./data') / 'processed_train.csv'
new_df.to_csv(processed_csv)