# Movie Reviews

In [1]:
import os
import numpy as np
import pandas as pd


# Paths and Variables

In [3]:
dataset_name = "movie_reviews"

In [4]:
input_dir = './data'
output_dir = f'./../../processed/{dataset_name}/'
os.makedirs(output_dir, exist_ok=True)
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')

In [11]:
classes = [item.removesuffix('.zip') for item in os.listdir(input_dir) if item.endswith('.zip')]
classes = [c for c in classes if c != 'orig_data']
classes

['pos', 'neg']

In [12]:
id_col = "id"
target_col = "class"
text_col = "text"

# Read data into a DataFrame

In [16]:
## !!! decompress the "pos" and "neg" zipped folders in 'raw' folder before running this.

all_data = []
for class_ in classes:
    dir_path = os.path.join(input_dir, class_)
    files = os.listdir(dir_path)
    print(class_, len(files))
    
    for f in files: 
        file_input_path = os.path.join(input_dir, class_, f)
        
        with open(file_input_path, encoding="utf-8") as inp_f:
            text = inp_f.read()
        all_data.append([f, class_, text])

print(len(all_data))

data = pd.DataFrame(all_data, columns=[id_col, target_col, text_col])
print(data.head())

pos 1000
neg 1000
2000
                id class                                               text
0  cv839_21467.txt   pos  assume nothing . \nthe phrase is perhaps one o...
1  cv034_29647.txt   pos  plot : derek zoolander is a male model . \nhe ...
2  cv908_16009.txt   pos  i actually am a fan of the original 1961 or so...
3  cv748_12786.txt   pos  a movie that's been as highly built up as the ...
4  cv253_10077.txt   pos   " good will hunting " is two movies in one : ...


# Shuffle Data

In [17]:
# shuffle data
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,id,class,text
1860,cv528_11669.txt,neg,here's a rarity : a children's film that attem...
353,cv692_15451.txt,pos,making a sequel to a widely beloved film is a ...
1333,cv964_5794.txt,neg,ever feel you're spending your whole life on t...
905,cv805_19601.txt,pos,after sixteen years francis ford copolla has a...
1289,cv656_25395.txt,neg,it seemed like the perfect concept . \nwhat be...


# Insert Id Column

In [18]:
# insert Id column 
if id_col not in data.columns:
    N = data.shape[0]
    data.insert(0, id_col, np.arange(N))
    print(data.head())

# Utility to Save DF as a zipped file

In [19]:
def save_df_to_zipped_csv(df, ftype=None): 
    if ftype is not None: 
        suffix = f'_{ftype}'
    else: 
        suffix = ''
        
    zipped_f_name = f'{dataset_name}{suffix}.zip'
    archive_f_name = f'{dataset_name}{suffix}.csv'   
    compression_opts = dict(method='zip',
                        archive_name=archive_f_name)      
    df.to_csv(os.path.join(output_dir, zipped_f_name), index=False, compression=compression_opts )

# Save Main Data File

In [20]:
# # save original file as csv
# data.to_csv(outp_fname, index=False)

# save as zipped file 
save_df_to_zipped_csv(data)

In [21]:
data.shape

(2000, 3)

# Train Test Split

In [22]:
from sklearn.model_selection import train_test_split
test_size = 0.1

data_train, data_test = train_test_split(data, test_size=test_size, random_state=42)


data_test_key = data_test[[id_col, target_col]].copy()
data_test = data_test.drop(columns=[target_col])
print(data_train.shape, data_test.shape, data_test_key.shape)

# # Save original files as csv
# data_train.to_csv(outp_train_fname, index=False)
# data_test.to_csv(outp_test_fname, index=False)
# data_test_key.to_csv(outp_test_key_fname, index=False)


(1800, 3) (200, 2) (200, 2)


In [23]:
# zip files
save_df_to_zipped_csv(data_train, "train")
save_df_to_zipped_csv(data_test, "test")
save_df_to_zipped_csv(data_test_key, "test_key")