# Movie Reviews

In [1]:
import math
import os
import random
import shutil
import string
import sys
import uuid
import zipfile

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Paths and Variables

In [3]:
# Base name shared by every output artifact this notebook writes
# (it is interpolated into the CSV and zip file names below).
dataset_name = "movie_reviews"

In [4]:
# Where the raw per-class review folders live, and where outputs are written.
input_dir = './raw'
output_dir = './processed'

# Target paths for plain-CSV outputs: the full dataset, the train split,
# the test split (without labels) and the test answer key.
outp_fname, outp_train_fname, outp_test_fname, outp_test_key_fname = [
    os.path.join(output_dir, csv_name)
    for csv_name in (
        f'{dataset_name}.csv',
        f'{dataset_name}_train.csv',
        f'{dataset_name}_test.csv',
        f'{dataset_name}_test_key.csv',
    )
]

In [5]:
# Every sub-directory of `input_dir` is a class label, except the
# 'orig_data' folder, which is skipped.
# os.listdir returns entries in arbitrary order, so the labels are sorted to
# keep the row order (and the seeded shuffle/split that follow) reproducible.
classes = sorted(
    entry
    for entry in os.listdir(input_dir)
    if entry != 'orig_data' and os.path.isdir(os.path.join(input_dir, entry))
)
classes

['neg', 'pos']

In [6]:
# Column names used when the review files are assembled into a DataFrame.
# Identifier column (filled with the review's file name when the data is read).
id_col = "id"
# Label column (filled with the name of the class folder the file came from).
target_col = "class"
# Column holding the full text of the review.
text_col = "text"

# Read data into a DataFrame

In [7]:
## !!! decompress the "pos" and "neg" zipped folders in 'raw' folder before running this.

# Read every review file into one [file name, class, text] record.
all_data = []
for class_ in classes:
    dir_path = os.path.join(input_dir, class_)
    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # (and therefore the seeded shuffle and split later on) reproducible.
    # Only regular files are kept so a stray sub-directory is never opened as text.
    files = sorted(
        name
        for name in os.listdir(dir_path)
        if os.path.isfile(os.path.join(dir_path, name))
    )
    print(class_, len(files))

    for f in files:
        file_input_path = os.path.join(dir_path, f)

        with open(file_input_path, encoding="utf-8") as inp_f:
            text = inp_f.read()
        all_data.append([f, class_, text])

print(len(all_data))

# Fail early with a clear message instead of silently producing an empty frame.
if not all_data:
    raise FileNotFoundError(
        f"No review files found under {input_dir!r}; "
        "decompress the class archives into that folder first."
    )

data = pd.DataFrame(all_data, columns=[id_col, target_col, text_col])
print(data.head())

neg 1000
pos 1000
2000
                id class                                               text
0  cv000_29416.txt   neg  plot : two teen couples go to a church party ,...
1  cv001_19502.txt   neg  the happy bastard's quick movie review \ndamn ...
2  cv002_17424.txt   neg  it is movies like these that make a jaded movi...
3  cv003_12683.txt   neg   " quest for camelot " is warner bros . ' firs...
4  cv004_12641.txt   neg  synopsis : a mentally unstable man undergoing ...


# Shuffle Data

In [8]:
# shuffle data
data = data.sample(frac=1, random_state=42)
data.head()

Unnamed: 0,id,class,text
1860,cv860_13853.txt,pos,the verdict : spine-chilling drama from horror...
353,cv353_19197.txt,neg,""" the 44 caliber killer has struck again . "" ..."
1333,cv333_8916.txt,pos,in the company of men made a splash at the sun...
905,cv905_28965.txt,neg,"in the year 2029 , captain leo davidson ( mark..."
1289,cv289_6463.txt,pos,[note that followups are directed to rec . art...


# Insert Id Column

In [9]:
# Add a sequential integer id as the first column, but only when the frame
# does not already have an id column.
# NOTE(review): the frame assembled earlier in this notebook is created with
# `id_col` among its columns, so this branch looks inactive here - confirm.
if id_col not in data.columns:
    row_ids = np.arange(len(data))
    data.insert(loc=0, column=id_col, value=row_ids)
    print(data.head())

# Utility to Save DF as a zipped file

In [11]:
def save_df_to_zipped_csv(df, ftype=None, out_dir=None, name=None):
    """Write `df` as a single CSV inside a zip archive.

    The archive is named ``<name>[_<ftype>].zip`` and contains one member,
    ``<name>[_<ftype>].csv``. The DataFrame index is not written.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save.
    ftype : str, optional
        Suffix identifying the file (e.g. "train"); appended as ``_<ftype>``.
        When None, no suffix is added.
    out_dir : str, optional
        Directory to write into. Defaults to the notebook-level `output_dir`.
    name : str, optional
        Base file name. Defaults to the notebook-level `dataset_name`.

    Returns
    -------
    None
    """
    target_dir = output_dir if out_dir is None else out_dir
    base_name = dataset_name if name is None else name
    suffix = f'_{ftype}' if ftype is not None else ''

    # Create the destination on demand; to_csv does not create missing folders.
    os.makedirs(target_dir, exist_ok=True)

    zipped_f_name = f'{base_name}{suffix}.zip'
    archive_f_name = f'{base_name}{suffix}.csv'
    # archive_name sets the name of the CSV member stored inside the zip.
    compression_opts = dict(method='zip', archive_name=archive_f_name)
    df.to_csv(os.path.join(target_dir, zipped_f_name), index=False, compression=compression_opts)

# Save Main Data File

In [12]:
# Persist the complete shuffled dataset as a zip archive (no suffix).
# An uncompressed copy could instead be written with
# `data.to_csv(outp_fname, index=False)`.
save_df_to_zipped_csv(df=data)

# Train Test Split

In [13]:
# Fraction of rows held out for the test set.
test_size = 0.1

# Seeded split so the same rows land in train/test on every run.
data_train, data_test = train_test_split(data, test_size=test_size, random_state=42)

# The answer key keeps only the id and the label of each test row;
# the label is then removed from the test set itself.
data_test_key = data_test[[id_col, target_col]].copy()
data_test = data_test.drop(columns=[target_col])

# Sanity checks: no rows lost, labels hidden from the test set, key aligned with it.
assert len(data_train) + len(data_test) == len(data)
assert target_col not in data_test.columns
assert data_test_key[id_col].tolist() == data_test[id_col].tolist()

print(data_train.shape, data_test.shape, data_test_key.shape)

# To also write uncompressed CSVs, use e.g.
# `data_train.to_csv(outp_train_fname, index=False)` (likewise for
# `data_test` / `outp_test_fname` and `data_test_key` / `outp_test_key_fname`).


(1800, 3) (200, 2) (200, 2)


In [14]:
# Write each split to its own zip archive; the label becomes the file suffix.
for split_name, split_df in (
    ("train", data_train),
    ("test", data_test),
    ("test_key", data_test_key),
):
    save_df_to_zipped_csv(split_df, split_name)