# Movie Reviews

In [66]:
import numpy as np, pandas as pd
import os, sys
import math
import shutil
import uuid
import zipfile
import string
import random
import json
import json

In [67]:
# Paths and Variables

In [68]:
# Base name used as the prefix of every output file written by this notebook
dataset_name = "movie_reviews"

In [69]:
# Input and output folders, given relative to the current working directory
input_dir = './raw'
output_dir = './processed'

# Output file paths; each one is the dataset name plus a suffix inside `output_dir`
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')
outp_train_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')
outp_test_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')
outp_test_key_fname = os.path.join(output_dir, f'{dataset_name}_test_key.csv')
# Path of the sample JSON inference request (previously assigned twice in a row)
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')

In [70]:
# !!! Make sure the zipped files are uncompressed before running this cell !!!

# Every sub-directory of `input_dir` except 'orig_data' is treated as a class label.
# os.listdir() returns entries in arbitrary order, so the result is sorted to keep
# the class order (and therefore the row order of the data) the same on every machine.
classes = sorted(
    item
    for item in os.listdir(input_dir)
    if item != 'orig_data' and os.path.isdir(os.path.join(input_dir, item))
)
classes

['neg', 'pos']

In [71]:
# Column names of the assembled DataFrame: record id, label, raw review text
id_col, target_col, text_col = "id", "class", "text"

# Read data into a DataFrame

In [72]:
## !!! decompress the "pos" and "neg" zipped folders in 'raw' folder before running this.

# Collect one [file name, class label, file text] row per review file.
all_data = []
for class_ in classes:
    dir_path = os.path.join(input_dir, class_)
    # os.listdir() order is arbitrary: sort so the row order, and with it the seeded
    # shuffle and train/test split further down, is reproducible. Only regular files
    # are kept, since a nested directory cannot be opened as a text file.
    files = sorted(
        f for f in os.listdir(dir_path)
        if os.path.isfile(os.path.join(dir_path, f))
    )
    print(class_, len(files))

    for f in files:
        file_input_path = os.path.join(dir_path, f)

        with open(file_input_path, encoding="utf-8") as inp_f:
            text = inp_f.read()
        # The file name doubles as the record id
        all_data.append([f, class_, text])

print(len(all_data))

data = pd.DataFrame(all_data, columns=[id_col, target_col, text_col])
print(data.head())

neg 1000
pos 1000
2000
                id class                                               text
0  cv000_29416.txt   neg  plot : two teen couples go to a church party ,...
1  cv001_19502.txt   neg  the happy bastard's quick movie review \ndamn ...
2  cv002_17424.txt   neg  it is movies like these that make a jaded movi...
3  cv003_12683.txt   neg   " quest for camelot " is warner bros . ' firs...
4  cv004_12641.txt   neg  synopsis : a mentally unstable man undergoing ...


# Shuffle Data

In [73]:
# Draw every row exactly once in random order; the fixed seed makes the permutation repeatable
data = data.sample(n=len(data), random_state=42)
data.head()

Unnamed: 0,id,class,text
1860,cv860_13853.txt,pos,the verdict : spine-chilling drama from horror...
353,cv353_19197.txt,neg,""" the 44 caliber killer has struck again . "" ..."
1333,cv333_8916.txt,pos,in the company of men made a splash at the sun...
905,cv905_28965.txt,neg,"in the year 2029 , captain leo davidson ( mark..."
1289,cv289_6463.txt,pos,[note that followups are directed to rec . art...


# Insert Id Column

In [74]:
# Add a running integer id as the first column, but only when the frame has no id column yet
has_id_column = id_col in data.columns
if not has_id_column:
    N = len(data.index)
    data.insert(0, id_col, np.arange(N))
    print(data.head())

# Utility to Save DF as a zipped file

In [75]:
def save_df_to_zipped_csv(df, ftype=None, out_dir=None, name=None):
    """Write `df` as a single CSV file inside a zip archive.

    The archive is called ``{name}{suffix}.zip`` and holds one member called
    ``{name}{suffix}.csv``, where ``suffix`` is ``_{ftype}`` when `ftype` is given
    and empty otherwise. The DataFrame index is not written.

    Args:
        df: DataFrame to save.
        ftype: Optional tag appended to the file names (e.g. "train").
        out_dir: Directory to write into. Defaults to the notebook-level
            `output_dir`. It is created if it does not exist yet.
        name: Base file name. Defaults to the notebook-level `dataset_name`.
    """
    # Fall back to the notebook-level settings only when no override is passed
    target_dir = output_dir if out_dir is None else out_dir
    base_name = dataset_name if name is None else name

    suffix = '' if ftype is None else f'_{ftype}'
    zipped_f_name = f'{base_name}{suffix}.zip'
    archive_f_name = f'{base_name}{suffix}.csv'

    # to_csv does not create missing folders, so make sure the target exists first
    os.makedirs(target_dir, exist_ok=True)

    compression_opts = dict(method='zip', archive_name=archive_f_name)
    df.to_csv(os.path.join(target_dir, zipped_f_name), index=False, compression=compression_opts)

# Save Main Data File

In [76]:
# # save original file as csv
# data.to_csv(outp_fname, index=False)

# Write the complete dataset as a zipped CSV (no split tag in the file name)
save_df_to_zipped_csv(data, ftype=None)

In [77]:
# Display (number of rows, number of columns) of the full dataset
data.shape

(2000, 3)

# Train Test Split

In [78]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for testing; the fixed seed keeps the split repeatable
test_size = 0.1
data_train, data_test = train_test_split(
    data,
    test_size=test_size,
    random_state=42,
)

# The key keeps id + label of the test rows; the test set itself is stripped of the label
data_test_key = data_test.loc[:, [id_col, target_col]].copy()
data_test = data_test.drop(target_col, axis=1)
print(data_train.shape, data_test.shape, data_test_key.shape)

# # Save original files as csv
# data_train.to_csv(outp_train_fname, index=False)
# data_test.to_csv(outp_test_fname, index=False)
# data_test_key.to_csv(outp_test_key_fname, index=False)


(1800, 3) (200, 2) (200, 2)


In [79]:
# Write each split to its own zipped CSV, tagged with the split name
for split_df, split_tag in (
    (data_train, "train"),
    (data_test, "test"),
    (data_test_key, "test_key"),
):
    save_df_to_zipped_csv(split_df, split_tag)

# JSON inference request instance

In [80]:
# Turn the first row of the test set into a sample inference-request payload
instance = data_test.iloc[0].to_dict()
infer_req_instance_dict = {"instances": [dict(instance)]}
print(infer_req_instance_dict)

# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes
with open(outp_infer_instances, mode='w', encoding='utf8') as json_f:
    json.dump(infer_req_instance_dict, json_f, indent=2, ensure_ascii=False)

{'instances': [{'id': 'cv522_5583.txt', 'text': 'of circumcision , psychic wounds and the family sitcom \nthe opening segment is something of a foretaste of this film . \nthere\'s a guy\'s voice telling us how he tries to imagine what his biological parents look like , and on the screen we see images of a variety of oldish men and women . \nas his imagination plays o n , he ( and we ) picture these motley characters in a mix-and-match shuffle of unlikely marriages , businesswomen with bums , matrons with paint salesmen , the images coming on in a faster and faster frenzy . \nit\'s not just a witty , funny summing up of the film \'s themes - you might not guess it yet , but it sets the pattern for the way the story goes on . \nwhat starts out as a step in a fairly sensible direction , gets taken on a road trip , put through a detour or two and finds itself freewheeling towards an immin ent crash . \nthat may not sound so different from the average family sitcom , and the general idea of