## Preprocess data to fit model format

This notebook contains the code to preprocess the data to fit the model format. The data is preprocessed in the following steps:
1. Load the scripts and clean them
2. Load the severity ratings of for each imdb-id
3. Load the the scripts as 2 dimension tensor lists, containing the embedded sentences of a scritp
4. Merge all the data into a dataframe of the format fitting for the RNN mode (id | Aspect | None | Mild | Moderate | Severe | Total_votes | Aspect_rating | text)
5. Group the dataframes by aspect ('nudity', 'violence', 'profanity', 'frightening', 'alcohol')
6. Split the dataframes into train, validation and test sets
7. Save the datafranes as pickle files


In [1]:
import pandas as pd 
import numpy as np

In [11]:
def get_script_from_id(id):
    script = open('../data/script/' + id + '.script', 'r').read()
    # print(script)
    script = script.replace("'", " ").replace('"', ' ').replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').replace('\b', ' ').replace('\\', ' ')
    return script

In [15]:
inputFile = open('../data_gathering/baseline/output/imdb_id_with_age_rating_and_labels.txt')
df_data = []

with open("../data_gathering/baseline/output/imdb_id_with_embSentencesList50.pkl", "rb") as f:
    object = pkl.load(f)
    df_embs = pd.DataFrame(object)
    df_embs.rename(columns = {'imdb_id':'id'}, inplace=True)

for line in inputFile:
    line_data = line.strip().split(',')
    # print(line_data)
    line_data.append(int(line_data[3]) + int(line_data[4]) + int(line_data[5]) + int(line_data[6]))
    
    max_index = 0
    max_value = 0
    for i in range(3,7):
        vote_count = int(line_data[i])
        if(vote_count >= max_value):
            max_index = i - 3
            max_value = vote_count
    line_data.append(max_index)
    try:
        script = get_script_from_id(line_data[0])
    except:
        #print('Error on loading script for id: ' + line_data[0])
        continue
    line_data.append(script)
    df_data.append(line_data)

# id | Aspect | None | Mild | Moderate | Severe | Total_votes | Aspect_rating | text
df_all = pd.DataFrame(df_data, columns=['id', 'age_rating', 'Aspect', 'None', 'Mild', 'Moderate', 'Severe', 'Total_votes', 'Aspect_rating', 'text'])
df_all.drop(columns=["age_rating"], inplace=True)
df_output = pd.merge(df_all, df_embs,how='inner', on='id')
df_output.drop(columns=["text"], inplace=True)
df_output.rename(columns = {'sentences_emb':'text'}, inplace = True)
df_output.reset_index(drop=True, inplace=True)
df_output = df_output.astype({'Mild':'int', 'Moderate':'int', 'Severe':'int', 'None':'int', 'Total_votes':'int', 'Aspect_rating':'int'})
print(df_output)
print(df_output.shape)


            id       Aspect  None  Mild  Moderate  Severe  Total_votes  \
0    tt0032138       nudity    93     3         0       8          104   
1    tt0032138     violence    23    61         8       8          100   
2    tt0032138    profanity    86     4         0       5           95   
3    tt0032138      alcohol    76    10         1       6           93   
4    tt0032138  frightening    10    63        17      10          100   
..         ...          ...   ...   ...       ...     ...          ...   
250  tt0080761       nudity    19   113       145      60          337   
251  tt0080761     violence     6     8        36     147          197   
252  tt0080761    profanity     9   120        36       4          169   
253  tt0080761      alcohol    10   129        18       4          161   
254  tt0080761  frightening     6    16        45     111          178   

     Aspect_rating                                               text  
0                0  [[tensor(0.1276), t

In [18]:
import pandas as pd
import pickle

def map_data_frame_to_pickle(data_frame, file_name, path='./'):
    file_name = f'{file_name}_emb.pkl'
    with open(path + file_name, 'wb') as f:
        data_frame.to_pickle(f, protocol=4)

def split_df_into_test_and_train(aspect_name, df, path='./'):
    df_train = df.sample(frac=0.85, random_state=0)
    df_test = df.drop(df_train.index)
    map_data_frame_to_pickle(df_train, f'{aspect_name}_train', '../data/pickle/')
    map_data_frame_to_pickle(df_test, f'{aspect_name}_test', '../data/pickle/')
    map_data_frame_to_pickle(df_test, f'{aspect_name}_dev', '../data/pickle/')




In [19]:
# group the dataframe by 'aspect' and create a dictionary of dataframes
df_dict = {aspect: aspect_df.drop('Aspect', axis=1) for aspect, aspect_df in df_output.groupby('Aspect')}

# print the dictionary of dataframes
for aspect, aspect_df in df_dict.items():
    print(aspect, len(aspect_df))
    split_df_into_test_and_train(aspect, aspect_df, '../data/pickle/emb_files/')


alcohol 51
frightening 51
nudity 51
profanity 51
violence 51


In [20]:
 # Test the pickle generation

import pickle as pkl
import pandas as pd

 # map_data_frame_to_pickle(df, 'train_emb')

with open("../data/pickle/emb_files/alcohol_dev_emb.pkl", "rb") as f:
    object = pkl.load(f)
    
df = pd.DataFrame(object)
print(df)
#print(df['text_emb'][78].shape)
#df.to_csv(r'train_emb.csv')

            id  None  Mild  Moderate  Severe  Total_votes  Aspect_rating  \
3    tt0032138    76    10         1       6           93              0   
18   tt0047396    18    87         5       0          110              1   
48   tt0056869    10    53         6       0           69              1   
98   tt0066026     5    11         6       0           22              1   
198  tt0076759    79   157         5       5          246              1   
223  tt0078841     3    15         0       1           19              1   
238  tt0080339     7    68        38       9          122              1   
248  tt0080745    20     6         1       0           27              0   

                                                  text  
3    [[tensor(0.1276), tensor(-0.1368), tensor(0.79...  
18   [[tensor(-0.9443), tensor(-0.1430), tensor(0.1...  
48   [[tensor(-0.3116), tensor(-0.1049), tensor(0.4...  
98   [[tensor(-0.5252), tensor(0.4829), tensor(0.04...  
198  [[tensor(0.0440), tensor(