## Preprocess data to fit model format

In [2]:
import pandas as pd
import numpy as np

In [3]:
def get_script_from_id(id, script_dir='../data/script/'):
    """Load a movie script and flatten it to a single line of text.

    Reads ``<script_dir>/<id>.script`` in text mode and replaces every single
    quote, double quote, newline, carriage return, tab, backspace and
    backslash character with one space each.

    Parameters
    ----------
    id : str
        Identifier used as the file stem of the script file.
    script_dir : str, optional
        Directory containing the ``.script`` files. Defaults to the location
        this notebook has always read from.

    Returns
    -------
    str
        The script text with the characters listed above replaced by spaces.

    Raises
    ------
    OSError
        If the script file cannot be opened (e.g. it does not exist).
    """
    import os

    script_path = os.path.join(script_dir, id + '.script')
    # The context manager closes the handle even when read() raises.
    with open(script_path, 'r') as script_file:
        script = script_file.read()

    # One translation pass instead of seven chained str.replace calls.
    to_space = str.maketrans({char: ' ' for char in ("'", '"', '\n', '\r', '\t', '\b', '\\')})
    return script.translate(to_space)

In [5]:
# Build one record per line of the labels file and attach the matching script text.
# Every line is comma-separated; fields 3..6 are parsed as the integer vote counts
# for the four severity levels (none, mild, moderate, severe).
df_data = []
# The context manager closes the labels file once every line has been consumed.
with open('../data_gathering/baseline/output/imdb_id_with_age_rating_and_labels.txt') as inputFile:
    for line in inputFile:
        line_data = line.strip().split(',')
        votes = [int(line_data[i]) for i in range(3, 7)]
        line_data.append(sum(votes))  # total_votes

        # Index (0..3) of the severity level with the most votes.
        # NOTE(review): `>=` resolves ties toward the more severe level, so a row
        # with zero votes everywhere is labelled 3 (severe) — confirm this is intended.
        max_index = 0
        max_value = 0
        for i, vote_count in enumerate(votes):
            if vote_count >= max_value:
                max_index = i
                max_value = vote_count
        line_data.append(max_index)  # aspect_rating

        try:
            script = get_script_from_id(line_data[0])
        except (OSError, UnicodeDecodeError):
            # No readable script for this id: skip the row (best effort), but do
            # not swallow unrelated errors such as KeyboardInterrupt.
            continue
        line_data.append(script)
        df_data.append(line_data)

# imdb_id | age_rating | aspect | votes none | votes mild | votes moderate | votes severe | total_votes | aspect_rating | text
df = (
    pd.DataFrame(
        df_data,
        columns=['imdb_id', 'age_rating', 'aspect', 'votes none', 'votes mild', 'votes moderate',
                 'votes severe', 'total_votes', 'aspect_rating', 'text'],
    )
    .drop(columns=["age_rating"])
    .reset_index(drop=True)
    .astype({'votes mild': 'int', 'votes moderate': 'int', 'votes severe': 'int',
             'votes none': 'int', 'total_votes': 'int', 'aspect_rating': 'int'})
)
print(df)


        imdb_id       aspect  votes none  votes mild  votes moderate  \
0     tt0032138       nudity          93           3               0   
1     tt0032138     violence          23          61               8   
2     tt0032138    profanity          86           4               0   
3     tt0032138      alcohol          76          10               1   
4     tt0032138  frightening          10          63              17   
...         ...          ...         ...         ...             ...   
2387  tt2582846       nudity           9          68              30   
2388  tt2582846     violence          18          45               1   
2389  tt2582846    profanity           6          41              14   
2390  tt2582846      alcohol           3          46               8   
2391  tt2582846  frightening           5          17              43   

      votes severe  total_votes  aspect_rating  \
0                8          104              0   
1                8          100    

In [14]:
import pandas as pd
import pickle

def map_data_frame_to_pickle(data_frame, file_name, path='./'):
    """Serialize ``data_frame`` with pickle to ``<path>/<file_name>.pkl``.

    Parameters
    ----------
    data_frame : object
        Object to serialize; it is passed to ``pickle.dump`` unchanged.
    file_name : str
        File stem; the ``.pkl`` extension is appended.
    path : str, optional
        Target directory, with or without a trailing separator.
        The directory is not created here.

    Raises
    ------
    OSError
        If the target file cannot be opened for writing.
    """
    import os

    # os.path.join inserts the separator only when `path` lacks one, so
    # 'out' and 'out/' both resolve to 'out/<file_name>.pkl'.
    target = os.path.join(path, f'{file_name}.pkl')
    with open(target, 'wb') as f:
        pickle.dump(data_frame, f)

def split_df_into_test_and_train(aspect_name, df, path='./'):
    """Split ``df`` into train and test parts and pickle both.

    85% of the rows are sampled with ``random_state=0`` as the training set;
    the rows whose index labels were not sampled form the test set. The parts
    are written through ``map_data_frame_to_pickle`` under the file stems
    ``<aspect_name>_train`` and ``<aspect_name>_test``.

    Parameters
    ----------
    aspect_name : str
        Prefix of the two output file stems.
    df : pandas.DataFrame
        Frame to split. The test set is derived by dropping the sampled index
        labels, so duplicate index labels would remove extra rows.
    path : str, optional
        Directory handed to ``map_data_frame_to_pickle`` for both files.
    """
    df_train = df.sample(frac=0.85, random_state=0)
    df_test = df.drop(df_train.index)
    # Honor the caller-supplied directory; it used to be ignored in favor of a
    # hard-coded '../data/pickle/'.
    map_data_frame_to_pickle(df_train, f'{aspect_name}_train', path)
    map_data_frame_to_pickle(df_test, f'{aspect_name}_test', path)




In [15]:
# Build a mapping from each aspect value to its own frame. The 'aspect' column
# is removed from every piece because the dictionary key already carries it.
df_dict = {
    name: group.drop(columns='aspect')
    for name, group in df.groupby('aspect')
}

# Report the row count of every per-aspect frame, then write its train/test pickles.
for aspect in df_dict:
    aspect_df = df_dict[aspect]
    print(aspect, len(aspect_df))
    split_df_into_test_and_train(aspect, aspect_df, '../data/pickle/')


alcohol 478
        imdb_id  votes none  votes mild  votes moderate  votes severe  \
48    tt0056869          10          53               6             0   
128   tt0070849           2          21               8             0   
143   tt0072684           6          60               5             1   
158   tt0073440           0           8               9             0   
163   tt0073486           8         114              57             4   
...         ...         ...         ...             ...           ...   
2225  tt1403865           3          51              14             1   
2235  tt1411238           4          24               6             0   
2250  tt1446714           5          63               7             1   
2315  tt1602613           2           7              20             4   
2325  tt1637725           5           7              40            78   

      total_votes  aspect_rating  \
48             69              1   
128            31              1   
143

In [8]:
 # Test the pickle generation

import pickle as pkl
import pandas as pd

 # map_data_frame_to_pickle(df, 'train_emb')

# NOTE(review): pickle.load can execute arbitrary code; only load pickle files
# that this project produced itself.
# NOTE(review): the split step in this notebook writes '<aspect>_train.pkl'; the
# '_emb' file read here is presumably produced elsewhere — confirm it exists.
with open("../data/pickle/alcohol_train_emb.pkl", "rb") as f:
    # Named `train_emb` rather than `object` so the builtin `object` is not
    # shadowed for the rest of the kernel session.
    train_emb = pkl.load(f)

# NOTE(review): this rebinds `df`, replacing the labels frame built earlier in
# the notebook; re-run that cell before re-running the per-aspect split.
df = pd.DataFrame(train_emb)
df.to_csv(r'train_emb.csv')