### Cleanup metadata

This notebook transforms the data after validation. It also removes duplicates and picks the best screenplay among them.

In [1]:
import pandas as pd
import os

In [2]:
METADATA_DIR = "../../data/8_screenplays/1_validation"
# read metadata before validation
bfr_valid = pd.read_csv(f'{METADATA_DIR}/validation.csv', dtype={'imdb_id': str, 'alt_id': str})

In [3]:
aftr_valid = pd.read_csv(f'{METADATA_DIR}/validated_file.csv', header=None, dtype={0: str, 3: str})

In [4]:
bfr_valid.head()

Unnamed: 0,imdb_id,title,filename,match,alt_id,script_url,notes
0,147800,10 Things I Hate About You,10-Things-I-Hate-About-You,0,147800,https://imsdb.com/scripts/10-Things-I-Hate-Abo...,
1,417385,12 and Holding,12-and-Holding,0,417385,https://imsdb.com/scripts/12-and-Holding.html,
2,3148266,12 Monkeys,12-Monkeys,0,3148266,https://imsdb.com/scripts/12-Monkeys.html,
3,2024544,12 Years a Slave,12-Years-a-Slave,0,2024544,https://imsdb.com/scripts/12-Years-a-Slave.html,
4,1542344,127 Hours,127-Hours,0,1542344,https://imsdb.com/scripts/127-Hours.html,


In [5]:
aftr_valid.columns = ['imdb_id', 'title', 'match', 'alt_id', 'script_url', 'notes']

In [6]:
aftr_valid.shape, bfr_valid.shape

((1653, 6), (1653, 7))

In [7]:
aftr_valid['filename'] = bfr_valid['filename']

In [8]:
aftr_valid.head()

Unnamed: 0,imdb_id,title,match,alt_id,script_url,notes,filename
0,147800,10 Things I Hate About You,1,,https://imsdb.com/scripts/10-Things-I-Hate-Abo...,,10-Things-I-Hate-About-You
1,417385,12 and Holding,1,,https://imsdb.com/scripts/12-and-Holding.html,,12-and-Holding
2,3148266,12 Monkeys,0,114746.0,https://imsdb.com/scripts/12-Monkeys.html,Given id is tv show,12-Monkeys
3,2024544,12 Years a Slave,1,,https://imsdb.com/scripts/12-Years-a-Slave.html,,12-Years-a-Slave
4,1542344,127 Hours,1,,https://imsdb.com/scripts/127-Hours.html,,127-Hours


In [9]:
aftr_valid['id_merged'] = aftr_valid['alt_id'].fillna(aftr_valid['imdb_id'])

In [10]:
aftr_valid.head()

Unnamed: 0,imdb_id,title,match,alt_id,script_url,notes,filename,id_merged
0,147800,10 Things I Hate About You,1,,https://imsdb.com/scripts/10-Things-I-Hate-Abo...,,10-Things-I-Hate-About-You,147800
1,417385,12 and Holding,1,,https://imsdb.com/scripts/12-and-Holding.html,,12-and-Holding,417385
2,3148266,12 Monkeys,0,114746.0,https://imsdb.com/scripts/12-Monkeys.html,Given id is tv show,12-Monkeys,114746
3,2024544,12 Years a Slave,1,,https://imsdb.com/scripts/12-Years-a-Slave.html,,12-Years-a-Slave,2024544
4,1542344,127 Hours,1,,https://imsdb.com/scripts/127-Hours.html,,127-Hours,1542344


We have to remove scripts for which we couldn't find alternate ids. Like unproduced scripts.

In [11]:
clean_1 = aftr_valid[~((aftr_valid['match'] == 0) & (aftr_valid['alt_id'].isnull()))]

In [12]:
clean_1.shape

(1614, 8)

Now, we'll look into duplicates.

Duplicates could arise from having the same script from multiple sources. Also, if initial matching was wrong and two same scripts were identified as different, now, they'll be duplicates after the validation.

In [13]:
dups = clean_1[clean_1['id_merged'].duplicated()]

To choose between the duplicates, we'll see the characters info that was parsed with the script.

In [14]:
clean_1['char_fname'] = clean_1['filename'] + '_charinfo.txt'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_1['char_fname'] = clean_1['filename'] + '_charinfo.txt'


In [20]:
BASE_DIR = '../../data/8_screenplays/parsed_screenplays/parsed/'
movie_chars = os.listdir(os.path.join(BASE_DIR, 'charinfo'))
rows = []

for item in movie_chars:
  fname = item
  if len(clean_1.loc[clean_1['char_fname'] == item, 'imdb_id']) == 0: continue
  imdb = clean_1.loc[clean_1['char_fname'] == item, 'imdb_id'].values[0]
  alt_id = clean_1.loc[clean_1['char_fname'] == item, 'id_merged'].values[0]

  file_path = os.path.join(BASE_DIR, 'charinfo', item)
  with open(file_path, 'r', encoding='utf-8') as f:
    data = f.read()
    if data == '': print(file_path, data)
    for line in data.split('\n'):
      if line == '': continue
      item = {}
      # char = re.split(":(?= [\d]+)", line)
      index = line.rfind(":")
      # print(char)
      item['imdb'] = imdb
      item['alt_id'] = alt_id
      item['char'] = line[:index].strip()
      item['utterances'] = line[index+1:].strip()
      item['fname'] = fname

      rows.append(item)

../../data/8_screenplays/parsed_screenplays/parsed/charinfo\Nightbreed_charinfo.txt 
../../data/8_screenplays/parsed_screenplays/parsed/charinfo\Sexual-Life_charinfo.txt 
../../data/8_screenplays/parsed_screenplays/parsed/charinfo\Space-Ball_charinfo.txt 
../../data/8_screenplays/parsed_screenplays/parsed/charinfo\Spaceballs_charinfo.txt 


In [21]:
df_temp = pd.DataFrame(rows)
df_temp['utterances'] = pd.to_numeric(df_temp['utterances'], errors='coerce')

In [22]:
df_temp.shape

(86436, 5)

In [23]:
df_temp.head()

Unnamed: 0,imdb,alt_id,char,utterances,fname
0,147800,147800,BARTENDER,2,10-Things-I-Hate-About-You_charinfo.txt
1,147800,147800,BIANCA,131,10-Things-I-Hate-About-You_charinfo.txt
2,147800,147800,BIANCA AND WALTER,1,10-Things-I-Hate-About-You_charinfo.txt
3,147800,147800,BOGEY,1,10-Things-I-Hate-About-You_charinfo.txt
4,147800,147800,BOY,2,10-Things-I-Hate-About-You_charinfo.txt


Now, we'll check for matching between two duplicate scripts.

In [24]:
df_selected = pd.DataFrame()

for index, row in dups.iterrows():
  id = row['id_merged']

  movies = clean_1.loc[clean_1['id_merged'] == id]
  max = -1
  maxfile = ""

  for each in movies['filename']:
    fname = each + '_charinfo.txt'

    chars = df_temp.loc[df_temp['fname'] == fname]

    # we choose the duplicate that has the most characters as a metric
    if len(chars['char']) > max:
      max = len(chars['char'])
      maxfile = each

  df_selected = pd.concat([df_selected, movies.loc[movies['filename'] == maxfile]])
  # print(maxfile, max)

If there were more than one duplicates in dups dataframe for a particular movie, we'll compare among the duplicates each time we encounter it. Therefore, we'll have to again remove duplicates from the resulting dataframe as we may have compared the same movie multiple times. Example, 8MM has 3 scripts. 2 of them are in the dups file. Each time we encounter the duplicate in dups, we'll compare and select one among the three scripts, resulting in 2 selections.

In [25]:
df_selected[df_selected['id_merged'].duplicated()]

Unnamed: 0,imdb_id,title,match,alt_id,script_url,notes,filename,id_merged,char_fname
17,134273,8MM,1,,https://imsdb.com/scripts/8MM.html,,8MM,134273,8MM_charinfo.txt
931,53559,13 Ghosts,0,245674.0,https://www.dailyscript.com/scripts/thirteen_g...,Older movie preferred,13-Ghosts,245674,13-Ghosts_charinfo.txt
1136,111686,New Nightmare,1,,https://www.dailyscript.com/scripts/WesCravens...,,New-Nightmare,111686,New-Nightmare_charinfo.txt
564,70379,Mean Streets,1,,https://imsdb.com/scripts/Mean-Streets.html,,Mean-Streets,70379,Mean-Streets_charinfo.txt
170,112573,Braveheart,1,,https://imsdb.com/scripts/Braveheart.html,,Braveheart,112573,Braveheart_charinfo.txt
354,104348,Glengarry Glen Ross,1,,https://imsdb.com/scripts/Glengarry-Glen-Gross...,,Glengarry-Glen-Gross,104348,Glengarry-Glen-Gross_charinfo.txt


Now, we'll first remove all duplicates, then add back the selected ones among the duplicates.

In [26]:
clean_2 = clean_1.drop_duplicates(subset=['id_merged'], keep=False)

In [27]:
clean_2.shape

(1380, 9)

In [28]:
df_selected_clean = df_selected.drop_duplicates(subset=['id_merged'])

In [29]:
clean_2 = pd.concat([clean_2, df_selected_clean])

In [30]:
clean_2

Unnamed: 0,imdb_id,title,match,alt_id,script_url,notes,filename,id_merged,char_fname
1,0417385,12 and Holding,1,,https://imsdb.com/scripts/12-and-Holding.html,,12-and-Holding,0417385,12-and-Holding_charinfo.txt
3,2024544,12 Years a Slave,1,,https://imsdb.com/scripts/12-Years-a-Slave.html,,12-Years-a-Slave,2024544,12-Years-a-Slave_charinfo.txt
4,1542344,127 Hours,1,,https://imsdb.com/scripts/127-Hours.html,,127-Hours,1542344,127-Hours_charinfo.txt
6,0179626,15 Minutes,1,,https://imsdb.com/scripts/15-Minutes.html,,15-Minutes,0179626,15-Minutes_charinfo.txt
7,0974661,17 Again,1,,https://imsdb.com/scripts/17-Again.html,,17-Again,0974661,17-Again_charinfo.txt
...,...,...,...,...,...,...,...,...,...
1287,0073802,Three Days of the Condor,1,,http://www.awesomefilm.com/script/ThreeDaysoft...,,Three-Days-of-the-Condo,0073802,Three-Days-of-the-Condo_charinfo.txt
932,4385888,20th Century Women,1,,https://www.dailyscript.com/scripts/20TH_CENTU...,,20th-Century-Woman,4385888,20th-Century-Woman_charinfo.txt
1035,4882376,First They Killed My Father,1,,https://www.dailyscript.com/scripts/FTKMF.pdf,,FTKMF-(First-They-Killed-My-Father,4882376,FTKMF-(First-They-Killed-My-Father_charinfo.txt
1566,7440732,The Kids Are Alright,0,0842926,https://s3-us-west-2.amazonaws.com/screenplays...,Newer tv show preferred,Kids-Are-Alright-The,0842926,Kids-Are-Alright-The_charinfo.txt


In [31]:
clean_2[clean_2['id_merged'].duplicated()]

Unnamed: 0,imdb_id,title,match,alt_id,script_url,notes,filename,id_merged,char_fname


In [32]:
clean_2.to_csv(f'{METADATA_DIR}/clean_validated.csv', index=False)