In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import random

import spacy
from spacy import displacy
from spacy.pipeline import EntityRuler

import unicodedata

from matplotlib import pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, fname) for fname in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Load the Datasets and Clean**

In [None]:
# Character metadata (name, race, realm, ...) used to build the entity patterns.
characters = pd.read_csv('/kaggle/input/lord-of-the-rings-data/lotr_characters.csv')
characters.info()

# Movie script lines; the first CSV column is used as the row index.
scripts_raw = pd.read_csv(
    '/kaggle/input/lord-of-the-rings-data/lotr_scripts.csv',
    index_col=0,
)
scripts_raw.info()

# Build the Lists and Dictionaries

First we will find all of the unique realms from our character list. This list is not complete but it will give us a good head start.

We also normalize the text so that it is all in Latin-1, the format of our scripts.

In [None]:
# Collect every distinct realm name from the character table as a lower-cased,
# Latin-1-safe string. A single `realm` cell may list several realms separated
# by commas, so each cell is split and every part is cleaned on its own.
gpe = []

for realm_field in characters.realm.dropna().unique():
    for realm in realm_field.split(','):
        # NFKD decomposes accented letters; encoding to Latin-1 with 'ignore'
        # then drops whatever cannot be represented.
        realm = unicodedata.normalize('NFKD', realm.lower()).encode('latin-1', 'ignore').decode('latin-1')
        # Splitting on ',' leaves the blank that follows each comma; strip it so
        # the pattern does not start or end with whitespace.
        realm = realm.strip()
        # Skip empty fragments and realms that were already collected.
        if realm and realm not in gpe:
            gpe.append(realm)

len(gpe)

Next we will create a dictionary with all of the character races. We will be creating unique entities for each race, rather than tagging them as <code>PERSON</code>.

First we will clean up the name and race columns.

In [None]:
# Keep only characters with a known race, and reduce comma-separated race
# listings to their first entry.
characters = characters.dropna(subset=['race']).assign(
    race=lambda frame: frame['race'].str.split(',').str[0]
)

In [None]:
# Inspect the distinct race labels to spot plural or variant spellings.
characters['race'].unique()

In [None]:
# Map plural / adjectival race labels onto one singular form per race.
# Series.replace matches whole values literally unless regex=True is passed,
# so 'Dwarves' and 'Dwarven' are listed as separate keys instead of being
# combined into a 'Dwarves|Dwarven' alternation (which would never match).
race_singular = {
    'Orcs': 'Orc',
    'Dwarves': 'Dwarf',
    'Dwarven': 'Dwarf',
    'Elves': 'Elf',
    'Hobbits': 'Hobbit',
    'Dragons': 'Dragon',
}
characters['race'] = characters['race'].replace(race_singular)

In [None]:
# Drop rows without a name, then rewrite each name so it only contains
# characters representable in Latin-1 (NFKD-decompose, discard the rest).
characters = characters.dropna(subset=['name'])
characters['name'] = characters['name'].map(
    lambda raw: unicodedata.normalize('NFKD', raw).encode('latin-1', 'ignore').decode('latin-1')
)

In [None]:
# Map lower-cased character names (full name and first name) to an upper-cased
# race label. The first row that claims a key wins; later rows never overwrite.
character_dict = {}
# First words that are titles/adjectives rather than real first names.
common = ['Master', 'Mrs.', 'Great']

for _, row in characters.iterrows():
    race_label = row['race'].upper()

    # Keys are stored lower-cased, so the membership test must use the
    # lower-cased name as well; testing the cased name never matches.
    full_key = row['name'].lower()
    if len(full_key) > 3 and full_key not in character_dict:
        character_dict[full_key] = race_label

    first_name = row['name'].split(' ')[0]
    first_key = first_name.lower()
    if first_name not in common and len(first_key) > 3 and first_key not in character_dict:
        character_dict[first_key] = race_label

print(len(character_dict))

In [None]:
# Check whether the alias 'mithrandir' ended up as a key in character_dict.
'mithrandir' in character_dict

# Configure Custom Components

In [None]:
%%time
# Load spaCy's small English model; the custom 'realms' and 'races' entity
# rulers are added to this pipeline in later cells.
nlp = spacy.load('en_core_web_sm')
# Show the component names of the freshly loaded pipeline.
print(nlp.pipe_names)

In [None]:
# Remove a previously registered copy so this cell can be re-run without
# raising a "name already exists in pipeline" error.
if 'realms' in nlp.pipe_names:
    nlp.remove_pipe('realms')

# Entity ruler that tags known realm names as GPE.
# `phrase_matcher_attr` is the keyword EntityRuler recognises; an unknown
# keyword such as `phrase_matcher` is swallowed by **cfg and matching then
# stays case-sensitive, so the lower-cased patterns would miss "Mordor" etc.
realms = EntityRuler(nlp, overwrite_ents=True, phrase_matcher_attr='LOWER')

# Register all patterns in one call instead of one call per realm.
realms.add_patterns([{'label': 'GPE', 'pattern': country} for country in gpe])

realms.name = 'realms'
nlp.add_pipe(realms)

In [None]:
# Remove a previously registered copy so this cell can be re-run without
# raising a "name already exists in pipeline" error.
if 'races' in nlp.pipe_names:
    nlp.remove_pipe('races')

# Entity ruler that tags character names with their race label.
# `phrase_matcher_attr` is the keyword EntityRuler recognises; an unknown
# keyword such as `phrase_matcher` is swallowed by **cfg and matching then
# stays case-sensitive, so the lower-cased names would miss "Frodo" etc.
races = EntityRuler(nlp, overwrite_ents=True, phrase_matcher_attr='LOWER')

# Register all patterns in one call instead of one call per character.
races.add_patterns([{'label': race, 'pattern': name} for name, race in character_dict.items()])

races.name = 'races'
nlp.add_pipe(races)

In [None]:
# Confirm that the 'realms' and 'races' components are now part of the pipeline.
print(nlp.pipe_names)

# Clean the Dialog
We keep only dialog lines that are longer than 20 characters and mention a character whose name is longer than five characters.

In [None]:
%%time
scripts = pd.DataFrame()
for i, row in scripts_raw.dropna(subset=['dialog']).iterrows():
    for char in list(character_dict.keys()):
        if (char in row.dialog.lower()) and len(row.dialog)>20 and len(char)>5:
            scripts = scripts.append(row)
            break
            
scripts = scripts.reset_index(drop=True)

scripts.sample(20)

# Apply the NLP Pipeline

In [None]:
%%time

docs = []

for doc in nlp.pipe(iter(scripts.dialog)):
    docs.append(doc)
    
print(len(docs))

In [None]:
# Pick one processed line at random; assign `number` directly (e.g. 20 or 134)
# to look at a fixed example instead.
number = random.randrange(len(docs))

# `scripts` was re-indexed 0..n-1, so the label lookup lines up with `docs`.
header = [
    ('Document Number: ', number),
    ('Movie Title:     ', scripts.loc[number, 'movie']),
    ('Character:       ', scripts.loc[number, 'char']),
]
for caption, value in header:
    print(caption, value)

# Render the recognised entities inline.
displacy.render(docs[number], style='ent')

In [None]:
# Hand-written (label, name) pairs added to the races ruler on top of the
# patterns generated from character_dict.
# NOTE(review): Docs created before this cell presumably keep their old
# entities; re-run the pipeline cell to see the effect. Confirm.
manual_races = [
    ('MEN', 'eowyn'),
    ('MEN', 'eomer'),
    ('HOBBIT', 'sam'),
    ('HOBBIT', 'merry'),
    ('HOBBIT', 'pippin'),
    ('MAIAR', 'mithrandir'),
]
races.add_patterns([{'label': label, 'pattern': name} for label, name in manual_races])

In [None]:
# Additional place names to tag as GPE on top of the realms taken from the
# character table.
extra_gpe = ['middle earth','mordor','minas tirith','helms deep',"helm's deep",'white tower','isengard']

# The patterns are built without a loop variable named `gpe`: that name would
# overwrite the module-level `gpe` list with a single string and break any
# re-run of the cell that builds the realms ruler.
realms.add_patterns([{'label': 'GPE', 'pattern': place} for place in extra_gpe])