# Visual Analytics - Text Mining Lord of the Rings Movie Scripts

In [133]:
# Setup
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import csv

In [134]:
# load data - https://www.kaggle.com/paultimothymooney/lord-of-the-rings-data
scripts = pd.read_csv('data/lotr_scripts.csv', index_col=0)
chars = pd.read_csv('data/lotr_characters.csv')

In [135]:
scripts.head(3)

Unnamed: 0,char,dialog,movie
0,DEAGOL,"Oh Smeagol Ive got one! , Ive got a fish Smeag...",The Return of the King
1,SMEAGOL,"Pull it in! Go on, go on, go on, pull it in!",The Return of the King
2,DEAGOL,Arrghh!,The Return of the King


In [136]:
chars.head(3)

Unnamed: 0,birth,death,gender,hair,height,name,race,realm,spouse
0,,,Female,,,Adanel,Men,,Belemir
1,TA 2978,"February 26 ,3019",Male,Dark (book) Light brown (movie),,Boromir,Men,,
2,,"March ,3019",Male,,,Lagduf,Orcs,,


In [159]:
# load books.txt https://github.com/tianyigu/Lord_of_the_ring_project/tree/master/LOTR_code/lotr_script_scripy/lotr
# read textfile
lotr1 = pd.read_csv('data/book1.txt', sep='\n', header=None, encoding='utf-8', quoting=csv.QUOTE_NONE)
lotr2 = pd.read_csv('data/book2.txt', sep='\n', header=None, encoding='utf-8', quoting=csv.QUOTE_NONE)
lotr3 = pd.read_csv('data/book3.txt', sep='\n', header=None, encoding='utf-8', quoting=csv.QUOTE_NONE)

# change column name to 'text'
lotr1.columns = ['text']
lotr2.columns = ['text']
lotr3.columns = ['text']

In [160]:
# lotr1: remove title, contents, foreword and prologue
idx = lotr1[lotr1['text'].str.contains('\* BOOK I \*')].index 
lotr1 = lotr1.iloc[idx[0]:, :]

# lotr2: remove title
idx = lotr2[lotr2['text'].str.contains('\* BOOK III \*')].index 
lotr2 = lotr2.iloc[idx[0]:, :]

# lotr3: remove title
idx = lotr3[lotr3['text'].str.contains('\* BOOK V \*')].index 
lotr3 = lotr3.iloc[idx[0]:-18, :]

In [161]:
# combine books into one dataframe
lotr = pd.concat([lotr1, lotr2, lotr3]).reset_index(drop=True)

In [162]:
# Add book and chapter information
lotr['book'] = lotr['text'].str.extract(r'(\* BOOK .* \*)', expand=False)
lotr['chapter'] = lotr['text'].str.extract(r'(^Chapter .*)', expand=False)

# fill empty rows with last available value
lotr['book'] = lotr['book'].fillna(method='ffill')
lotr['chapter'] = lotr['chapter'].fillna(method='ffill')

In [163]:
lotr

Unnamed: 0,text,book,chapter
0,* BOOK I *,* BOOK I *,
1,Chapter 1 . A Long-expected Party,* BOOK I *,Chapter 1 . A Long-expected Party
2,When Mr. Bilbo Baggins of Bag End announced th...,* BOOK I *,Chapter 1 . A Long-expected Party
3,celebrating his eleventy -first birthday with ...,* BOOK I *,Chapter 1 . A Long-expected Party
4,"magnificence, there was much talk and exciteme...",* BOOK I *,Chapter 1 . A Long-expected Party
...,...,...,...
38506,"went. But Sam turned to Bywater, and so came b...",* BOOK VI *,Chapter 9 . The Grey Havens
38507,"ending once more. And he went on, and there wa...",* BOOK VI *,Chapter 9 . The Grey Havens
38508,"within; and the evening meal was ready, and he...",* BOOK VI *,Chapter 9 . The Grey Havens
38509,"him in, and set him in his chair, and put litt...",* BOOK VI *,Chapter 9 . The Grey Havens


In [172]:
# join text per chapter
lotr.groupby(['book', 'chapter'])['text'].apply(lambda x: len(' '.join(x).split()))

book         chapter                               
* BOOK I *   Chapter 1 . A Long-expected Party          9875
             Chapter 10. Strider                        5804
             Chapter 11. A Knife in the Dark            9368
             Chapter 12 . Flight to the Ford            8727
             Chapter 2 . The Shadow of the Past        11190
                                                       ...  
* BOOK VI *  Chapter 5 . The Steward and the King       7614
             Chapter 6. Many Partings                   7371
             Chapter 7 . Homeward Bound                 3989
             Chapter 8 . The Scouring of the Shire     10940
             Chapter 9 . The Grey Havens                4747
Name: text, Length: 67, dtype: int64