# Novel Summarizer

## Imports

In [1]:
import subprocess
import pandas as pd
import numpy as np
import csv
import json
import nltk
import sys
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
import search_google.api
import urllib.request as requests
from bs4 import BeautifulSoup



## Path Conf

In [2]:
path_data_index = '../data/harry_potter/index.csv'
path_book_nlp_out = '../../dependencies/book_nlp_output/'
path_bing_neg_score = '../dependencies/bing_sentiment_lexicons/negative-words-labelled.xlsx'
path_output_ch = '../output/top_close_characters.xlsx'
path_output_int = '../output/character_integrity.xlsx'
path_output_ch_rel = '../output/character_relationship.xlsx'

## Initializing

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
nltk_senti_analyzer = SentimentIntensityAnalyzer()

## Parameter Configurations

In [4]:
# Config Params DO NOT CHANGE
cur_story_index = 6 # 10 for pp, 6 for hp
n_close_chars = 3 
n_top_chars = 10
keep_pos = ['JJ','JJS','JJR','RR','RBR','RBS','VB','VBD','VBG','VBN','VBP','VBZ']
window = 0
n_query_max_page = 5
n_query_per_page = 10
concord_left_margin = 5
concord_right_margin = 5
n_concord = -1

## API Conf

In [5]:
buildargs = {
  'serviceName': 'customsearch',
  'version': 'v1',
  'developerKey': 'AIzaSyBAEWHb_bLgqpmY_NY50ykZDz9JI_ZH1GQ'
}

cseargs = {
  'q': '',
  'cx': '009620542372427651480:rc6g5pqmltw',
  'num': n_query_per_page,
  'start':1
}

## Utils

In [6]:
def readJsonFile(jsonPath):
    with open(jsonPath) as f:
        data = json.load(f)
    return data

In [7]:
def removeKeysFromDict(dict_vals,keepList):
    dict_ret = {}
    for k in keepList:
        if(k in dict_vals):
            dict_ret[k] = dict_vals[k]
    return dict_ret

In [8]:
def isObject(word):
    wn_word = word.lower() + '.n.01'
    try:
        wn_lemma = wn.synset(wn_word)
    except:
        return False
    
    l_hyper = str(list(wn_lemma.closure(lambda s: s.hypernyms())))
    if('object.n.01' in l_hyper and 'living_thing.n.01' not in l_hyper):
        return True
    else:
        return False 

In [9]:
def getWordnetPos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wn.ADJ
    elif treebank_tag.startswith('V'):
        return wn.VERB
    elif treebank_tag.startswith('N'):
        return wn.NOUN
    elif treebank_tag.startswith('R'):
        return wn.ADV
    else:
        return ''

In [10]:
def queryLink(url):
    try:
        html = requests.urlopen(url).read().decode('utf8')
    except:
        return []
    
    return nltk.word_tokenize(BeautifulSoup(html,'lxml').get_text())

In [11]:
def lemmatizeWord(word,pos_tag):
    pos_tag = getWordnetPos(pos_tag)
    if(pos_tag != ''):
        return lemmatizer.lemmatize(word,pos_tag)
    else:
        return lemmatizer.lemmatize(word)

In [12]:
def getConcordance(target_word, main_text, left_margin = concord_left_margin,\
                   right_margin = concord_right_margin,\
                   n_concord = n_concord,is_pre_process=True):     
    if(n_concord == -1):
        n_concord = np.inf

    concord_text = []
    cur_iter = 0
    while(cur_iter < n_concord):
        if(target_word in main_text):
            index = main_text.index(target_word)
        else:
            break

        concord_text += main_text[index - left_margin : index + right_margin + 1]

        if(index+1 < len(main_text)):
            main_text = main_text[index+1:]
        else:
            break

        cur_iter += 1
    if(is_pre_process):
        concord_text = preProcessConcordText(concord_text)
    return concord_text

In [13]:
def getUnqColVals(df,col_name):
    return list(set(df[col_name]))

In [14]:
def retMatchAccuracy(list1,list2,div_by_2=True):
    n = len(set(list1).intersection(set(list2)))
    if(div_by_2):
        return n/len(list2)
    return n/list(1)

In [15]:
def retMatchAccuracyForTuple(list1,list2):
    if(len(list1) != len(list2)):
        return 0
    
    if(len(list1) == 0):
        return 0
    
    n = len(list1)
    acc = 0
    hit_count = 0
    for tup1,tup2 in zip(list1,list2):
        if(len(tup1) != len(tup2)):
            return 0
        
        acc += len(set(tup1).intersection(set(tup2)))/len(tup1)
        hit_count += len(set(tup1).intersection(set(tup2)))
        
    return acc/n,hit_count

In [16]:
def exactListMatch(list1,list2):
    if(len(list1) != len(list2)):
        return 0
    
    if(len(list1) == 0):
        return 0
    
    acc = len([i for i, j in zip(list1, list2) if i == j])
    
    return acc/len(list1)

## Helper Methods

In [17]:
def initializeCharacterMatrix(df):
    character_distance_matrix = np.empty((len(df), len(df)))
    character_distance_matrix[:] = np.inf
    return character_distance_matrix

In [18]:
def createBookNLPCommand(path,story_id,isForce,isReturnPath=False):
    jsonPath = path_book_nlp_out + story_id
    tokensPath = path_book_nlp_out + story_id + '.tokens'
    
    command = './runjava novels/BookNLP -doc ../'
    command += path
    command += ' -p '
    command += jsonPath
    command += ' -tok '
    command += tokensPath
    if(isForce):
        command += ' -f'
    
    if(isReturnPath):
        return command, jsonPath +'/book.id.book', tokensPath
    else:
        return command

In [19]:
def getBookNLPTokens(path,story_id,isForce=False):
    command,jsonPath,tokensPath = createBookNLPCommand(path,story_id,isForce,True)
    p = subprocess.Popen(command, shell=True, cwd='../resources/book-nlp-master')
    retval = p.wait()
    return jsonPath[3:],tokensPath[3:],retval

In [20]:
def getCharacterIDFromIndex(indices,df,lookup_col):
    ret = []
    for i in indices:
        ret.append(df[lookup_col][i])
    return ret

In [21]:
def getCharacterDF(df,n_top_chars=None):
    df = df.loc[df.characterId != -1]
    df_ch = df.groupby('characterId')['tokenId'].apply(list).reset_index(name='tokens')
    df_ch.tokens = df_ch.tokens.apply(sorted)
    df_ch['tok_len'] = df_ch.tokens.apply(len)
    
    if(n_top_chars is not None):
        df_ch = df_ch.sort_values(by='tok_len',ascending=False).reset_index(drop=True)[:n_top_chars]
        df_ch = df_ch.sort_values(by='tok_len').reset_index(drop=True)
    else:
        df_ch = df_ch.sort_values(by='tok_len',ascending=False).reset_index(drop=True)
        
    return df_ch

In [22]:
def getCharacterDistanceMatrix(df_ch):
    character_distance_matrix = initializeCharacterMatrix(df_ch)

    for index1,token1 in enumerate(df_ch.tokens):
        for index2,token2 in enumerate(df_ch.tokens):
            if(index1 == index2):
                continue
            elif(index1 < index2):
                iter_token = token1
                cons_token = token2
            else:
                iter_token = token2
                cons_token = token1

            dist = 0
            for tok in iter_token:
                dist += np.min(abs(np.array(cons_token)-tok))
            character_distance_matrix[index1][index2] = (dist/(len(iter_token)))
            
    return character_distance_matrix

In [23]:
def getNNPCount(json_data,character_id):
    characters = json_data['characters']
    for ch in characters:
        if(ch['id'] == character_id):
            if('NNPcount' in ch):
                return ch['NNPcount']
            else:
                return 0

In [24]:
def getTopNCharacters(json_data,list_character_id,n):
    dict_nnp_count = {}
    for ch in list_character_id:
        dict_nnp_count[ch] = getNNPCount(json_data,ch)
    return sorted(list_character_id, key=dict_nnp_count.get)[-n:]

In [25]:
def getTopNCloseCharacters(df,df_ch,json_book_data,character_distance_matrix,n_top_chars=10,n_close_chars=3):
    closest_characters = {}
    for index,character_distance_vector in enumerate(character_distance_matrix):
        min_distance_characters = getCharacterIDFromIndex(np.argsort(character_distance_vector)[:n_close_chars],\
                                                          df_ch,'characterId')
        i_ch = getCharacterIDFromIndex([index],df_ch,'characterId')
        closest_characters[i_ch[0]] = min_distance_characters
        
    list_character_id = set(df.characterId)
    list_character_id.remove(-1)
    list_character_id = list(list_character_id)
    
    top_n_ch = getTopNCharacters(json_book_data,list_character_id,n_top_chars)
    
    return removeKeysFromDict(closest_characters,top_n_ch)

In [26]:
def returnIDNameDict(json_data):
    dict_char = {}
    characters = json_data['characters']
    for ch in characters:
        dict_char[ch['id']] = ch['names'][0]['n']
    return dict_char

In [27]:
def transformDictIDToName(dict_characters,dict_id_char,isOnlyKey=True):
    dict_ret = {}
    for key,val in dict_characters.items():
        if(isOnlyKey):
            dict_ret[dict_id_char[key]]  = val
        else:
            cur = []
            for ch in val:
                 cur.append(dict_id_char[ch])
            dict_ret[dict_id_char[key]] = cur
            
    return dict_ret

In [28]:
def getEquivalentSentId(df,list_token_id):
    sentId = set()
    for token_id in list_token_id:
        sentId.add(df.loc[df.tokenId == token_id].sentenceID.reset_index(drop=True)[0])
    return list(sentId)

In [29]:
def getSentTextById(df,list_sent_id,window=0,keep_pos=None,is_remove_stop_words=True,is_lemmatize=True,isStr = False):
    window_sent_id = []
    for sent_id in list_sent_id:
        window_sent_id += list(range(sent_id-window,sent_id+window+1))
        
    df_sent_window = df.loc[df.sentenceID.isin(window_sent_id)]
    
    if(keep_pos is not None and len(df_sent_window) != 0):
        df_sent_window = df_sent_window.loc[df_sent_window.pos.isin(keep_pos)]
    
    if(is_remove_stop_words and len(df_sent_window) != 0):
        df_sent_window = df_sent_window.loc[~df_sent_window.lemma.isin(stop_words)]
        
    if(is_lemmatize and len(df_sent_window) != 0):
        df_sent_window.lemma = df_sent_window.apply(lambda row: lemmatizeWord(row['lemma'], row['pos']), axis=1)
    
    if(isStr):
        return ' '.join(list(df_sent_window.lemma))
    
    return list(df_sent_window.lemma)

In [30]:
def getSpeechText(json_data,character_id,window,keep_pos):
    list_token_id = []
    characters = json_data['characters']
    for ch in characters:
        if(ch['id'] == character_id):
            if('speaking' in ch):
                for speech in ch['speaking']:
                    list_token_id.append(speech['i'])
                
    list_sent_id = getEquivalentSentId(df,list_token_id)
    return getSentTextById(df,list_sent_id,window,keep_pos)

In [31]:
def getAgentVerbs(json_data,character_id):
    list_agent_verbs = []
    characters = json_data['characters']
    for ch in characters:
        if(ch['id'] == character_id):
            if('agent' in ch):
                for agent in ch['agent']:
                    list_agent_verbs.append(lemmatizeWord(agent['w'],'V'))
    return list_agent_verbs

In [32]:
def getWindowText(df,character_id,window,keep_pos):
    df_ch = df.groupby('characterId')['sentenceID'].apply(set).reset_index(name='sentence')
    ch_sents = list(df_ch.loc[df_ch.characterId == character_id].sentence.reset_index(drop=True)[0])
    return getSentTextById(df,ch_sents,window,keep_pos)

In [33]:
def getObjectsInPossession(json_data,character_id):
    list_obj_poss_words = []
    characters = json_data['characters']
    for ch in characters:
        if(ch['id'] == character_id):
            if('poss' in ch):
                for poss in ch['poss']:
                    if(isObject(poss['w'])):
                        list_obj_poss_words.append(lemmatizeWord(poss['w'],'N'))
    return list_obj_poss_words

In [34]:
def getLinks():
    results = search_google.api.results(buildargs, cseargs)
    return results.links

In [35]:
def preProcessConcordText(concord_text):
    ret = []
    pos = nltk.pos_tag(concord_text)
    for word,tag in pos:
        if(tag in keep_pos):
            ret.append(word.lower())
    return ret

In [36]:
def webScrapeData(topic):
    book_web_scraped_data = []
    cseargs['q'] = topic

    for i in range(n_query_max_page):
        cseargs['start'] = i + 1
        links = getLinks()
        for url in links:
            book_web_scraped_data += queryLink(url)

    return book_web_scraped_data

In [37]:
def getFeatureText(json_book_data,book_web_scraped_data,character_id,dict_id_char,book_name):
    feature_text = []

#   Using Novel Data    
    feature_text += getSpeechText(json_book_data,character_id,window,keep_pos)
    feature_text += getAgentVerbs(json_book_data,character_id)
#     feature_text += getObjectsInPossession(json_book_data,character_id)
#     feature_text += getWindowText(df,character_id,window,keep_pos)
    character_name = dict_id_char[character_id]
    
#   Using Web Data
    feature_text += getConcordance(character_name, book_web_scraped_data)
    topic = character_name + '\'s Character Analysis from ' + book_name
    ch_web_data = webScrapeData(topic)
    feature_text += getConcordance(character_name, ch_web_data)
    
    return feature_text

In [38]:
def getSentimentScore(feature_text,df_neg_labelled,nnp_count):
    if(feature_text is None):
        return 0
    
    if(len(feature_text) == 0):
        return 0
    
    pos_score = 0
    neg_score = 0
    list_neg_words = list(df_neg_labelled.list_neg_text)
    hit_count = 0
    
    for word in feature_text:
        if(word in list_neg_words):
            hit_count += 1
            score = df_neg_labelled.loc[list_neg_words.index(word)].neg_polarity
            neg_score += score
            
    return (neg_score)/hit_count

In [39]:
def normalizeSentiScore(senti_score,factor):
    return senti_score * factor

In [40]:
def getLabelFromSentiScore(senti_score,threshold):
    if(senti_score > threshold):
        return 0
    else:
        return 1

In [41]:
def isCharacterExist(ch_set,ch1,ch2):
    if(ch1 in ch_set and ch2 in ch_set):
        return True
    else:
        return False

In [42]:
def getRowsWithMutipleCharacters(df):
    df_ch_co_occur = df.loc[df.characterId != -1]\
    .groupby('sentenceID')['characterId']\
    .apply(set).reset_index(name='characters')

    df_ch_co_occur['n_chars'] = df_ch_co_occur.characters.apply(len)
    return df_ch_co_occur.loc[df_ch_co_occur.n_chars != 1].reset_index(drop=True)

In [43]:
def getRowsWithTwoCharacters(df_co_occur,ch1,ch2):
    col_name = 'ch_' + str(ch1) + '_' + 'ch_' + str(ch2)
    df_ch_co_occur[col_name] = df_ch_co_occur.apply\
    (lambda row: isCharacterExist(row['characters'],ch1,ch2), axis=1)
    df_ch1_ch2 = df_ch_co_occur.loc[df_ch_co_occur[col_name] == True].reset_index(drop=True)
    return df_ch1_ch2.drop(columns=[col_name])

In [44]:
def idToName(ch_id,dict_id_char):
    return dict_id_char[ch_id]

In [45]:
def validation(df_val,df_close_chars,df_char_int,df_ch_rel):
    data_character_list = list(df_val.Character)
    model_character_list = list(df_close_chars.Character)
    acc1 = retMatchAccuracy(data_character_list,model_character_list)
    
    df_merge = pd.merge(df_val, df_close_chars, on='Character')
    zip1 = list(zip(df_merge.Close_Character_1,df_merge.Close_Character_2,df_merge.Close_Character_3))
    zip2 = list(zip(df_merge.ch1_id,df_merge.ch2_id,df_merge.ch3_id))
    acc2,n = retMatchAccuracyForTuple(zip1,zip2)
    
    df_merge = pd.merge(df_val, df_char_int, on='Character')
    acc3 = exactListMatch(df_merge.Character_intergrity,df_merge.label_senti_score)
    
    acc = 0
    for index,row in df_ch_rel.iterrows():
        flag = 0
        for i in range(n_close_chars):
            col = 'Close_Character_' + str(i+1)
            rel_col = 'Rel_Close_Character_' + str(i+1)
            df_filter = df_val.loc[df_val.Character == row.ch1_id].loc[df_val[col] == row.ch2_id]
            if(len(df_filter) != 0):
                df_filter = df_filter.reset_index(drop=True)
                true_label = df_filter.loc[0][rel_col]
                pred_label = row.label_senti_score
                if(true_label == pred_label):
                    acc += 1
                flag = 1
                break
    #     if(flag == 1):
    #         print(true_label,pred_label,row.ch1_id,row.ch2_id)

    acc4 = acc/n
    
    return acc1,acc2,acc3,acc4

## Main Logic

### Read Index CSV from Data Folder

In [46]:
df_data = pd.read_csv(path_data_index)
df_data.head()

Unnamed: 0,essay_id,topic,path,validation
0,hp_1,harry potter and the philosopher's stone,../data/harry_potter/stories/hp_1.txt,
1,hp_2,harry potter and the chamber of secrets,../data/harry_potter/stories/hp_2.txt,
2,hp_3,harry potter and the prisoner of azkaban,../data/harry_potter/stories/hp_3.txt,
3,hp_4,harry potter and the goblet of fire,../data/harry_potter/stories/hp_4.txt,
4,hp_5,harry potter and the order of the phoenix,../data/harry_potter/stories/hp_5.txt,


### Get the Title of the Book and Webscrape Data

In [47]:
book_name = df_data.topic.loc[cur_story_index]
book_web_scraped_data = %time webScrapeData(book_name)
len(book_web_scraped_data)

CPU times: user 11.3 s, sys: 553 ms, total: 11.9 s
Wall time: 36 s


377890

### Import Negative Labelled Data

In [48]:
df_neg_labelled = pd.read_excel(path_bing_neg_score,index=False)
df_neg_labelled.head()

Unnamed: 0,list_neg_text,neg_polarity
0,abnormal,1
1,abolish,1
2,abominable,3
3,abominably,3
4,abominate,3


In [49]:
df_neg_labelled.sample(n=200,replace=False).to_excel('for_kappa.xlsx',index=True)

### Execute Book-NLP

In [50]:
jsonPath,tokensPath,retval = %time getBookNLPTokens(df_data.path[cur_story_index],\
                                              df_data.essay_id[cur_story_index],True)
if(retval != 0):
    print('Error running Book-NLP... Exiting Now\nReturn Value:',retval)
    sys.exit()
    
jsonPath,tokensPath

CPU times: user 16.7 ms, sys: 52 ms, total: 68.7 ms
Wall time: 8min 1s


('../dependencies/book_nlp_output/hp_7/book.id.book',
 '../dependencies/book_nlp_output/hp_7.tokens')

### Read Book-NLP Json

In [51]:
json_book_data = %time readJsonFile(jsonPath)
dict_id_char = %time returnIDNameDict(json_book_data)

CPU times: user 23.9 ms, sys: 31.9 ms, total: 55.8 ms
Wall time: 84.6 ms
CPU times: user 159 µs, sys: 36 µs, total: 195 µs
Wall time: 201 µs


### Import Book-NLP Token File

In [52]:
df = %time pd.read_csv(tokensPath,sep='\t',engine='python',quoting=csv.QUOTE_NONE, encoding='utf-8')
df.head()

CPU times: user 5.38 s, sys: 539 ms, total: 5.92 s
Wall time: 7.75 s


Unnamed: 0,paragraphId,sentenceID,tokenId,beginOffset,endOffset,whitespaceAfter,headTokenId,originalWord,normalizedWord,lemma,pos,ner,deprel,inQuotation,characterId,supersense
0,0,0,0,0,7,S,8,Chapter,Chapter,Chapter,NNP,MISC,nn,O,-1,B-noun.group
1,0,0,1,8,11,S,0,One,One,one,CD,NUMBER,num,O,-1,O
2,0,0,2,12,15,S,6,The,The,the,DT,MISC,det,O,-1,O
3,0,0,3,16,20,S,6,Dark,Dark,dark,JJ,MISC,amod,O,-1,O
4,0,0,4,21,25,S,6,Lord,Lord,Lord,NNP,MISC,nn,O,-1,B-noun.person


### Pre-Compute Character Distance Matrix

In [53]:
df_ch = %time getCharacterDF(df)
character_distance_matrix = %time getCharacterDistanceMatrix(df_ch)

CPU times: user 45.2 ms, sys: 5.98 ms, total: 51.2 ms
Wall time: 60.4 ms
CPU times: user 3min 4s, sys: 1.22 s, total: 3min 6s
Wall time: 3min 39s


### Find Top N Characters and Top M Close Characters for each

In [54]:
dict_closest_characters = %time getTopNCloseCharacters(df,df_ch,json_book_data,\
                                                       character_distance_matrix,n_top_chars,n_close_chars)
# transformDictIDToName(dict_closest_characters,dict_id_char,False)
df_close_chars = pd.DataFrame.from_dict(dict_closest_characters)\
.transpose()\
.reset_index()\
.rename(index=str, columns={"index": "Character"})\
.rename(index=int, columns={0: 'ch1_id', 1: 'ch2_id', 2: 'ch3_id'})

df_close_chars['character_name'] = df_close_chars.apply(lambda row: idToName(row['Character'], dict_id_char), axis=1)
df_close_chars['ch1_name'] = df_close_chars.apply(lambda row: idToName(row['ch1_id'], dict_id_char), axis=1)
df_close_chars['ch2_name'] = df_close_chars.apply(lambda row: idToName(row['ch2_id'], dict_id_char), axis=1)
df_close_chars['ch3_name'] = df_close_chars.apply(lambda row: idToName(row['ch3_id'], dict_id_char), axis=1)

df_close_chars

CPU times: user 46.7 ms, sys: 6.32 ms, total: 53 ms
Wall time: 108 ms


Unnamed: 0,Character,ch1_id,ch2_id,ch3_id,character_name,ch1_name,ch2_name,ch3_name
0,124,174,25,121,Ginny,Mrs. Weasley,Hogwarts,Hagrid
1,112,27,59,78,Kreacher,Sirius,Mundungus,Regulus
2,80,25,32,100,Luna,Hogwarts,Fred,Fleur
3,121,174,124,119,Hagrid,Mrs. Weasley,Ginny,Voldemort
4,155,25,119,124,Snape,Hogwarts,Voldemort,Ginny
5,119,53,95,170,Voldemort,Harry,Dumbledore,Ron
6,95,33,170,53,Dumbledore,Hermione,Ron,Harry
7,170,33,53,95,Ron,Hermione,Harry,Dumbledore
8,33,170,53,95,Hermione,Ron,Harry,Dumbledore
9,53,33,170,95,Harry,Hermione,Ron,Dumbledore


### Find Integrity Analysis on the Top N Characters

In [55]:
ch_senti_score = {}
for ch,close_ch in dict_closest_characters.items():
    feature_text = %time getFeatureText(json_book_data,book_web_scraped_data,ch,dict_id_char,book_name)
    nnp_count = getNNPCount(json_book_data,ch)
    ch_senti_score[ch] = getSentimentScore(feature_text,df_neg_labelled,nnp_count)
    
df_char_int = pd.DataFrame(columns=['Character', 'senti_score'])
for key,val in ch_senti_score.items():
    df_char_int.loc[len(df_char_int)] = [key,val]
df_char_int['ch1_name'] = df_char_int.apply(lambda row: idToName(row['Character'], dict_id_char), axis=1)
df_char_int['norm_senti_score'] = df_char_int.apply(lambda row: normalizeSentiScore(row['senti_score'], 1.0/sum(df_char_int.senti_score)), axis=1)
df_char_int['label_senti_score'] = df_char_int.apply(lambda row: getLabelFromSentiScore(row['norm_senti_score'], 1.0/len(df_char_int.senti_score)), axis=1)
df_char_int

CPU times: user 20.4 s, sys: 865 ms, total: 21.3 s
Wall time: 56.1 s
CPU times: user 12.1 s, sys: 266 ms, total: 12.4 s
Wall time: 38 s
CPU times: user 12.4 s, sys: 236 ms, total: 12.6 s
Wall time: 38.4 s
CPU times: user 20 s, sys: 753 ms, total: 20.7 s
Wall time: 45.8 s
CPU times: user 10.2 s, sys: 629 ms, total: 10.8 s
Wall time: 27.7 s
CPU times: user 14.9 s, sys: 839 ms, total: 15.8 s
Wall time: 42.2 s
CPU times: user 15.3 s, sys: 1.13 s, total: 16.4 s
Wall time: 44.1 s
CPU times: user 11.3 s, sys: 673 ms, total: 12 s
Wall time: 51.6 s
CPU times: user 16.1 s, sys: 1.28 s, total: 17.4 s
Wall time: 1min 3s
CPU times: user 42.4 s, sys: 3.28 s, total: 45.7 s
Wall time: 1min 21s


Unnamed: 0,Character,senti_score,ch1_name,norm_senti_score,label_senti_score
0,124.0,1.276836,Ginny,0.0865,1
1,112.0,1.782123,Kreacher,0.120731,0
2,80.0,1.370968,Luna,0.092877,1
3,121.0,1.3125,Hagrid,0.088916,1
4,155.0,1.578059,Snape,0.106906,0
5,119.0,1.705411,Voldemort,0.115534,0
6,95.0,1.448845,Dumbledore,0.098153,1
7,170.0,1.495238,Ron,0.101296,0
8,33.0,1.332767,Hermione,0.090289,1
9,53.0,1.458388,Harry,0.098799,1


### Find Nature of Character Relationships

In [59]:
df_ch_co_occur = getRowsWithMutipleCharacters(df)
df_ch_rel = pd.DataFrame(columns=['ch1_id', 'ch2_id', 'senti_score'])

for ch1,close_ch in dict_closest_characters.items():
    for ch2 in close_ch:
        list_senti_score = []
        df_ch1_ch2 = getRowsWithTwoCharacters(df_ch_co_occur,ch1,ch2)
        list_sent_id = getUnqColVals(df_ch1_ch2,'sentenceID')
        for sent_id in list_sent_id:
            sentence = getSentTextById(df=df,\
                            list_sent_id=[sent_id],\
                            isStr=True,\
                            keep_pos=None,\
                            is_lemmatize=False,\
                            is_remove_stop_words=False)
            
            nltk_sent_analysis = nltk_senti_analyzer.polarity_scores(sentence)
            list_senti_score.append(nltk_sent_analysis.get('compound'))
            
        df_ch_rel.loc[len(df_ch_rel)] = [ch1,ch2,np.mean(list_senti_score)]
        
df_ch_rel['ch1_name'] = df_ch_rel.apply(lambda row: idToName(row['ch1_id'], dict_id_char), axis=1)
df_ch_rel['ch2_name'] = df_ch_rel.apply(lambda row: idToName(row['ch2_id'], dict_id_char), axis=1)
df_ch_rel['norm_senti_score'] = [(float(i)/max(df_ch_rel.senti_score)) for i in df_ch_rel.senti_score]
threshold = np.mean(df_ch_rel['norm_senti_score']) - (np.std(df_ch_rel['norm_senti_score'])/2)
df_ch_rel['label_senti_score'] = df_ch_rel.norm_senti_score.apply(lambda x: 0 if x < threshold else 1)
df_ch_rel

Unnamed: 0,ch1_id,ch2_id,senti_score,ch1_name,ch2_name,norm_senti_score,label_senti_score
0,124.0,174.0,0.02328,Ginny,Mrs. Weasley,0.034336,1
1,124.0,25.0,0.105933,Ginny,Hogwarts,0.156244,1
2,124.0,121.0,0.04214,Ginny,Hagrid,0.062153,1
3,112.0,27.0,-0.299583,Kreacher,Sirius,-0.441863,0
4,112.0,59.0,-0.163771,Kreacher,Mundungus,-0.241551,0
5,112.0,78.0,0.3674,Kreacher,Regulus,0.541888,1
6,80.0,25.0,0.678,Luna,Hogwarts,1.0,1
7,80.0,32.0,0.4019,Luna,Fred,0.592773,1
8,80.0,100.0,0.049388,Luna,Fleur,0.072843,1
9,121.0,174.0,-0.212967,Hagrid,Mrs. Weasley,-0.31411,0


### Validate the Output

In [57]:
#Validation
path_valid = df_data.loc[cur_story_index].validation
if(path_valid is not np.nan):
    df_val = pd.read_excel(path_valid)
    ac1,ac2,ac3,ac4 = validation(df_val,df_close_chars,df_char_int,df_ch_rel)
    print('Accuracy of Identifying Top N Important Characters:',ac1)
    print('Accuracy of Identifying Top k Close Relationships for Every Character:',ac2)
    print('Accuracy of Classsifying the Integrity of Every Character:',ac3)
    print('Accuracy of Classsifying the Nature of Every Relationship:',ac4)

Accuracy of Identifying Top N Important Characters: 1.0
Accuracy of Identifying Top k Close Relationships for Every Character: 0.5757575757575757
Accuracy of Classsifying the Integrity of Every Character: 0.9090909090909091
Accuracy of Classsifying the Nature of Every Relationship: 0.9473684210526315


### Export Output

In [58]:
df_close_chars.to_excel(path_output_ch,index=False)
df_char_int.to_excel(path_output_int,index=False)
df_ch_rel.to_excel(path_output_ch_rel,index=False)