<a href="https://colab.research.google.com/github/spatank/CIS-700/blob/master/Project/log_odds_canon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Google Drive Initialization and Imports

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/Drive', force_remount = True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/Drive


In [0]:
import os
os.chdir('Drive/My Drive/CIS-700')

In [3]:
!ls Data/

04212020
04292020
Canon
hpcanon_sr_narrative_chains_counts_NNPs.txt
hpc_raw_text.txt
hpff_raw_text_reduced.txt
hpff_raw_text.txt
hpff_sr_narrative_chains_counts_NNPs_new.txt


In [4]:
import util

import pandas as pd

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 
from nltk.util import ngrams

import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

from textblob import TextBlob # for sentiment analysis

import networkx as nx

import scipy.io as sio

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Extract Event Chains by Character

In [5]:
data_path = 'Data/hpcanon_sr_narrative_chains_counts_NNPs.txt'
data = util.load_json(data_path)
all_verbs = []
verbs_dict = {}
for cluster, narrative_chains in data.items(): 
  tag = nltk.tag.pos_tag([cluster])[0][1] # get cluster tag
  # filter for noun phrases
  if tag in ['NN', 'NNP', 'NNS']: 
    verb_chains = []
    # convert list of actions by the character into a Pandas DataFrame
    df = pd.DataFrame.from_records(narrative_chains)  
    # split the ['story','chapter'] column into 'story' and 'chapter' columns
    df[['story','chapter']] = pd.DataFrame(df[0].values.tolist(), index = df.index)
    # remove the now redundant ['story','chapter'] column
    df.drop(df.columns[0], axis = 1, inplace = True)
    # move 'story' and 'chapter' columns to the front for ease of viewing
    cols = list(df)
    # grab the 'chapter' column and place at the front of the DataFrame
    cols.insert(0, cols.pop(cols.index('chapter')))
    df = df.loc[:, cols]
    # now grab the 'story' column and place at the front of the DataFrame
    cols.insert(0, cols.pop(cols.index('story')))
    df = df.loc[:, cols]
    # group the DataFrame for this character by the story number
    story_groups = df.groupby(by = 'story')
    # now iterate over the grouped stories to group by chapter number
    for story_idx in story_groups.groups:
      # get the story group as a DataFrame to allow grouping by chapter
      story = story_groups.get_group(story_idx)
      # print(story)
      chapter_groups = story.groupby(by = 'chapter')
      for chapter_idx in chapter_groups.groups:
        # get the chapter group as a DataFrame 
        chapter = chapter_groups.get_group(chapter_idx)
        # print(chapter)
        chapter_action_chain = []
        for idx, row in chapter.iterrows():
          # column titled '2' corresponds to the verb, check that it exists
          # if it is present in the DataFrame, check that it is not None
          # 0-th entry corresponds to the semantic role
          if 2 in row.index:
            if row[2] is not None:
              if row[2][0] == 'B-V': 
                verb = lemmatizer.lemmatize(row[2][1], 'v') 
                chapter_action_chain.append(verb)
                all_verbs.append(verb)
        # a single action is uninteresting from a narrative chain perspective
        if len(chapter_action_chain) > 1:
          verb_chains.append(chapter_action_chain)
    verbs_dict[cluster] = verb_chains

File Path:  Data/hpcanon_sr_narrative_chains_counts_NNPs.txt


In [6]:
c = Counter(all_verbs)
print(c)

Counter({'be': 29, 'think': 11, 'say': 9, 'leave': 8, 'go': 8, 'stand': 7, 'give': 7, 'look': 6, 'get': 6, 'fell': 5, 'star': 5, 'like': 5, 'know': 4, 'rise': 4, 'felt': 4, 'make': 4, 'watch': 3, 'stride': 3, 'lay': 3, 'put': 3, 'saw': 3, 'pull': 3, 'raise': 3, 'have': 3, 'come': 3, 'see': 3, 'take': 3, 'become': 3, 'hang': 3, 'want': 2, 'press': 2, 'realize': 2, 'hide': 2, 'expect': 2, 'set': 2, 'walk': 2, 'follow': 2, 'meet': 2, 'recognize': 2, 'feel': 2, 'draw': 2, 'join': 2, 'whip': 2, 'climb': 2, 'hit': 2, 'wave': 2, 'hold': 2, 'hat': 2, 'knock': 2, 'throw': 2, 'close': 2, 'do': 2, 'twist': 2, 'care': 2, 'rumble': 2, 'force': 2, 'blink': 1, 'roll': 1, 'wake': 1, 'remember': 1, 'find': 1, 'heave': 1, 'faint': 1, 'slump': 1, 'grin': 1, 'wrench': 1, 'storm': 1, 'fume': 1, 'hear': 1, 'check': 1, 'choose': 1, 'buy': 1, 'clamber': 1, 'hurry': 1, 'tell': 1, 'unclench': 1, 'spin': 1, 'sleep': 1, 'doze': 1, 'endure': 1, 'traipse': 1, 'appreciate': 1, 'understand': 1, 'remain': 1, 'squeeze'

In [7]:
len(all_verbs)

334

In [8]:
len(np.unique(all_verbs))

175

In [0]:
# # most common verbs
# filtered_ten = [word for word, cnt in c.most_common(10)]
# filtered_ten

# least common verbs
# filtered_ten = c.most_common()[:-10:-1]
# filtered_ten

In [0]:
# new_list_verbs = [verb for verb in all_verbs if verb in filtered_ten]

In [0]:
# fig, ax = plt.subplots()
# n, bins, patches = ax.hist(new_list_verbs, bins = 'auto') 
# ax.tick_params(labelcolor = 'w', labelsize = 'large', width = 3)
# fig.tight_layout()
# # ax.set_title("Histogram of Verb Frequency")
# plt.savefig('most_common_hist.png')
# plt.show()

In [0]:
# g_global = nx.DiGraph()
# g_global.add_nodes_from(np.unique(all_verbs))

# for character, verb_chains in verbs_dict.items():
#   for verb_chain in verb_chains:
#     edges_between = list(ngrams(verb_chain, 2))
#     for edge in edges_between:
#       from_node = edge[0]
#       to_node = edge[1]
#       if from_node == to_node:
#         continue # skip a self-edge
#       if g_global.has_edge(from_node, to_node):
#         g_global[from_node][to_node]['weight'] += 1
#       else:
#         g_global.add_edge(from_node, to_node, weight = 1)

# nx.write_graphml(g_global, 'global_verb_network_ff.graphml')
# A = nx.adjacency_matrix(g_global, nodelist = np.unique(all_verbs), weight = 'weight')
# sio.savemat('global_A_large.mat', dict(A = A.todense()))

In [0]:
# all_graphs = {}

# for character, verb_chains in verbs_dict.items():
#   # get the verbs for this character
#   char_verbs = np.unique([verb for verb_chain in verb_chains for verb in verb_chain])
#   g = nx.DiGraph()
#   g.add_nodes_from(char_verbs)
#   # each value in the verbs_dict is a list of lists
#   # each sub-list is a chain of verbs 
#   for verb_chain in verb_chains:
#     edges_between = list(ngrams(verb_chain, 2))
#     for edge in edges_between:
#       from_node = edge[0]
#       to_node = edge[1]
#       if from_node == to_node:
#         continue # skip a self-edge
#       if g.has_edge(from_node, to_node):
#         g[from_node][to_node]['weight'] += 1
#       else:
#         g.add_edge(from_node, to_node, weight = 1)
#   all_graphs[character] = g

In [0]:
# draco_graph = all_graphs['Draco']
# nx.write_graphml(draco_graph, 'draco_verb_network_ff.graphml')
# nx.draw(draco_graph, with_labels = True)

In [0]:
# harry_graph = all_graphs['Harry']
# nx.write_graphml(harry_graph, 'harry_verb_network_ff.graphml')
# nx.draw(harry_graph, with_labels = True)

In [0]:
# hermione_graph = all_graphs['Hermione']
# nx.write_graphml(hermione_graph, 'hermione_verb_network_ff.graphml')
# nx.draw(hermione_graph, with_labels = True)

In [0]:
# ron_graph = all_graphs['Ron']
# nx.write_graphml(ron_graph, 'ron_verb_network_ff.graphml')
# nx.draw(ron_graph, with_labels = True)

In [0]:
# voldemort_graph = all_graphs['Voldemort']
# nx.write_graphml(voldemort_graph, 'voldemort_verb_network_ff.graphml')
# nx.draw(voldemort_graph, with_labels = True)

# Log-Odds

In [0]:
def get_sorted_log_odds(character_of_interest, n, verbs_dict):
  all_action_pairs = []
  for character, verb_chains in verbs_dict.items():
    if character == character_of_interest:
      continue
    for verb_chain in verb_chains:
      edges_between = list(ngrams(verb_chain, n))
      for edge in edges_between:
        all_action_pairs.append(edge)
  all_action_pairs = Counter(all_action_pairs)
  total_action_pairs = sum(all_action_pairs.values())

  all_COI_action_chains = verbs_dict[character_of_interest]
  all_COI_action_pairs = []
  for verb_chain in all_COI_action_chains:
    edges_between = list(ngrams(verb_chain, n))
    for edge in edges_between:
      all_COI_action_pairs.append(edge)
  all_COI_action_pairs = Counter(all_COI_action_pairs)
  total_COI_action_pairs = sum(all_COI_action_pairs.values())

  log_odds_data = {}

  for pair in all_COI_action_pairs.keys():

    count_pair_COI = all_COI_action_pairs[pair]
    log_prob_COI = np.log(count_pair_COI) - np.log(total_COI_action_pairs)

    count_pair_all = all_action_pairs[pair]
    log_prob_all = np.log(count_pair_all) - np.log(total_action_pairs)

    if log_prob_COI != 0:
      log_odds = log_prob_all - log_prob_COI
      log_odds_data[pair] = log_odds

  log_odds_tuples = [(pair, log_odds) for pair, log_odds in log_odds_data.items()]
  log_odds_tuples = sorted(log_odds_tuples, key = lambda x: x[1], reverse = True)

  return log_odds_tuples

In [20]:
character_of_interest = 'Voldemort'
n = 1 # order parameter for verb chain

log_odds = get_sorted_log_odds(character_of_interest, n, verbs_dict)

for i in range(0, 25):
  print(log_odds[i])

IndexError: ignored

In [21]:
character_of_interest = 'Hermione'
n = 1 # order parameter for verb chain

log_odds = get_sorted_log_odds(character_of_interest, n, verbs_dict)

for i in range(0, 25):
  print(log_odds[i])

(('think',), 0.08583143584978137)
(('give',), -0.06831924397747713)
(('like',), -0.4737843520856413)
(('leave',), -0.4737843520856413)
(('be',), -0.6814237168638857)
(('go',), -1.4546136050973675)
(('say',), -1.4546136050973675)
(('feel',), -1.860078713205532)
(('see',), -1.860078713205532)
(('realize',), -1.860078713205532)
(('saw',), -1.860078713205532)
(('draw',), -1.860078713205532)
(('twist',), -1.860078713205532)
(('walk',), -1.860078713205532)
(('take',), -1.860078713205532)
(('believe',), -inf)
(('do',), -inf)
(('clear',), -inf)
(('read',), -inf)
(('hop',), -inf)
(('meet',), -inf)
(('seize',), -inf)
(('glaze',), -inf)
(('hand',), -inf)
(('sit',), -inf)




In [22]:
character_of_interest = 'Ron'
n = 1 # order parameter for verb chain

log_odds = get_sorted_log_odds(character_of_interest, n, verbs_dict)

for i in range(0, 25):
  print(log_odds[i])

(('be',), 0.04255961441879563)
(('think',), -0.7841189587656721)
(('stand',), -1.3437347467010947)
(('like',), -1.3437347467010947)
(('make',), -1.6314168191528755)
(('give',), -1.8137383759468304)
(('stride',), -2.7300291078209855)
(('throw',), -2.7300291078209855)
(('come',), -2.7300291078209855)
(('become',), -2.7300291078209855)
(('unstick',), -inf)
(('stop',), -inf)
(('speak',), -inf)
(('Kreacher',), -inf)




IndexError: ignored

In [23]:
character_of_interest = 'Harry'
n = 1 # order parameter for verb chain

log_odds = get_sorted_log_odds(character_of_interest, n, verbs_dict)

for i in range(0, 25):
  print(log_odds[i])

(('give',), 1.3004784415350348)
(('get',), 0.8950133334268706)
(('be',), 0.3841877096608797)
(('raise',), 0.20186615286692522)
(('fell',), -0.08581591958485557)
(('leave',), -0.08581591958485557)
(('star',), -0.08581591958485557)
(('like',), -0.08581591958485557)
(('want',), -0.4912810276930202)
(('rise',), -0.4912810276930202)
(('realize',), -0.4912810276930202)
(('stride',), -0.4912810276930202)
(('expect',), -0.4912810276930202)
(('think',), -0.4912810276930202)
(('walk',), -0.4912810276930202)
(('make',), -0.4912810276930202)
(('recognize',), -0.4912810276930202)
(('feel',), -0.4912810276930202)
(('draw',), -0.4912810276930202)
(('saw',), -0.4912810276930202)
(('wave',), -0.4912810276930202)
(('stand',), -0.8967461358011848)
(('watch',), -1.1844282082529656)
(('lay',), -1.1844282082529656)
(('put',), -1.1844282082529656)


