<a href="https://colab.research.google.com/github/userName/Salience-Prediction/blob/main/get_stats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Get various statistics on complete InfoPop data

In [2]:
%%capture

import nltk
import json
import numpy as np
from tqdm import tqdm

# Fetching resources: the two NLTK data packages requested below are downloaded
# on every run of this cell; the %%capture magic at the top of the cell hides
# the download log.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# NOTE(review): `tokenizer` is loaded here but is not referenced anywhere else
# in the visible notebook (sentences are tokenized with nltk.word_tokenize).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Part-of-speech tags that get_content_ratio counts as "content" tokens.
# NOTE(review): 'UNK' is presumably a fallback for unknown tags -- confirm that
# nltk.pos_tag can actually emit it.
valid_POS = ['FW', 'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS',
             'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'UNK']
# Word forms excluded from the content count even when their tag is in
# valid_POS; get_content_ratio compares tokens against this list verbatim.
be_verbs = ['be', 'is', 'was', 'were']

def get_content_ratio(sentence):
    """Return the share of tokens in `sentence` that count as content words.

    The sentence is split with nltk.word_tokenize and tagged with
    nltk.pos_tag. A token is counted when its tag is listed in `valid_POS`
    and the token text is not listed in `be_verbs`.

    Args:
        sentence: The text to analyse.

    Returns:
        Number of content tokens divided by the total number of tokens.
        Returns 0.0 when tokenization yields no tokens, where the previous
        version raised ZeroDivisionError.
    """
    tokens = nltk.word_tokenize(sentence)
    if not tokens:
        # Nothing to tag, and dividing by len(tokens) would fail.
        return 0.0

    # No separate 'POS' exclusion is needed: 'POS' is not in valid_POS.
    # NOTE(review): the be_verbs comparison is case-sensitive, so capitalised
    # forms such as 'Is' or 'Was' are still counted -- confirm this is intended.
    count_content = sum(
        1
        for word, tag in nltk.pos_tag(tokens)
        if tag in valid_POS and word not in be_verbs
    )
    return count_content / len(tokens)

In [3]:
# Directory that holds the per-split JSON files read in the next cell
# (<data_dir>/<split>.json). The path is relative to the working directory.
# NOTE(review): presumably requires Google Drive to be mounted at ./drive --
# the mount step is not part of this notebook; confirm before running.
data_dir = 'drive/My Drive/store/InfoPop'

In [4]:
# Concatenate every split into one list so the statistics below describe the
# complete dataset.
splits = ['train', 'val', 'test']

data = []

for split in splits:
  # The files are only read, so open them read-only ('r+' also demanded write
  # permission and fails on read-only storage). The encoding is pinned so the
  # result does not depend on the platform default.
  with open(data_dir + '/' + split + '.json', 'r', encoding='utf-8') as f:
    data.extend(json.load(f))

In [5]:
# Size of the combined dataset (all splits together).
print(f'Number of datapoints: {len(data)}')

Number of datapoints: 51770


In [6]:
# Number of labelled sentences per article, in dataset order.
sentence_counts = [len(unit['sent_labels']) for unit in data]

with open('sent_counts.json', 'w+') as f:
  json.dump(sentence_counts, f)

# Class frequencies: ten buckets of width 10. Bucket 0 holds every count up to
# and including 10, bucket i (i >= 1) holds counts in (10*i, 10*(i+1)], and
# counts above 100 are not tallied in any bucket.
freqs = [0] * 10

for item in sentence_counts:
  if item <= 100:
    bucket = max((item - 1) // 10, 0)
    freqs[bucket] += 1

print('Class frequencies:')
print(freqs)

# Later cells use numpy reductions on this name, so convert it to an array here.
sentence_counts = np.asarray(sentence_counts)
print()
print(f'Sum of class frequencies: {sum(freqs)}')

Class frequencies:
[3103, 11822, 12110, 10041, 6267, 3675, 2096, 1272, 839, 545]

Sum of class frequencies: 51770


In [7]:
# Summary statistics over `sentence_counts` (one entry per article).
print('Statistics on number of sentences in each article:')
print('- Minimum: ' + str(np.min(sentence_counts)))
# Label corrected: it was previously misspelled 'Maxmimum'.
print('- Maximum: ' + str(np.max(sentence_counts)))
print('- Average: ' + str(np.mean(sentence_counts)))
print()
print('Total number of sentences within the dataset: ' + str(np.sum(sentence_counts)))

Statistics on number of sentences in each article:
- Minimum: 3
- Maxmimum: 100
- Average: 33.06722039791385

Total number of sentences within the dataset: 1711890


In [8]:
# Per-sentence measurements, aligned index-by-index across the three lists.
sent_token_counts, sent_scores_list, sent_lexical_densities = [], [], []

# One total per article: the sum of its sentences' token counts.
document_token_counts = []

for article in tqdm(data):
  tokens_in_article = 0

  for labelled_sentence in article['sent_labels']:
    # Each entry is indexed positionally: [0] is the text, [1] is its score.
    text, score = labelled_sentence[0], labelled_sentence[1]

    density = get_content_ratio(text)
    # NOTE(review): get_content_ratio tokenizes the same text internally, so
    # every sentence is tokenized twice in this loop.
    n_tokens = len(nltk.word_tokenize(text))

    sent_token_counts.append(n_tokens)
    sent_scores_list.append(score)
    sent_lexical_densities.append(density)
    tokens_in_article += n_tokens

  document_token_counts.append(tokens_in_article)

100%|██████████| 51770/51770 [38:54<00:00, 22.18it/s]


In [9]:
# Saving datapoints

# Each per-sentence list is dumped to its own JSON file in the working directory.
outputs = (
  ('scores.json', sent_scores_list),
  ('lengths.json', sent_token_counts),
  ('lex_den.json', sent_lexical_densities),
)

for file_name, values in outputs:
  with open(file_name, 'w+') as f:
    json.dump(values, f)

In [10]:
# Sanity checks

def div(a, b):
  """Divide each element of `a` by the element of `b` at the same index.

  Iteration is driven by `a`: surplus elements of `b` are ignored, and a `b`
  shorter than `a` raises IndexError.

  Returns:
    A new list with one quotient per element of `a`.
  """
  return [numerator / b[position] for position, numerator in enumerate(a)]

# The three per-sentence lists must line up before they can be correlated.
assert len(sent_scores_list) == len(sent_token_counts) == len(sent_lexical_densities)

print('Correlation coefficient between: ')

# np.corrcoef returns a 2x2 matrix; [0, 1] is the coefficient between the two inputs.
score_vs_length = np.corrcoef(sent_scores_list, sent_token_counts)[0, 1]
print('- Scores and sentence lengths:', score_vs_length)

# "Normalized" score = score divided by the sentence's token count.
normalized_scores = div(sent_scores_list, sent_token_counts)
normalized_vs_length = np.corrcoef(sent_token_counts, normalized_scores)[0, 1]
print('- Normalized scores and sentence lengths:', normalized_vs_length)

score_vs_density = np.corrcoef(sent_scores_list, sent_lexical_densities)[0, 1]
print('- Scores and sentence lexical densities:', score_vs_density)

Correlation coefficient between: 
- Scores and sentence lengths: 0.1680641497724356
- Normalized scores and sentence lengths: -0.04397969664402135
- Scores and sentence lexical densities: 0.059994910660698175


In [11]:
# Stats on tokens within sentences
# Summary statistics over `sent_token_counts` (one entry per sentence).
print('Statistics on number of tokens in each sentence:')
print('- Minimum: ' + str(np.min(sent_token_counts)))
# Label corrected: it was previously misspelled 'Maxmimum'.
print('- Maximum: ' + str(np.max(sent_token_counts)))
print('- Average: ' + str(np.mean(sent_token_counts)))
print()
print('Total number of tokens within the dataset: ' + str(np.sum(sent_token_counts)))

Statistics on number of tokens in each sentence:
- Minimum: 1
- Maxmimum: 40
- Average: 18.23310025761001

Total number of tokens within the dataset: 31213062


In [12]:
# Stats on tokens within documents
# Summary statistics over `document_token_counts` (one entry per article).
print('Statistics on number of tokens in each document:')
print('- Minimum: ' + str(np.min(document_token_counts)))
# Label corrected: it was previously misspelled 'Maxmimum'.
print('- Maximum: ' + str(np.max(document_token_counts)))
print('- Average: ' + str(np.mean(document_token_counts)))
print()
print('Total number of tokens within the dataset: ' + str(np.sum(document_token_counts)))

Statistics on number of tokens in each document:
- Minimum: 15
- Maxmimum: 2516
- Average: 602.91794475565

Total number of tokens within the dataset: 31213062


In [13]:
# Getting stats on the various sourcing websites
# One URL per article, in dataset order.
urls = [unit['url'] for unit in data]

primary_sites = ['hindustantimes', 'timesnownews', 'theguardian', 'techcrunch', 'livemint', 'crictracker', 'phys',
                 'inshorts', 'cnn', 'nytimes', 'huffingtonpost', 'foxnews', 'reuters', 'usatoday', 'npr', 'latimes',
                 'nbcnews', 'cbsnews', 'nypost', 'nydailynews', 'abcnews.go', 'newsweek', 'denverpost',
                 'washington.cbslocal', 'sanfrancisco.cbslocal', 'chicagotribune', 'cbslocal']

# Longest names first, so that a name containing a shorter one (e.g.
# 'washington.cbslocal' vs 'cbslocal') is tried before the shorter one.
# list.sort is stable, so names of equal length keep their listed order.
primary_sites.sort(key = len, reverse = True)

# Article count per site; key order follows the sorted site list.
freq_count = {site: 0 for site in primary_sites}

unfits = []            # URLs that contain none of the site names
multiple_matches = []  # URLs that contain more than one site name

for url in urls:
  # A site "matches" when its name occurs anywhere in the URL string.
  matched_sites = [site for site in primary_sites if site in url]

  if not matched_sites:
    unfits.append(url)
    continue

  # Only the first (i.e. longest) matching name is credited with the article.
  freq_count[matched_sites[0]] += 1

  # Each URL entry is recorded at most once here, however many names it
  # contains. The earlier version deduplicated by comparing against the
  # previously recorded URL, which under-counted when two consecutive
  # entries carried the same URL string.
  if len(matched_sites) > 1:
    multiple_matches.append(url)

print('Number of URLs having multiple matches: ' + str(len(multiple_matches)))
print('Number of URLs that do not fit it into any bucket: ' + str(len(unfits)))

Number of URLs having multiple matches: 319
Number of URLs that do not fit it into any bucket: 0


In [14]:
print('Number of articles sourced from each website: ')

website_article_counts = []
zero_maps = []

# Sites with at least one article are printed and counted; sites with no
# articles are only collected by name for the summary below.
for key, article_total in freq_count.items():
  if article_total != 0:
    print(f'- {key}: {article_total}')
    website_article_counts.append(article_total)
  else:
    zero_maps.append(key)

print()
print(f'Number of website(s) from which no URL was sourced: {len(zero_maps)}')
print('Website(s) from which no URL was sourced:', zero_maps)
print()

# Convert to an array for the numpy reductions below.
website_article_counts = np.asarray(website_article_counts)
print(f'Number of websites from which URLs were sourced: {len(website_article_counts)}')
print('Average number of URLs sourced from a single website:', np.mean(website_article_counts))
print('Sanity check ~ Total number of sourced URLs:', np.sum(website_article_counts))

Number of articles sourced from each website: 
- sanfrancisco.cbslocal: 146
- washington.cbslocal: 40
- hindustantimes: 371
- huffingtonpost: 75
- chicagotribune: 1038
- timesnownews: 189
- theguardian: 142
- crictracker: 44
- nydailynews: 1453
- techcrunch: 1064
- abcnews.go: 1490
- denverpost: 897
- livemint: 411
- usatoday: 6570
- newsweek: 1806
- cbslocal: 2522
- nytimes: 10677
- foxnews: 2388
- reuters: 2616
- latimes: 1917
- nbcnews: 2346
- cbsnews: 4979
- nypost: 2509
- phys: 1190
- cnn: 1638
- npr: 3252

Number of website(s) from which no URL was sourced: 1
Website(s) from which no URL was sourced: ['inshorts']

Number of websites from which URLs were sourced: 26
Average number of URLs sourced from a single website: 1991.1538461538462
Sanity check ~ Total number of sourced URLs: 51770


In [15]:
# Thank You ^_^