In [1]:
# import libraries
import pandas as pd
import json
import nltk
import re

In [2]:
# Load data
fileName = './bbc_news_list_uk.json'
cols = ['news_post_date', 'language', 'content']

# JSON text is UTF-8; state it explicitly so the result does not depend on
# the platform's default encoding.
with open(fileName, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Keep only the columns used by the analysis below.
df = pd.DataFrame(data=data, columns=cols)
print('Total data:', len(df))
df.head()

Total data: 15825


Unnamed: 0,news_post_date,language,content
0,2010-08-06T02:46:40.000Z,en_GB,The heroin substitute methadone can be used as...
1,2010-10-12T08:54:42.000Z,en_GB,The eldest son of North Korean leader Kim Jong...
2,2010-10-27T17:35:24.000Z,en_GB,Seven oil paintings created by notorious gangs...
3,2010-10-08T13:21:50.000Z,en_GB,A 20-tonne bridge is being hauled into place b...
4,2010-09-22T17:35:34.000Z,en_GB,The final words written by a guardsman killed ...


In [3]:
# Clean data: keep only rows posted in 2010, written in British English,
# and without any missing value.
is_2010 = df['news_post_date'].str[:4] == '2010'
is_british_english = df['language'] == 'en_GB'
df = df[is_2010 & is_british_english].dropna(axis=0)
print('Total data:', len(df))

Total data: 15819


In [5]:
# Two sets of words that signal the presence of a man or a woman in a sentence,
# based on Danielle Sucher's https://github.com/DanielleSucher/Jailbreak-the-Patriarchy
male_words = {
    'guy', 'spokesman', 'chairman', "men's", 'men', 'him', "he's", 'his',
    'boy', 'boyfriend', 'boyfriends', 'boys', 'brother', 'brothers',
    'dad', 'dads', 'dude', 'father', 'fathers', 'fiance',
    'gentleman', 'gentlemen', 'god', 'grandfather', 'grandpa', 'grandson',
    'groom', 'he', 'himself', 'husband', 'husbands', 'king', 'male', 'man',
    'mr', 'nephew', 'nephews', 'priest', 'prince', 'son', 'sons',
    'uncle', 'uncles', 'waiter', 'widower', 'widowers',
}
female_words = {
    'heroine', 'spokeswoman', 'chairwoman', "women's", 'actress', 'women',
    "she's", 'her', 'aunt', 'aunts', 'bride', 'daughter', 'daughters',
    'female', 'fiancee', 'girl', 'girlfriend', 'girlfriends', 'girls',
    'goddess', 'granddaughter', 'grandma', 'grandmother', 'herself',
    'ladies', 'lady', 'mom', 'moms', 'mother', 'mothers', 'mrs', 'ms',
    'niece', 'nieces', 'priestess', 'princess', 'queens', 'she',
    'sister', 'sisters', 'waitress', 'widow', 'widows', 'wife', 'wives',
    'woman',
}

In [6]:
# function to classify a sentence into gender classes
def gender_the_sentence(sentence_words):
    """Return the gender class of a sentence given its words.

    The result is 'male' or 'female' when the sentence shares words with only
    ``male_words`` or only ``female_words``, 'both' when it shares words with
    each of them, and 'none' when it shares words with neither.
    """
    has_male = len(male_words.intersection(sentence_words)) > 0
    has_female = len(female_words.intersection(sentence_words)) > 0

    # (male word present, female word present) -> label
    labels = {
        (True, False): 'male',
        (False, True): 'female',
        (True, True): 'both',
        (False, False): 'none',
    }
    return labels[(has_male, has_female)]

In [7]:
# function to detect and count proper nouns
def is_it_proper(word):
    """Record whether ``word`` starts with an upper-case or lower-case character.

    The tally is stored in the module-level ``proper_nouns`` dict, keyed by the
    lower-cased word, as ``{'upper': n, 'lower': m}`` (a key is only present
    once that case has been seen).

    A word counts as 'upper' only when its first character is a cased
    upper-case letter. Tokens that start with a digit or an underscore are
    counted as 'lower', so they are no longer mistaken for capitalized words.
    An empty string is ignored.
    """
    if not word:
        return

    case = 'upper' if word[0].isupper() else 'lower'

    # setdefault creates the per-word tally on first sight, so no exception
    # handling is needed for unseen words.
    counts = proper_nouns.setdefault(word.lower(), {})
    counts[case] = counts.get(case, 0) + 1

In [8]:
# function to count sentences and words by gender
def increment_gender(sentence_words,gender):
    """Add one sentence and its words to the tallies of ``gender``.

    Updates the module-level ``sentence_counter`` (+1), ``word_counter``
    (+ number of items in ``sentence_words``) and ``word_freq`` (+1 for each
    item in ``sentence_words``) under the key ``gender``.
    """
    sentence_counter[gender] = sentence_counter[gender] + 1
    word_counter[gender] = word_counter[gender] + len(sentence_words)

    freq_for_gender = word_freq[gender]
    for token in sentence_words:
        freq_for_gender[token] = freq_for_gender.get(token, 0) + 1

In [9]:
# variables declaration
sexes = ['male', 'female', 'none', 'both']

# per-class tallies, all starting at zero
sentence_counter = dict.fromkeys(sexes, 0)
word_counter = dict.fromkeys(sexes, 0)

# one independent word -> count dict per class
word_freq = dict((sex, {}) for sex in sexes)

# lower-cased word -> capitalization tally
proper_nouns = dict()

In [10]:
# Processing data

# NOTE(review): this unpickles a tokenizer model from a local path; only load
# a pickle file that comes from a trusted source.
tokenizer = nltk.data.load('./nltk_data/tokenizers/punkt/english.pickle')

# Raw string: '\W' inside a normal string literal is an invalid escape sequence.
non_word_re = re.compile(r'\W+')

spill_senteces = []  # not used in this cell
for content in df['content']:

    # Fix sentence format by adding space after period.
    # NOTE(review): this also inserts a space inside tokens such as "3.5";
    # confirm that is acceptable for the sentence splitter.
    content = content.replace(".", ". ")

    # Split into sentences
    sentences = tokenizer.tokenize(content)

    for sentence in sentences:
        # word tokenize and strip punctuation (empty strings are dropped)
        sentence_words = [token for token in non_word_re.split(sentence) if token]

        # Figure out how often each word is capitalized; the first word is
        # skipped because sentences start with a capital letter anyway.
        for word in sentence_words[1:]:
            is_it_proper(word)

        # Lower case it and keep each word once per sentence
        sentence_words = set(w.lower() for w in sentence_words)

        # Figure out if there are gendered words in the sentence by computing
        # the length of the intersection of the sets
        gender = gender_the_sentence(sentence_words)

        # Increment some counters
        increment_gender(sentence_words, gender)

In [11]:
# Create a set of proper nouns: words seen capitalized more than half of the time
proper_nouns = {
    word
    for word, tally in proper_nouns.items()
    if tally.get('upper', 0) / (tally.get('upper', 0) + tally.get('lower', 0)) > 0.50
}

In [12]:
# Create a list from the 1000 most common words of the female word dictionary
# and the 1000 most common words of the male word dictionary
female_freq = word_freq['female']
male_freq = word_freq['male']
top_female = sorted(female_freq, key=female_freq.get, reverse=True)[:1000]
top_male = sorted(male_freq, key=male_freq.get, reverse=True)[:1000]

# Pool both rankings, then drop the gender marker words and the proper nouns
common_words = set(top_female + top_male)
common_words = list(common_words - male_words - female_words - proper_nouns)

In [13]:
# Share (between 0 and 1) of how likely the word appears in a male-subjected sentence
def _male_share(word):
    """Return the male rate of ``word`` divided by the sum of its female and male rates.

    A rate is the word's count for a gender divided by that gender's total
    word count, which compensates for the different amount of text per gender.
    """
    male_rate = word_freq['male'].get(word, 0) / word_counter['male']
    female_rate = word_freq['female'].get(word, 0) / word_counter['female']
    return male_rate / (female_rate + male_rate)


male_percent = {word: _male_share(word) for word in common_words}

There is likely an imbalance in how many words are written about men and women. If a word appears in 10 male-subject sentences and 10 female-subject sentences, that could be taken as a sign of parity, but not if there are a total of 20 female-subject sentences (50%) and 100 male-subject sentences (10%). Therefore, the formula in the cell above divides each word count by the total number of words counted for that gender. Because `male_percent` is a fraction between 0 and 1, the ratio of how likely the word is to appear in a male-subject sentence versus a female-subject sentence can be calculated as male_percent / (1 - male_percent).

In [14]:
# print sentence analysis result
men_sentences = sentence_counter['male']
women_sentences = sentence_counter['female']
all_sentences = men_sentences + women_sentences + sentence_counter['both'] + sentence_counter['none']
gendered_share = 100 * (men_sentences + women_sentences) / all_sentences

print('%.1f%% gendered sentences' % gendered_share)
print('%s sentences about men.' % men_sentences)
print('%s sentences about women.' % women_sentences)
print('%.1f sentences about men for each sentence about women.' % (men_sentences / women_sentences))

30.4% gendered sentences
56138 sentences about men.
14956 sentences about women.
3.8 sentences about men for each sentence about women.


There are more sentences about men than about women, which suggests that more news stories have men as their subject. We can analyze this further after seeing the results of the word analysis.

In [15]:
# print analysis result of 50 distinctive male and female words
header = 'Ratio\tMale\tFemale\tWord'
TOP_N = 50       # number of words listed per table
MAX_RATIO = 100  # printed when the ratio's denominator is zero


def print_distinctive_words(title, male_leaning):
    """Print a table of the TOP_N words most skewed towards one gender.

    title: line printed above the table header.
    male_leaning: True ranks words by descending ``male_percent`` and prints
        male_percent / (1 - male_percent); False ranks them ascending and
        prints the inverse ratio.

    Each row shows the ratio, the word's count in male sentences, its count
    in female sentences, and the word itself. Both tables use the same
    unpadded count format.
    """
    print(title)
    print(header)
    ranked = sorted(male_percent, key=male_percent.get, reverse=male_leaning)[:TOP_N]
    for word in ranked:
        male_share = male_percent[word]
        female_share = 1 - male_share
        if male_leaning:
            numerator, denominator = male_share, female_share
        else:
            numerator, denominator = female_share, male_share
        try:
            ratio = numerator / denominator
        except ZeroDivisionError:
            # the share for the other gender is zero; use the cap instead
            ratio = MAX_RATIO
        print('%.1f\t%d\t%d\t%s' % (ratio, word_freq['male'].get(word, 0), word_freq['female'].get(word, 0), word))


print_distinctive_words('Male words', male_leaning=True)
print('\n'*2)
print_distinctive_words('Female words', male_leaning=False)

Male words
Ratio	Male	Female	Word
6.6	203	08	shooting
5.4	144	07	spill
4.0	216	14	soldier
3.9	163	11	remanded
3.8	233	16	gun
3.7	324	23	army
3.3	255	20	football
3.2	535	43	shot
3.2	318	26	defence
3.0	287	25	oil
2.9	259	23	armed
2.9	179	16	laws
2.8	183	17	al
2.8	204	19	soldiers
2.8	381	36	charges
2.7	335	32	military
2.7	331	32	custody
2.7	224	22	club
2.6	223	22	economy
2.6	190	19	crisis
2.6	491	50	security
2.5	156	16	growth
2.5	181	19	leaders
2.5	539	57	prime
2.3	188	21	leadership
2.3	178	20	attempted
2.3	1249	143	arrested
2.2	153	18	chancellor
2.2	297	35	coalition
2.2	195	23	actions
2.2	169	20	ministers
2.2	574	69	chief
2.1	148	18	airport
2.1	155	19	video
2.1	224	28	deputy
2.0	322	41	executive
2.0	382	49	war
2.0	184	24	build
2.0	152	20	served
1.9	201	27	colleagues
1.9	192	26	stabbed
1.9	286	39	foreign
1.9	314	43	forces
1.9	182	25	stand
1.9	167	23	criminal
1.9	145	20	peace
1.9	304	42	spending
1.8	141	20	e
1.8	141	20	politics
1.8	218	31	remain



Female words
Ratio	Male	Female	Word
57.5	

Among the most distinctive male words is 'spill' (ranked second in the table above, after 'shooting'). This word is most likely related to the Deepwater Horizon oil spill, one of the largest oil spills in history, which occurred in 2010. Based on this analysis, the majority of subjects in reporting on oil spills are male, whether as the cause of the spill, the person trying to solve the problem, the victim, etc.

Other highly distinctive male words include 'shooting', 'gun', 'soldier', and 'shot'. This illustrates that men were more often associated with shooting incidents and with news about soldiers.

The rest of the most distinctive male words are related to war, crime, and, to a lesser extent, politics, all of which are commonly reported by international news outlets.

The word 'football' ranks 7th among the most distinctive male words in the table above and is the only sport-related word. This illustrates how much male UK citizens like football as their sport.

In the list of most distinctive female words, we can see that the top 5 words are all related to pregnancy and babies. This makes sense because women are more directly involved in these topics than men.

The rest of the most distinctive female words are generally related to health.