In [1]:
import pandas as pd
import json
import nltk
import re

In [2]:
# Load data
fileName = './bbc_news_list_uk.json'
cols = ['news_post_date', 'language', 'content']

with open(fileName, 'r') as f:
  data = json.loads(f.read())

df = pd.DataFrame(data=data, columns=cols)
print (len(df))
df.head()

15825


Unnamed: 0,news_post_date,language,content
0,2010-08-06T02:46:40.000Z,en_GB,The heroin substitute methadone can be used as...
1,2010-10-12T08:54:42.000Z,en_GB,The eldest son of North Korean leader Kim Jong...
2,2010-10-27T17:35:24.000Z,en_GB,Seven oil paintings created by notorious gangs...
3,2010-10-08T13:21:50.000Z,en_GB,A 20-tonne bridge is being hauled into place b...
4,2010-09-22T17:35:34.000Z,en_GB,The final words written by a guardsman killed ...


In [3]:
# Clean data
df = df.drop(df[df['news_post_date'].str[:4] != '2010'].index)
df = df.drop(df[df['language'] != 'en_GB'].index)
df = df.dropna(axis=0)
print (len(df))

15819


In [4]:
# Two lists  of words that are used when a man or woman is present, based on Danielle Sucher's https://github.com/DanielleSucher/Jailbreak-the-Patriarchy
male_words=set(['guy','spokesman','chairman',"men's",'men','him',"he's",'his','boy','boyfriend','boyfriends','boys','brother','brothers','dad','dads','dude','father','fathers','fiance','gentleman','gentlemen','god','grandfather','grandpa','grandson','groom','he','himself','husband','husbands','king','male','man','mr','nephew','nephews','priest','prince','son','sons','uncle','uncles','waiter','widower','widowers'])
female_words=set(['heroine','spokeswoman','chairwoman',"women's",'actress','women',"she's",'her','aunt','aunts','bride','daughter','daughters','female','fiancee','girl','girlfriend','girlfriends','girls','goddess','granddaughter','grandma','grandmother','herself','ladies','lady','lady','mom','moms','mother','mothers','mrs','ms','niece','nieces','priestess','princess','queens','she','sister','sisters','waitress','widow','widows','wife','wives','woman'])

In [5]:
def gender_the_sentence(sentence_words):
    mw_length=len(male_words.intersection(sentence_words))
    fw_length=len(female_words.intersection(sentence_words))

    if mw_length>0 and fw_length==0:
        gender='male'
    elif mw_length==0 and fw_length>0: 
        gender='female'
    elif mw_length>0 and fw_length>0: 
        gender='both'
    else:
        gender='none'
    return gender

In [6]:
def is_it_proper(word):
        if word[0]==word[0].upper():
            case='upper'
        else:
            case='lower'
        
        word_lower=word.lower()
        try:
            proper_nouns[word_lower][case] = proper_nouns[word_lower].get(case,0)+1
        except Exception:
            #This is triggered when the word hasn't been seen yet
            proper_nouns[word_lower]= {case:1}

In [7]:
def increment_gender(sentence_words,gender):
    sentence_counter[gender]+=1
    word_counter[gender]+=len(sentence_words)
    for word in sentence_words:
        word_freq[gender][word]=word_freq[gender].get(word,0)+1

In [8]:
sexes=['male','female','none','both']
sentence_counter={sex:0 for sex in sexes}
word_counter={sex:0 for sex in sexes}
word_freq={sex:{} for sex in sexes}
proper_nouns={}

In [40]:
w = 'asd.asd  asdasd ! asd!'
re.split('\W+', w)

['asd', 'asd', 'asdasd', 'asd', '']

In [10]:
tokenizer = nltk.data.load('./nltk_data/tokenizers/punkt/english.pickle')

for index, row in df.iterrows():

  #Split into sentences
  sentences = tokenizer.tokenize(row['content'])
  
  for sentence in sentences:
      #word tokenize and strip punctuation
      sentence_words = re.split('\W+', sentence)
      
      #figure out how often each word is capitalized
      [is_it_proper(word) for word in sentence_words[1:] if len(word) > 0]

      #lower case it
      sentence_words = set([w.lower() for w in sentence_words])
      
      #Figure out if there are gendered words in the sentence by computing the length of the intersection of the sets
      gender = gender_the_sentence(sentence_words)

      #Increment some counters
      increment_gender(sentence_words,gender)

In [11]:
proper_nouns = set([word for word in proper_nouns if  
                  proper_nouns[word].get('upper',0) / 
                  (proper_nouns[word].get('upper',0) + 
                   proper_nouns[word].get('lower',0))>.50])

In [12]:
common_words = set([w for w in sorted (word_freq['female'], key = word_freq['female'].get, reverse = True)[:1000]]
                  + [w for w in sorted (word_freq['male'], key = word_freq['male'].get, reverse = True)[:1000]])

common_words = list(common_words-male_words-female_words-proper_nouns)

In [13]:
male_percent = {
  word: (word_freq['male'].get(word,0) / word_counter['male']) / (word_freq['female'].get(word,0) / word_counter['female']+word_freq['male'].get(word,0)/word_counter['male']) for word in common_words
}

In [14]:
print ('%.1f%% gendered' % (100*(sentence_counter['male']+sentence_counter['female'])/
                           (sentence_counter['male']+sentence_counter['female']+sentence_counter['both']+sentence_counter['none'])))
print ('%s sentences about men.' % sentence_counter['male'])
print ('%s sentences about women.' % sentence_counter['female'])
print ('%.1f sentences about men for each sentence about women.' % (sentence_counter['male']/sentence_counter['female']))

35.8% gendered
34417 sentences about men.
8109 sentences about women.
4.2 sentences about men for each sentence about women.


In [16]:
header ='Ratio\tMale\tFemale\tWord'
print ('Male words')
print (header)
for word in sorted (male_percent, key = male_percent.get, reverse = True)[:50]:
    try:
        ratio = male_percent[word] / (1 - male_percent[word])
    except:
        ratio = 100
    print ('%.1f\t%02d\t%02d\t%s' % (ratio,word_freq['male'].get(word,0),word_freq['female'].get(word,0),word))

print ('\n'*2)
print ('Female words')
print (header)
for word in sorted (male_percent,key=male_percent.get,reverse=False)[:50]:
    try:
        ratio=(1-male_percent[word])/male_percent[word]
    except:
        ratio=100
    print ('%.1f\t%01d\t%01d\t%s' % (ratio,word_freq['male'].get(word,0),word_freq['female'].get(word,0),word))


Male words
Ratio	Male	Female	Word
5.9	200	07	spill
4.9	238	10	shooting
3.9	247	13	gun
3.8	236	13	soldier
3.4	488	30	shot
3.4	211	13	al
2.9	1055	75	arrested
2.9	406	29	defence
2.9	374	27	army
2.9	692	50	prime
2.8	322	24	football
2.8	199	15	elections
2.7	310	24	soldiers
2.6	214	17	operations
2.5	293	24	leaders
2.5	292	24	custody
2.5	481	40	military
2.4	200	17	fighting
2.4	702	61	security
2.4	442	39	oil
2.3	234	21	value
2.3	295	27	ministers
2.2	238	22	growth
2.2	351	33	denied
2.2	329	31	armed
2.2	411	39	charges
2.2	239	23	convicted
2.1	474	46	forces
2.1	328	32	official
2.1	327	32	tax
2.1	295	29	killing
2.1	324	32	troops
2.1	575	57	accused
2.1	258	26	crisis
2.1	208	21	correspondent
2.1	316	32	bail
2.0	236	24	club
2.0	226	23	chancellor
2.0	195	20	published
2.0	438	45	foreign
2.0	271	28	suspicion
2.0	338	35	alleged
1.9	467	50	spending
1.9	504	54	war
1.9	224	24	criminal
1.9	274	30	jailed
1.9	219	24	allegations
1.9	200	22	deficit
1.9	215	24	arrest
1.9	250	28	connection



Female words
Ratio	Ma

In [17]:
all_words=[w for w in word_freq['none']]+[w for w in word_freq['both']]+[w for w in word_freq['male']]+[w for w in word_freq['female']]
all_words={w:(word_freq['male'].get(w,0)+word_freq['female'].get(w,0)+word_freq['both'].get(w,0)+word_freq['none'].get(w,0)) for w in set(all_words)}

print ('word\tMale\tFemale')
for word in sorted (all_words,key=all_words.get,reverse=True)[:100]:
    print ('%s\t%.1f%%\t%.1f%%' % (word,100*word_freq['male'].get(word,0)/sentence_counter['male'],100*word_freq['female'].get(word,0)/sentence_counter['female']))

word	Male	Female
	99.9%	99.9%
the	80.0%	75.8%
to	62.5%	59.5%
of	57.7%	51.7%
and	56.5%	54.5%
a	58.4%	54.4%
in	54.4%	48.6%
said	45.6%	40.9%
is	28.2%	26.6%
for	31.2%	29.4%
that	30.6%	26.7%
on	32.5%	27.6%
was	38.3%	33.4%
it	24.9%	24.8%
s	27.5%	25.1%
he	59.2%	0.0%
have	20.6%	19.7%
with	23.2%	20.8%
be	20.9%	18.3%
at	23.2%	21.3%
has	23.4%	20.5%
by	20.1%	16.2%
from	19.2%	18.1%
are	14.6%	14.6%
been	20.1%	17.1%
as	19.3%	16.9%
but	17.6%	15.7%
not	15.9%	14.1%
had	19.1%	16.5%
this	12.7%	12.0%
will	12.5%	11.0%
were	15.2%	13.2%
an	16.2%	13.7%
they	12.1%	12.0%
we	11.4%	11.9%
who	14.8%	14.2%
which	11.8%	9.8%
his	26.7%	0.0%
i	10.1%	12.1%
mr	27.0%	0.0%
people	9.2%	9.2%
their	8.9%	10.4%
would	12.0%	9.7%
year	11.1%	10.4%
there	8.8%	9.1%
also	10.1%	8.7%
after	11.6%	10.3%
about	10.0%	9.8%
more	7.7%	7.5%
one	8.7%	8.2%
police	10.0%	7.4%
up	7.9%	7.7%
out	8.1%	7.2%
its	6.7%	5.0%
when	8.6%	7.9%
two	8.1%	6.4%
all	6.4%	6.0%
or	6.3%	6.0%
she	0.0%	54.0%
new	6.2%	5.3%
government	6.7%	4.6%
over	6.5%	5.3%
being	6.8%	6.4