In [None]:
## Importing relevant libraries

import pandas as pd
import numpy as np

## Stop warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
## Loading the data

df = pd.read_csv(r'data\articles_sentiment.csv')
print(df.shape)
df.head()

(142426, 7)


Unnamed: 0,title,content,author,publication,content_lemmatized,sentiment,compound_score
0,house republicans fret winning health care sui...,washington congressional republicans new fear ...,Carl Hulse,New York Times,washington congressional republican new fear c...,positive,0.6497
1,rift officers residents killings persist south...,bullet shells get counted blood dries votive c...,Benjamin Mueller and Al Baker,New York Times,bullet shell get counted blood dry votive cand...,negative,-0.9999
2,tyrus wong bambi artist thwarted racial bias d...,walt disney bambi opened critics praised spare...,Margalit Fox,New York Times,walt disney bambi opened critic praised spare ...,positive,0.9888
3,among deaths heavy toll pop music new york times,death may great equalizer necessarily evenhand...,William McDonald,New York Times,death may great equalizer necessarily evenhand...,negative,-0.8609
4,kim jong un says north korea preparing test lo...,seoul south korea north korea leader kim said ...,Choe Sang-Hun,New York Times,seoul south korea north korea leader kim said ...,positive,0.9789


In [3]:
## Checking for gendered words in the content column

content = df['content']

print('Occurences of words:')
print('he: ', content.str.contains(' he ').sum())
print('she: ', content.str.contains(' she ').sum())
print('his: ', content.str.contains(' his ').sum())
print('her: ', content.str.contains(' her ').sum())
print('mr: ', content.str.contains(' mr ' or ' mr. ').sum())
print('mrs: ', content.str.contains(' mrs ' or ' mrs. ').sum())

Occurences of words:
he:  105595
she:  52130
his:  95808
her:  49116
mr:  13503
mrs:  2404


In [4]:
## Listing gendered words

gender_keywords = {'male': [' he ', ' his ', ' him ', ' himself ', ' hes ', ' man ', ' mr. ', ' mr ', ' men '],
                   'female': [' she ', ' her ', ' hers ', ' herself ', ' shes ', ' woman ', ' ms. ', ' ms ', ' mrs. ', ' mrs ', ' women ']}

In [5]:
## Function to figure out which gender the news article is about

def gender_analysis(text):
    
    # Counting the number of gendered words for male and female
    male_count_keyword = sum(text.count(word) for word in gender_keywords['male'])
    female_count_keyword = sum(text.count(word) for word in gender_keywords['female'])

    # Figuring out the dominating gender by comparing the counts
    # A gender is only considered dominating when it is occured at least 25% more frequent than the less frequent gender
    if male_count_keyword >= 1.25 * female_count_keyword:
        return 'male'
    elif female_count_keyword >= 1.25 * male_count_keyword:
        return 'female'
    else:
        return 'neutral'
    

In [6]:
## Applying the function to the content column

df['gender'] = np.nan

for i in range(len(df)):
    df['gender'][i] = gender_analysis(df['content'][i])

df.head()

Unnamed: 0,title,content,author,publication,content_lemmatized,sentiment,compound_score,gender
0,house republicans fret winning health care sui...,washington congressional republicans new fear ...,Carl Hulse,New York Times,washington congressional republican new fear c...,positive,0.6497,male
1,rift officers residents killings persist south...,bullet shells get counted blood dries votive c...,Benjamin Mueller and Al Baker,New York Times,bullet shell get counted blood dry votive cand...,negative,-0.9999,male
2,tyrus wong bambi artist thwarted racial bias d...,walt disney bambi opened critics praised spare...,Margalit Fox,New York Times,walt disney bambi opened critic praised spare ...,positive,0.9888,male
3,among deaths heavy toll pop music new york times,death may great equalizer necessarily evenhand...,William McDonald,New York Times,death may great equalizer necessarily evenhand...,negative,-0.8609,male
4,kim jong un says north korea preparing test lo...,seoul south korea north korea leader kim said ...,Choe Sang-Hun,New York Times,seoul south korea north korea leader kim said ...,positive,0.9789,male


In [7]:
## gender column value counts

df['gender'].value_counts()

male       109798
female      26131
neutral      6497
Name: gender, dtype: int64

In [8]:
## saving the data

df.to_csv(r'data\articles_gender.csv', index=False)