# Coverage Biases
All calculations related to differences in coverage between male and female politicians on Reddit.

In [1]:
import pandas as pd
from ast import literal_eval
import glob
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import norm
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt

In [None]:
def cohend(d1, d2):
    """Return Cohen's d effect size for two independent samples.

    The difference of the sample means (``d1`` minus ``d2``) is divided by
    the pooled standard deviation, which is built from the unbiased
    (``ddof=1``) sample variances weighted by their degrees of freedom.
    """
    size_a = len(d1)
    size_b = len(d2)
    var_a = np.var(d1, ddof=1)
    var_b = np.var(d2, ddof=1)
    # pooled standard deviation of the two samples
    weighted_var = (size_a - 1) * var_a + (size_b - 1) * var_b
    pooled_sd = np.sqrt(weighted_var / (size_a + size_b - 2))
    return (np.mean(d1) - np.mean(d2)) / pooled_sd

In [None]:
# Colours: hex codes shared by the plots in this notebook
WOMEN = "#ff876d"  # line / box colour for female politicians
WOMEN_POS = "#ffcea8"  # NOTE(review): presumably a lighter variant for women; not used in the cells shown here
WOMEN_NEG = "#cb4d36"  # NOTE(review): presumably a darker variant for women; not used in the cells shown here
MEN = "#009ce4"  # line / box colour for male politicians
MEN_POS = "#76efff"  # alternative value that was tried: "#b8e1ff"
MEN_NEG = "#056497"  # NOTE(review): presumably a darker variant for men; not used in the cells shown here
ACCENT = "#d2518f"  # median-line colour in the box plots
ACCENT2 = "#987ad3"  # second accent colour; not used in the cells shown here

In [None]:
# Load the comment dataset into `df`.
# Alternative single-file input:
# df = pd.read_csv("C:/Users/vanki/partisan.csv")
dfs = []
for file in glob.glob("C:/Users/vanki/201*.csv"):  # one CSV per matched file
    dfs.append(pd.read_csv(file))
if not dfs:
    # pd.concat([]) would fail with a less helpful "No objects to concatenate"
    raise FileNotFoundError("No input files matched C:/Users/vanki/201*.csv")

# Stack all files into a single frame with a fresh, unique index.
df = pd.concat(dfs, ignore_index=True)
# "Unnamed: 0" is presumably an index column saved by an earlier to_csv; drop it when present.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
# Optional clean-up of corrupted rows, left disabled:
# df.dropna(subset=['Adjectives'], inplace=True) # somekind of mistake

# Keep only comments whose politician is recorded as male or female.
df = df[df['sex'].isin(['male', 'female'])]
# keep=False removes EVERY row whose id occurs more than once, so only
# comments that talk about ONE politician remain.
df = df.drop_duplicates('id', keep=False)

In [None]:
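# # To look at cross-partisan subsets only.
# # NOTE: the "Cross-partisan" cells further down index df['group'], which this cell creates;
# # enable it first unless the loaded data already contains a 'group' column.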
# left = ['Liberal', 'SocialDemocracy', 'socialism', 'alltheleft', 'neoliberal', 'democrats']
# right = ['Libertarian', 'Conservative', 'Republican']
# alt_right = ['The_Donald']

# full_df = df
# maps = {}
# for sr in left:
#     maps[sr] = 'left'
# for sr in right:
#     maps[sr] = 'right'
# maps['The_Donald'] = 'alt_right'
# df['group'] = df['subreddit'].map(maps)
# df.dropna(subset=['group'], inplace=True)

## Number of comments
Significance calculated in RStudio

In [None]:
## First check wikidata for bias
import json

# Wikidata collection of politicians. The encoding is given explicitly so the
# file is read the same way regardless of the platform's default encoding.
with open('politician_data_final.json', encoding='utf-8') as f:
    wikidata = json.load(f)

# Entry ids split by the recorded 'sex_or_gender' value; entries with any
# other value end up in neither list.
all_females = [entry['id'] for entry in wikidata if entry['sex_or_gender'] == 'female']
all_males = [entry['id'] for entry in wikidata if entry['sex_or_gender'] == 'male']
print(len(all_females), len(all_males))

In [None]:
# Number of comments about women, then about men
for sex in ('female', 'male'):
    print(len(df[df['sex'] == sex]))

### Cross-partisan

In [None]:
# Share of comments about women within each partisan group
for label, key in (("Left", 'left'), ("Right", 'right'), ("Alt-right", 'alt_right')):
    in_group = df[df['group'] == key]
    n_female = in_group[in_group['sex'] == 'female'].shape[0]
    print(label, n_female / in_group.shape[0])

## Number of comments per entity

In [None]:
# Number of comments (non-null 'body' values) per 'NEL' entity, split by sex
is_male = df['sex'] == 'male'
is_female = df['sex'] == 'female'
men_ct = df.loc[is_male].groupby('NEL')['body'].count()
fem_ct = df.loc[is_female].groupby('NEL')['body'].count()

In [None]:
# Non-parametric rank test (Mann-Whitney U, two-sided) comparing the
# per-entity comment counts of men and women
from scipy.stats import mannwhitneyu

mannwhitneyu(men_ct, fem_ct, alternative='two-sided')

In [None]:
# Non-parametric test: two-sample Kolmogorov-Smirnov on the same per-entity
# comment counts of men and women

from scipy.stats import ks_2samp
ks_2samp(men_ct, fem_ct)

In [None]:
## Visualization: complementary CDF of the number of comments per entity
# Each curve comes from a 50000-bin histogram. After bin i the running total
# covers every count below that bin's RIGHT edge, so 1 - cumulative[i] is the
# share of entities at or beyond base[i + 1]. The curve is therefore plotted
# against the right edges (base[1:]); pairing it with the left edges would
# shift it one bin to the left.
for counts, colour, label in ((men_ct, MEN, "Men"), (fem_ct, WOMEN, "Women")):
    values, base = np.histogram(counts, bins=50000)
    cumulative = np.cumsum(values) / values.sum()
    plt.plot(base[1:], 1 - cumulative, c=colour, label=label)
plt.xlabel("In-degree (D)")
plt.title("Cumulative CDFs of Entity In-degree")
plt.ylabel("P(d > D)")
plt.xscale("log")
plt.yscale("log")
plt.legend()
plt.savefig("CDFs.png")
plt.show()

In [None]:
# # Look only at entities with fewer than THRESHOLD (5000) comments?
# THRESHOLD = 5000
# l_mens = mens.groupby('NEL').filter(lambda group: len(group) < THRESHOLD)
# l_fems = fems.groupby('NEL').filter(lambda group: len(group) < THRESHOLD)
# values, base = np.histogram(l_mens.groupby('NEL').count()['body'], bins=150)
# cumulative = np.cumsum(values)/values.sum()
# ### plot the cumulative function
# plt.plot(base[:-1], 1-cumulative, c='blue', label="Men")
# values, base = np.histogram(l_fems.groupby('NEL').count()['body'], bins=150)
# cumulative = np.cumsum(values)/values.sum()
# ## plot the cumulative function
# plt.plot(base[:-1], 1-cumulative, c='red', label= "Women")
# plt.xlabel("In-degree")
# plt.xscale("log")
# plt.ylabel("P(d > D)")
# plt.yscale("log")
# plt.legend()
# plt.show()

### Cross-partisan

In [None]:
# Cross-partisan: per-entity comment counts for every (group, sex) pair.
# Order of cts: left/male, left/female, right/male, right/female,
# alt_right/male, alt_right/female.
cts = []
for group in ['left', 'right', 'alt_right']:
    in_group = df['group'] == group
    for gender in ['male', 'female']:
        subset = df[(df['sex'] == gender) & in_group]
        cts.append(subset.groupby('NEL')['body'].count())

In [None]:
# In-degree of politicians: one complementary-CDF panel per partisan group
groups = ["Left", "Right", "Alt-Right"]
fig, ax = plt.subplots(1, 3, figsize=(15, 5), sharey=True, sharex=True)
fig.suptitle('Cumulative CDFs of Entity In-degree Across Partisanship', size=16)
# cts stores the counts as consecutive [men, women] pairs, one pair per group,
# in the same order as `groups`.
for i, title in enumerate(groups):
    for counts, color, label in ((cts[2 * i], MEN, "Men"), (cts[2 * i + 1], WOMEN, "Women")):
        values, base = np.histogram(counts, bins=50000)
        cumulative = np.cumsum(values) / values.sum()
        # 1 - cumulative[i] is the share at or beyond the RIGHT edge of bin i,
        # so it is plotted against base[1:] rather than the left edges.
        ax[i].plot(base[1:], 1 - cumulative, c=color, label=label)
    ax[i].set_title(title)
    ax[i].set_xlabel("In-degree (D)")
    ax[i].set_yscale("log")
    ax[i].set_xscale("log")
    ax[i].grid(alpha=0.3)

# The y axis is shared, so only the first panel gets the label.
ax[0].set_ylabel("P(d > D)")
ax[2].legend()
plt.savefig("CDFs_cross.png")
plt.show()

## Comment length
Partisan significance calculated in RStudio

In [None]:
# Comment length in words. str.split() with no argument splits on any run of
# whitespace, so newlines and tabs separate words and repeated spaces do not
# add empty "words" (as split(' ') would). This is the same definition as the
# 'length' column computed later in this notebook.
df['len'] = df.body.map(lambda x: len(x.split()))
# singles[['group','sex','len']].to_csv("lens.csv", index=False)

In [None]:
# Mean and standard deviation of comment length per sex, for reporting.
# Computed over all loaded comments (women first, then men). No partisan
# filter is applied here, so the result does not depend on a loop variable
# left over from another cell; per-group figures are printed in the
# cross-partisan section below.
for sex in ('female', 'male'):
    lengths = df[df['sex'] == sex]['len']
    print(lengths.mean(), lengths.std())

In [None]:
# Word count per comment (split on any whitespace)
df['length'] = df.body.map(lambda x: len(x.split()))
print('COVERAGE BIASES')
women_len = df[df.sex == 'female'].length
men_len = df[df.sex == 'male'].length
# Note: the value after "+-" is the standard deviation of the male lengths only
print("Female", women_len.mean(), "Male", men_len.mean(), "+-", men_len.std())
# Two-sample t-test and Cohen's d (female vs. male) on the non-missing lengths
women_obs = women_len.dropna()
men_obs = men_len.dropna()
print(ttest_ind(women_obs, men_obs), cohend(women_obs, men_obs))
##### CALCULATE FOR SUBSET IN R TO DO 2-WAY ANOVAS
partisan_rows = df.dropna(subset=['group'])
partisan_rows[['length', 'sex', 'group']].to_csv("partisan_lengths.csv")

In [None]:
import matplotlib.patches as mpatches

# One vector of comment lengths per (group, sex) pair, men first within each
# group, with the matching box colour stored at the same index. The data are
# read from `df`, the frame that carries the 'len' column.
vecs = []
colors = []
for g in ['left', 'right', 'alt_right']:
    for s in ['male', 'female']:
        vecs.append(df[(df['sex'] == s) & (df['group'] == g)]['len'])
        colors.append(MEN if s == 'male' else WOMEN)

plt.figure(figsize=(12, 7))
for i, (vec, colour) in enumerate(zip(vecs, colors)):
    # x positions 1,2 | 4,5 | 7,8: a men/women pair per group, one empty slot between groups
    loc = 3 * (i // 2) + (i % 2) + 1
    plt.boxplot(vec, positions=[loc], widths=[0.5], showfliers=False, patch_artist=True,
                boxprops=dict(color=colour, alpha=0.8, facecolor=colour),
                medianprops=dict(color=ACCENT),
                whiskerprops=dict(color=colour), capprops=dict(color=colour))
# one tick per group, centred between its two boxes
plt.xticks([1.5, 4.5, 7.5], ['Left', 'Right', 'Alt right'], size=16)

mn_patch = mpatches.Patch(color=MEN, label='Men')
wmn_patch = mpatches.Patch(color=WOMEN, label='Women')
legend = plt.legend(handles=[mn_patch, wmn_patch], loc="upper right", borderpad=0.6)
# colour each legend label like its patch
for text, colour in zip(legend.get_texts(), (MEN, WOMEN)):
    text.set_color(colour)
    text.set_size(16)
plt.margins(y=0.2)
# plt.yticks([0.,0.2,0.4,0.6,0.8,1.0])
plt.ylim(0)
plt.ylabel("Comment length", size=14)
plt.title("Length of comment across the partisan/gender divides", size=16)
plt.savefig("figures/comment_length.tiff", dpi=300)
plt.show()

# mean length and number of comments behind each box, in plotting order
for vec in vecs:
    print(vec.mean(), vec.shape[0])

### Cross-partisan

In [None]:
## Mean and standard deviation of comment length within each partisan group
for group in ['left', 'right', 'alt_right']:
    print(group)
    in_group = df[df['group'] == group]
    # women first, then men
    for sex in ('female', 'male'):
        lengths = in_group[in_group['sex'] == sex]['len']
        print(lengths.mean(), lengths.std())

In [None]:
## Significance test: female vs. male comment length within each partisan group
for group in ['left', 'right', 'alt_right']:
    in_group = df[df['group'] == group]
    women = in_group.loc[in_group['sex'] == 'female', 'len']
    men = in_group.loc[in_group['sex'] == 'male', 'len']
    print(ttest_ind(women, men))

In [None]:
# Visualize: box plot of comment length per partisan group and sex

# Lengths and box colours are collected together, men first within each
# group, so this cell does not rely on a colour list built by another cell.
lens = []
colors = []
plt.figure(figsize=(10, 5))
for group in ['left', 'right', 'alt_right']:
    for sex, colour in (('male', MEN), ('female', WOMEN)):
        lens.append(df[(df['sex'] == sex) & (df['group'] == group)]['len'])
        colors.append(colour)

for i, (vec, colour) in enumerate(zip(lens, colors)):
    # x positions 1,2 | 4,5 | 7,8: a men/women pair per group, one empty slot between groups
    loc = 3 * (i // 2) + (i % 2) + 1
    plt.boxplot(vec, positions=[loc], widths=[0.5], patch_artist=True,
                boxprops=dict(color=colour, alpha=0.8, facecolor=colour),
                medianprops=dict(color=ACCENT),
                whiskerprops=dict(color=colour), capprops=dict(color=colour),
                showfliers=False)
# one tick per group, centred between its two boxes
plt.xticks([1.5, 4.5, 7.5], ['Left', 'Right', 'Alt right'], size=14)

mn_patch = mpatches.Patch(color=MEN, label='Men')
wmn_patch = mpatches.Patch(color=WOMEN, label='Women')
legend = plt.legend(handles=[mn_patch, wmn_patch], loc="upper right", borderpad=0.6,
                    edgecolor='white')
# colour each legend label like its patch
for text, colour in zip(legend.get_texts(), (MEN, WOMEN)):
    text.set_color(colour)
    text.set_size(16)

plt.title("Length of comments across partisan/gender divide", size=16)
plt.savefig("Lengths.png")
plt.show()