In [1]:
import data_io
import pandas as pd
import numpy as np
import regex as re
import utils as u

Read in manually coded datasets

In [14]:
# Display names of the universities whose coded 2019 spreadsheets are read below.
schools = ['Princeton', 'Harvard', 'Yale', 'Columbia','MIT']

# One DataFrame per school, keyed by the school's display name; the file name
# uses the lower-cased school name.
dfs = {
    s: pd.read_excel(data_io.DATA+f"{s.lower()}_data_2019_formatted.xlsx")
    for s in schools
}

In [15]:
# For every school, print the row count, keep only the rows from index label 1
# onward, then print the row count again.
for school, frame in dfs.items():
    print(len(frame))
    dfs[school] = frame.loc[1:, :]
    print(len(dfs[school]))

229
228
196
195
135
134
164
163
182
181


In [5]:
# Columns that hold the per-picture coding (plus the article key).
image_cols = [
    'article_id',
    'first_researcher_pictured',
    'num_male_researchers_pictured',
    'num_female_researchers_pictured',
    'num_other_researchers_pictured',
    'num_women_pictured',
    'num_men_pictured',
    'num_other_people_pictured',
]

# Columns that hold the article-text coding (plus the article key).
article_text_cols = [
    'article_id',
    'gender_first_researcher',
    'num_women_mentioned',
    'num_men_mentioned',
    'num_others_mentioned',
]

In [6]:
def check_completeness_text(df):
    """Return the articles whose first-researcher gender has not been coded.

    Rows without an ``article_id`` are discarded, the remaining rows are reduced
    to one row per article (the first occurrence is kept), and the rows whose
    ``gender_first_researcher`` is null are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``article_id`` and ``gender_first_researcher``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per article that has no ``gender_first_researcher`` value;
        empty when every article is coded.
    """
    # Chain the two steps so that the de-duplication runs on the frame that has
    # already lost its id-less rows (previously the dropna result was discarded).
    one_row_per_article = (
        df.dropna(subset = ['article_id'])
          .drop_duplicates(subset = ['article_id'], keep = 'first')
    )
    gender_missing = one_row_per_article['gender_first_researcher'].isnull()
    return one_row_per_article[gender_missing]

def check_completeness_photos(df):
    """Return the picture-coding rows that have no ``first_researcher_pictured`` value.

    Only the columns listed in the module-level ``image_cols`` are kept in the
    result. The input frame is copied first and is not modified.
    """
    photo_coding = df.copy()[image_cols]
    not_coded = photo_coding['first_researcher_pictured'].isna()
    return photo_coding[not_coded]
    


In [7]:
# Containers for the completeness-check results (text coding and image coding).
incomplete_text, incomplete_photos = {}, {}

In [8]:
# Renumber each school's rows from 0, then store and report how many rows fail
# the text check and the image check.
for k in dfs:
    dfs[k] = dfs[k].reset_index(drop = True)

    incomplete_text[k] = check_completeness_text(dfs[k])
    print(k, ' number w incomplete text coding: ')
    print(len(incomplete_text[k]))

    incomplete_photos[k] = check_completeness_photos(dfs[k])
    print(k, ' number w incomplete image coding: ')
    print(len(incomplete_photos[k]))

Princeton  number w incomplete text coding: 
0
Princeton  number w incomplete image coding: 
0
Harvard  number w incomplete text coding: 
0
Harvard  number w incomplete image coding: 
0
Yale  number w incomplete text coding: 
1
Yale  number w incomplete image coding: 
1
Columbia  number w incomplete text coding: 
0
Columbia  number w incomplete image coding: 
2
MIT  number w incomplete text coding: 
0
MIT  number w incomplete image coding: 
0


Look at incomplete articles:

In [10]:
# Show every Yale row that belongs to the first article flagged by the text check.
yale = dfs["Yale"]
yale.loc[yale['article_id'].eq(incomplete_text['Yale'].reset_index().at[0, 'article_id'])]

Unnamed: 0,article_id,article_title,article_link,article_text,article_date,gender_first_researcher,num_women_mentioned,num_men_mentioned,num_others_mentioned,image_alt_text,...,comments,image_location,image_links,article_date_scrape,article_title_scrape,url_redirect,PAGE_TYPE,source,in_og,new_article_text
31,yale_220,Creativity in motion: how Yale engineering put...,https://news.yale.edu/in-focus/creativity-moti...,,NO DATE,,0,1,0,,...,no researchers mentioned in the article,,,"August 28, 2019",Creativity in motion: how Yale engineering put...,https://news.yale.edu/in-focus/creativity-moti...,article,old,,


In [11]:
# Two pictures were not saved because they do not render on the webpage;
# show the rows that therefore have no 'first_researcher_pictured' value.
columbia = dfs['Columbia']
columbia[columbia['first_researcher_pictured'].isna()]

Unnamed: 0,article_id,article_title,article_link,article_date,new_article_text,gender_first_researcher,num_women_mentioned,num_men_mentioned,num_others_mentioned,image_alt_text,...,num_men_pictured,num_women_pictured,num_other_people_pictured,rater_initials,comments,image_location,article_date_scrape,article_title_scrape,new_article_text.1,article_text
27,columbia_124,,,,,,,,,fish stocks,...,,,,,,/Users/elisabethsilver/Box/SIB/photos/columbia...,,,,
75,columbia_160,,,,,,,,,,...,,,,,can't open the file on ricebox,/Users/elisabethsilver/Box/SIB/photos/columbia...,,,,


In [65]:
def clean_dataset(df_orig, uni = None):
    """Return a cleaned copy of one coding sheet with per-article totals added.

    The input frame is copied and never modified. The returned copy differs
    from the input as follows:

    * If a ``new_article_text`` column exists it becomes ``article_text``
      (an existing ``article_text`` column, if any, is dropped first).
    * ``num_pics``: number of rows that share the row's ``article_id``.
    * ``gender_first_researcher_text``: ``gender_first_researcher`` mapped
      through 0 -> 'Unclear', 1 -> 'Man', 2 -> 'Woman' (other values -> NaN).
    * ``num_men_mentioned`` and ``num_women_mentioned`` are cast to float.
    * ``num_first_res_pics``, ``num_female_res_pics``, ``num_male_res_pics``,
      ``total_men_pics``, ``total_women_pics``: per-``article_id`` sums of
      ``first_researcher_pictured``, ``num_female_researchers_pictured``,
      ``num_male_researchers_pictured``, ``num_men_pictured`` and
      ``num_women_pictured``, stored as int on every row of the article.
    * ``total_people_mentioned``: ``num_women_mentioned`` +
      ``num_men_mentioned`` + ``num_others_mentioned``.
    * ``uni``: set to ``uni`` on every row when a truthy ``uni`` is given.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Coding sheet containing the columns named above.
    uni : str, optional
        Label written to the ``uni`` column; no column is added when omitted.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Notes
    -----
    The integer casts fail (pandas raises) when a summed column contains a
    missing value or when a row has no ``article_id``, so such rows have to be
    removed before calling this function.
    """
    df = df_orig.copy()

    if 'new_article_text' in df.columns:
        # errors='ignore' keeps this working for sheets that have
        # 'new_article_text' but no 'article_text' column to drop.
        df = df.drop(columns = ['article_text'], errors = 'ignore')
        df = df.rename(columns = {'new_article_text': 'article_text'})

    # Rows per article_id, broadcast back onto every row of that article.
    df['num_pics'] = df['article_id'].map(df['article_id'].value_counts())

    gender_dict = {0: 'Unclear',
                  1: 'Man',
                  2: 'Woman'}
    df['gender_first_researcher_text'] = df['gender_first_researcher'].map(gender_dict)

    # New per-article total column -> per-row source column (insertion order
    # is the order in which the new columns are appended).
    summed_cols = {
        'num_first_res_pics': 'first_researcher_pictured',
        'num_female_res_pics': 'num_female_researchers_pictured',
        'num_male_res_pics': 'num_male_researchers_pictured',
        'total_men_pics': 'num_men_pictured',
        'total_women_pics': 'num_women_pictured',
    }
    grouped = df.groupby('article_id')
    # skipna=False: a missing value must poison the article's total (and make
    # the int cast below fail) instead of being silently counted as 0.
    per_article_sums = {
        new_col: grouped[src_col].agg(lambda values: values.sum(skipna = False))
        for new_col, src_col in summed_cols.items()
    }

    df['num_men_mentioned'] = df['num_men_mentioned'].astype(float)
    df['num_women_mentioned'] = df['num_women_mentioned'].astype(float)
    for new_col, sums in per_article_sums.items():
        df[new_col] = df['article_id'].map(sums).astype(int)
    df['total_people_mentioned'] = (df['num_women_mentioned'] + df['num_men_mentioned'] +
                                    df['num_others_mentioned'])

    if uni:
        df['uni'] = uni
    return df

Remove articles that failed to scrape

In [66]:
# One article failed to scrape: keep every Yale row except the rows of the
# first article flagged by the text check.
yale = dfs['Yale']
yale = yale.loc[yale['article_id'].ne(incomplete_text['Yale'].reset_index().at[0, 'article_id'])]

In [67]:
# Two pictures were not saved because they do not render on the webpage:
# drop the rows without a 'first_researcher_pictured' value and renumber from 0.
columbia = (
    dfs['Columbia']
    .dropna(subset=['first_researcher_pictured'])
    .reset_index(drop =True)
)

In [68]:
# Yale and Columbia were filtered in the cells above; the other three schools
# are cleaned straight from the loaded sheets.
yale = clean_dataset(yale, uni = 'yale')
columbia = clean_dataset(columbia, uni = 'columbia')
mit, princeton, harvard = (
    clean_dataset(dfs[school_key], uni = uni_label)
    for school_key, uni_label in (('MIT', 'mit'),
                                  ('Princeton', 'princeton'),
                                  ('Harvard', 'harvard'))
)

In [69]:
# Stack the five cleaned school tables into one table with a fresh 0..n-1 index.
df = pd.concat(
    [mit, princeton, yale, harvard, columbia],
    ignore_index=True,
)

In [70]:
# Among rows that have article text, count those with no 'num_men_mentioned' value.
df.loc[df['article_text'].notna(), 'num_men_mentioned'].isna().sum()

0

In [71]:
# Keep only the rows that have both an article link and a mapped gender label.
tmp = df[df[['article_link', 'gender_first_researcher_text']].notna().all(axis=1)]
tmp

Unnamed: 0,article_id,article_title,article_link,article_date,gender_first_researcher,num_women_mentioned,num_men_mentioned,num_others_mentioned,image_alt_text,image_captions,...,OLD_article_text_w_main_img_caption,needs_recode,url_redirect,PAGE_TYPE,source,in_og,article_id.1,recode,rater_recode_intials,new_article_text.1
0,mit_1452,Enhanced NMR reveals chemical structures in a ...,https://news.mit.edu/2019/nmr-chemical-structu...,"January 18, 2019",1,1.0,4.0,1,MIT chemists have enhanced the resolution of n...,MIT chemists have enhanced the resol...,...,,,,,,,,,,
1,mit_1266,How biomarkers can record and reconstruct clim...,https://news.mit.edu/2019/how-biomarkers-recor...,"December 4, 2019",1,1.0,2.0,0,The composition and location of rock strata he...,The composition and location of rock...,...,,,,,,,,,,
2,mit_666,"From one brain scan, more information for medi...",https://news.mit.edu/2019/training-artificial-...,"June 19, 2019",2,1.0,4.0,0,MIT researchers have developed a system that g...,MIT researchers have developed a sys...,...,,,,,,,,,,
3,mit_549,New approach suggests path to emissions-free c...,https://news.mit.edu/2019/carbon-dioxide-emiss...,"September 16, 2019",1,1.0,2.0,0,In a demonstration of the basic chemical react...,In a demonstration of the basic chem...,...,,,,,,,,,,
5,mit_673,Researchers solve mystery of how gas bubbles f...,https://news.mit.edu/2019/how-gas-bubbles-form...,"June 17, 2019",1,0.0,4.0,0,Series of images from the team’s lab experimen...,Series of images from the team’s lab...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
889,columbia_249,Climate Change Tipping Point Could Be Coming S...,https://science.fas.columbia.edu/news/climate-...,"January 23, 2019",1,1.0,2.0,0,,Limpopo province in South Africa– a semi-arid ...,...,,,,,,,,,,Global carbon emissions reached a record high ...
890,columbia_252,Climate-Driven Fires Could Turn Yellowstone Fo...,https://science.fas.columbia.edu/news/climate-...,"January 17, 2019",1,1.0,1.0,0,,"Lead author Winslow Hansen, at one of his stud...",...,,,,,,,,,,Adapted from a news release by Kelly Tyrrell o...
891,columbia_254,Heating Buildings Leaves a Huge Carbon Footpri...,https://science.fas.columbia.edu/news/heating-...,"January 15, 2019",1,0.0,1.0,0,,Photo: Stephen Downes,...,,,,,,,,,,"As winter weather sets in, the heat kicks on i..."
893,columbia_255,Genes on the Move Help Nose Make Sense of Scents,https://science.fas.columbia.edu/news/genes-on...,"January 9, 2019",1,0.0,3.0,0,,A section of the olfactory epithelium. Cells t...,...,,,,,,,,,,The human nose can distinguish one trillion di...


In [72]:
# Shuffle the rows. The fixed random_state makes the shuffled order (and so the
# row order of anything written out from `tmp` later) the same on every run.
tmp = tmp.sample(frac = 1, random_state = 0)

In [76]:
# Rows whose first-researcher gender was coded as 'Unclear', renumbered from 0.
unclear = (
    tmp.loc[tmp['gender_first_researcher_text'].eq('Unclear')]
       .reset_index(drop = True)
)

In [77]:
# These articles don't mention any researchers
for link in unclear['article_link']:
    print(link)

https://news.mit.edu/2019/qs-world-university-rankings-0226
https://news.mit.edu/2019/graduate-engineering-business-programs-top-rankings-us-news-0312


In [78]:
# Drop the articles that don't mention any researchers
tmp = tmp.loc[tmp['gender_first_researcher_text'].ne('Unclear')]
tmp

Unnamed: 0,article_id,article_title,article_link,article_date,gender_first_researcher,num_women_mentioned,num_men_mentioned,num_others_mentioned,image_alt_text,image_captions,...,OLD_article_text_w_main_img_caption,needs_recode,url_redirect,PAGE_TYPE,source,in_og,article_id.1,recode,rater_recode_intials,new_article_text.1
470,yale_267,A tool for identifying phases of matter,https://news.yale.edu/2019/06/19/tool-identify...,"June 19, 2019",1,0.0,6.0,0,Torus topology diagram,(© stock.adobe.com),...,,,https://news.yale.edu/2019/06/19/tool-identify...,div story clearfix,old,,,,,
822,columbia_179,Combating Climate Change With Artificial Intel...,https://science.fas.columbia.edu/news/combatin...,"June 5, 2019",2,2.0,0.0,0,,"Maria Uriarte, left, and Tian Zheng bring ecol...",...,,,,,,,,,,Thinking about what could lie ahead when clima...
561,harvard_150,"Physics, real and fictional",https://news.harvard.edu/gazette/story/2019/11...,"November 14, 2019",1,1.0,3.0,0,Levitating frog.,Photo illustration by Judy Blomquist/Harvard S...,...,,,,,,,harvard_150,0.0,DP,
872,columbia_221,It’s Raining on the Greenland Ice. In the Winter.,https://science.fas.columbia.edu/news/its-rain...,"March 7, 2019",2,1.0,2.0,0,,Increasing rainfall over the Greenland ice she...,...,,,,,,,,,,Rainy weather is becoming increasingly common ...
570,harvard_159,A clue to biodiversity?,https://news.harvard.edu/gazette/story/2019/10...,"October 31, 2019",1,0.0,3.0,0,Heliconius xanthocles butterfly illustration w...,An analysis of 20 butterfly genomes found evid...,...,,,,,,,harvard_159,0.0,DP,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253,princeton_178,Data science tool that reveals molecular cause...,https://www.princeton.edu/news/2019/02/28/data...,"Feb. 28, 2019 1:19 p.m.",2,2.0,1.0,0,Visualization of gene activity,Princeton researchers created a data science t...,...,Princeton University researchers are gaining n...,False,,,,,,,,
78,mit_772,IDEAS challenge showcases social ventures at MIT,https://news.mit.edu/2019/ideas-challenge-soci...,"April 30, 2019",1,3.0,1.0,0,"Kate Trimble (left), associate dean and senior...","Kate Trimble (left), associate dean ...",...,,,,,,,,,,
44,mit_1298,Ruth Lehmann elected as director of Whitehead ...,https://news.mit.edu/2019/ruth-lehmann-elected...,"September 19, 2019",2,2.0,5.0,0,"Ruth Lehmann, a world-renowned developmental a...","Ruth Lehmann, a world-renowned devel...",...,,,,,,,,,,
517,yale_346,New studies confirm existence of galaxies with...,https://news.yale.edu/2019/03/29/new-studies-c...,"March 29, 2019",1,1.0,1.0,0,A photo of the DF2 galaxy,"A photo of the DF2 galaxy (Image credit: NASA,...",...,,,https://news.yale.edu/2019/03/29/new-studies-c...,div story clearfix,old,,,,,


In [89]:
# Number of remaining rows per university.
tmp['uni'].value_counts()

harvard      100
princeton    100
yale          99
columbia      99
mit           98
Name: uni, dtype: int64

In [80]:
# Number of remaining rows per first-researcher gender label.
tmp['gender_first_researcher_text'].value_counts()

Man      341
Woman    155
Name: gender_first_researcher_text, dtype: int64

In [81]:
# Gender-by-university counts with totals, shown with one row per university.
pd.crosstab(
    tmp['gender_first_researcher_text'],
    tmp['uni'],
    margins = True,
).transpose()

gender_first_researcher_text,Man,Woman,All
uni,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
columbia,71,28,99
harvard,71,29,100
mit,72,26,98
princeton,68,32,100
yale,59,40,99
All,341,155,496


In [82]:
# Inspect the column names currently present in `tmp`.
tmp.columns

Index(['article_id', 'article_title', 'article_link', 'article_date',
       'gender_first_researcher', 'num_women_mentioned', 'num_men_mentioned',
       'num_others_mentioned', 'image_alt_text', 'image_captions',
       'photo_filename', 'first_researcher_pictured',
       'num_male_researchers_pictured', 'num_female_researchers_pictured',
       'num_other_researchers_pictured', 'num_women_pictured',
       'num_men_pictured', 'num_other_people_pictured', 'rater_initials',
       'comments', 'image_location', 'image_links', 'article_date_scrape',
       'article_title_scrape', 'article_text', 'num_pics',
       'gender_first_researcher_text', 'num_first_res_pics',
       'num_female_res_pics', 'num_male_res_pics', 'total_people_mentioned',
       'uni', 'OLD_article_text_w_main_img_caption', 'needs_recode',
       'url_redirect', 'PAGE_TYPE', 'source', 'in_og', 'article_id.1',
       'recode', 'rater_recode_intials', 'new_article_text.1'],
      dtype='object')

In [83]:
# Fixed identifying columns first, followed by every column of `tmp` whose name
# contains '_mentioned', '_pictured' or '_pics' (in `tmp`'s column order).
keep_cols = [
    'article_id',
    'article_title',
    'article_link',
    'article_date',
    'gender_first_researcher',
    'gender_first_researcher_text',
    'uni',
    'article_text',
] + [
    c for c in tmp.columns.to_list()
    if any(tag in c for tag in ('_mentioned', '_pictured', '_pics'))
]

In [84]:
# Display the list of column names selected above.
keep_cols

['article_id',
 'article_title',
 'article_link',
 'article_date',
 'gender_first_researcher',
 'gender_first_researcher_text',
 'uni',
 'article_text',
 'num_women_mentioned',
 'num_men_mentioned',
 'num_others_mentioned',
 'first_researcher_pictured',
 'num_male_researchers_pictured',
 'num_female_researchers_pictured',
 'num_other_researchers_pictured',
 'num_women_pictured',
 'num_men_pictured',
 'num_other_people_pictured',
 'num_pics',
 'num_first_res_pics',
 'num_female_res_pics',
 'num_male_res_pics',
 'total_people_mentioned']

In [87]:
# Keep only the selected columns and write them to the cleaned-data CSV
# (no index column; UTF-8 with a byte-order mark).
tmp_condensed = tmp.loc[:, keep_cols]
tmp_condensed.to_csv(
    f"{data_io.DATA}article_data_cleaned.csv",
    index = False,
    encoding = 'utf-8-sig',
)