In [178]:
import pandas as pd
import numpy as nps
import json
import ast
import pickle
import math

import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredText

import geopandas
import plotly.express as px
from cartopy.io import shapereader

In [83]:
data = pd.read_pickle('data_full_gender.pkl')

In [84]:
# data = pd.read_csv('../ete3/Plt_sci_publications_geo_species_8.26.csv', low_memory=False)

In [85]:
data = data.fillna('none')

In [86]:
# make all gender entries into lists
data['inferred_gender'] = [[a] if isinstance(a, str) else a for a in data['inferred_gender']]

In [87]:
data['Corresponding author geocoords'] = [i.split('; ') for i in data['Corresponding author geocoords']]

In [88]:
# many of these are one author with multiple affiliations

mult_corresp = []

for i in range(len(data['Corresponding author Addresses'])):
    if ';' in data.iloc[i]['Corresponding author Addresses']:
        mult_corresp.append(i)

# One paper = 1 proportion

In [59]:
data['Corresponding author geocoords'] = [i.split('; ') for i in data['Corresponding author geocoords']]

AttributeError: 'list' object has no attribute 'split'

In [None]:
data['map_props'] = [1 for i in range(len(data))]

In [None]:
out = data.explode('Corresponding author geocoords')
out['map_props'] /= out['map_props'].groupby(level=0).transform('count')

In [None]:
data

# One author = 1 proportion

### Case 1: One author, one location

In [89]:
data1 = data[data['Corresponding author geocoords'].apply(lambda x: len(x) == 1)]
one_one = data1[data1['inferred_gender'].apply(lambda x: len(x) == 1)]

In [160]:
# exploding just to get geocoords and gender out of list
one_one_ex = one_one.explode(['Corresponding author geocoords', 'inferred_gender'])
one_one['map_props'] = [1 for i in range(len(one_one))]

### Case 2: One author, multiple locations

In [91]:
data1 = data[data['Corresponding author geocoords'].apply(lambda x: len(x) != 1)]
one_many = data1[data1['inferred_gender'].apply(lambda x: len(x) == 1)]

In [92]:
one_many['map_props'] = [1 for i in range(len(one_many))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_many['map_props'] = [1 for i in range(len(one_many))]


In [93]:
one_many_ex = one_many.explode(['Corresponding author geocoords'])
one_many_ex['map_props'] /= one_many_ex['map_props'].groupby(level=0).transform('count')

In [189]:
# explode to get gender out of brackets
one_many_ex = one_many_ex.explode(['inferred_gender'])

### Case 3: Multiple authors, one location

In [94]:
data1 = data[data['Corresponding author geocoords'].apply(lambda x: len(x) == 1)]
many_one = data1[data1['inferred_gender'].apply(lambda x: len(x) != 1)]

In [95]:
many_one_ex = many_one.explode(['inferred_gender', 'first and last names'])

In [130]:
# explode again to get geocoords out of list
many_one_ex = many_one_ex.explode(['Corresponding author geocoords'])

In [131]:
many_one_ex['map_props'] = [1 for i in range(len(many_one_ex))]

### Case 4: Multiple authors, multiple locations

In [97]:
data1 = data[data['Corresponding author geocoords'].apply(lambda x: len(x) != 1)]
many_many = data1[data1['inferred_gender'].apply(lambda x: len(x) != 1)]

In [98]:
# if the # of corresp author addresses = # of geocoords but there is a repeated author in one of the addresses,
# just drop it for now

In [104]:
# these are cases with multiple corresp authors and one cannot be determined. Just drop
to_drop = []
for i in many_many.index:
    if len(many_many.loc[i]['inferred_gender']) !=  len(many_many.loc[i]['Corresponding author geocoords']):
        to_drop.append(i)

In [107]:
many_many = many_many[~many_many.index.isin(to_drop)]

In [108]:
many_many_ex = many_many.explode(['inferred_gender','Corresponding author geocoords'])

In [115]:
many_many_ex['map_props'] = [1 for i in range(len(many_many_ex))]

# Mapping

In [190]:
out = pd.concat([one_one_ex, one_many_ex, many_one_ex, many_many_ex])

In [191]:
out['old_index'] = out.index

In [192]:
out = out.reset_index(drop=True)

In [193]:
# do this after exploding since exploding needs to be on strings
out['Corresponding author geocoords'] = [ast.literal_eval(x) for x in out['Corresponding author geocoords']]

In [194]:
out = out.join(out['Corresponding author geocoords'].apply(pd.Series)).rename(columns={0:'Latitude', 1:'Longitude'})

In [195]:
# these are polygons representing states and provinces
shpfilename = shapereader.natural_earth('10m', 'cultural', 'admin_1_states_provinces')
# shpfilename2 = shapereader.natural_earth('10m', 'cultural', 'admin_1_states_provinces_lines')

In [196]:
df2 = geopandas.read_file(shpfilename, encoding='utf-8')

In [197]:
gdf = geopandas.GeoDataFrame(
    out, geometry=geopandas.points_from_xy(out.Longitude, out.Latitude))

In [198]:
# about 2.5k missing locations
gdf_out = geopandas.sjoin(gdf, df2[['diss_me', 'geometry']], how='left', op='within')

  if await self.run_code(code, result, async_=asy):


In [199]:
missing_locs = list(gdf_out[gdf_out['diss_me'].isnull()].index)

In [200]:
# takes like 3 min
fixed_locs = [df2[['geometry']].distance(gdf_out.loc[i]['geometry']).sort_values().index[0] for i in missing_locs]

In [201]:
for i in range(len(missing_locs)):
    gdf_out.loc[missing_locs[i], 'diss_me'] = df2.loc[fixed_locs[i]]['diss_me']
    gdf_out.loc[missing_locs[i],'index_right'] = fixed_locs[i]

In [202]:
gdf_out.astype({'diss_me': 'int64'}).dtypes

Author Full Names         object
Article Title             object
Journal                   object
impact factor (2020)     float64
Author Keywords           object
                          ...   
Latitude                 float64
Longitude                float64
geometry                geometry
index_right              float64
diss_me                    int64
Length: 67, dtype: object

In [203]:
gender_props = gdf_out[['map_props','diss_me','inferred_gender']]

In [204]:
gender_props = gender_props.dropna(axis=0, subset='inferred_gender')

In [205]:
# gender_props = gender_props[~gender_props['inferred_gender'].isin(['unknown'])]

In [206]:
gender_props = gender_props.replace({'male': 0, 'female': 1, 'none': math.nan, 'unknown': math.nan})

In [207]:
gdf_out['inferred_gender']

0           male
1         female
2         female
3         female
4         female
           ...  
320638      male
320639      male
320640      male
320641      male
320642       NaN
Name: inferred_gender, Length: 320643, dtype: object

In [208]:
gender_props.astype({'inferred_gender': 'float64'}).dtypes

map_props          float64
diss_me            float64
inferred_gender    float64
dtype: object

In [209]:
gender_props['inferred_gender_props'] = gender_props['map_props']*gender_props['inferred_gender']

In [210]:
final_numbers = gender_props.groupby(by=['diss_me']).sum()

In [211]:
final_numbers = final_numbers.reset_index()

In [212]:
final_numbers['final_gender_prop'] = final_numbers['inferred_gender_props']/final_numbers['map_props']

In [213]:
# Opening JSON file
f = open('admin1.geojson')
  
# returns JSON object as 
# a dictionary
geo_data = json.load(f)
  
# Closing file
f.close()

In [1]:
fig = px.choropleth(final_numbers, geojson=geo_data, color="final_gender_prop",
                    locations="diss_me", featureidkey="properties.id",
                    projection="robinson"
                   ).update_layout(
    title_text="L'état de la sous-nutrition dans le monde en 2017",
    geo=dict(
        # landcolor="lightgray",
        # showland=True,
        showcountries=True,
        countrycolor="gray",
        countrywidth=0.5,
        showframe=False,
        showcoastlines=False,
        # projection_type="equirectangular",
    ))
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

fig.update_traces(marker_line_width=0)

fig.show()
fig.write_html("map.html")

NameError: name 'px' is not defined