In [77]:
import pandas as pd
import nltk
import networkx as nx

In [78]:
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/lucasgautheron/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lucasgautheron/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/lucasgautheron/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [79]:
words = nltk.tokenize.word_tokenize("This is a sentence. Here's another one.")

In [80]:
lemmatizer = nltk.stem.WordNetLemmatizer()
words = [
    lemmatizer.lemmatize(word) for word in words
]

In [81]:
nltk.tag.pos_tag(words)

[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('sentence', 'NN'),
 ('.', '.'),
 ('Here', 'RB'),
 ("'s", 'VBZ'),
 ('another', 'DT'),
 ('one', 'NN'),
 ('.', '.')]

In [82]:
nouns = [
    word
    for word, tag in nltk.tag.pos_tag(words)
    if tag == "NN"
]
nouns

['sentence', 'one']

In [83]:
def text_to_nouns(text):
    text = text.lower()
    words = nltk.tokenize.word_tokenize(text)
    tags = nltk.tag.pos_tag(words)
    nouns = [
        word
        for word, tag in tags
        if tag == "NN"
    ]
    return nouns

In [84]:
df = pd.read_parquet("science/climate/articles.parquet")
df["title"].fillna(value=" ", inplace=True)
df["nouns"] = df["title"].map(text_to_nouns)
df["nouns"]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["title"].fillna(value=" ", inplace=True)


article_id
4393546166                               [al, uncertainty, job]
4393714386                               [al, uncertainty, job]
4220805390                                             [europe]
4220871093    [carbon, footprint, assessment, decarbonisatio...
4224019370                        [future, cryosphere, warming]
                                    ...                        
4386855500    [accounting, time, greenhouse, gas, investment...
4389633312       [sukuk, arabia, sustainability, light, vision]
4389862110                          [esg, management, thailand]
4389881210             [credibility, bond, market, case, china]
4390419499    [development, finance, eaeu, assessment, liqui...
Name: nouns, Length: 231101, dtype: object

In [85]:
document_frequency = dict()

for nouns in df["nouns"].tolist():
    for noun in set(nouns):
        document_frequency[noun] = document_frequency.get(noun, 0)+1

In [86]:
frequent_nouns = [
    noun for noun in document_frequency.keys() if document_frequency[noun] >= 100
]

In [143]:
frequent_nouns[:10]

['uncertainty',
 'al',
 'europe',
 'development',
 'footprint',
 'assessment',
 'strategy',
 'carbon',
 'future',
 'warming']

In [121]:
G = nx.Graph()
for nouns in df["nouns"]:
    nouns = [noun for noun in nouns if noun in frequent_nouns]
    nouns = list(set(nouns))
    for i, a in enumerate(nouns):
        for j, b in enumerate(nouns):
            if i<j:
                if G.has_edge(a, b):
                    G[a][b]["coocc"] += 1
                else:
                    G.add_edge(a, b, coocc=1)

In [111]:
nx.write_gexf(G, "output/coocc.gexf")

In [122]:
import numpy as np

words = list(G.nodes)
N = len(df)
occ = {
    word: document_frequency[word]/N for word in words
}

weight = dict()
for a, b, attrs in G.edges(data=True):
    weight[(a,b)] = np.log(occ[a]*occ[b])/np.log(attrs["coocc"]/N) - 1

nx.set_edge_attributes(G, weight, "weight")
G.remove_edges_from([(a,b) for a, b, attrs in G.edges(data=True) if attrs["weight"]<0 or attrs["coocc"]<15])

In [123]:
nx.write_gexf(G, "output/coocc.gexf")

In [125]:
authors = pd.read_parquet("science/climate/authors.parquet")
articles_authors = pd.read_parquet("science/climate/articles_authors.parquet")

In [127]:
articles_authors = articles_authors.merge(authors, left_on="author_id", right_index=True)

In [145]:
article_gender = articles_authors.groupby("article_id").agg(
    male = ("gender", lambda g: np.sum(g=="m")),
    female = ("gender", lambda g: np.sum(g=="f"))
)

In [146]:
df = df.merge(article_gender, how="inner", left_index=True, right_index=True)

In [148]:
def term_frequency(df):
    document_frequency = dict()

    for nouns in df["nouns"].tolist():
        for noun in set(nouns):
            document_frequency[noun] = document_frequency.get(noun, 0)+1

    for noun in document_frequency.keys():
        document_frequency[noun] /= len(df)

    return document_frequency

male_freq = term_frequency(df[df["male"]>df["female"]])
female_freq = term_frequency(df[df["male"]<df["female"]])

In [168]:
freq = []
for noun in frequent_nouns:
    freq.append({
        "noun": noun,
        "male": male_freq.get(noun, 0),
        "female": female_freq.get(noun, 0)
    })

In [169]:
freq = pd.DataFrame(freq)
freq

Unnamed: 0,noun,male,female
0,uncertainty,0.010183,0.005961
1,al,0.003045,0.002054
2,europe,0.009708,0.006804
3,development,0.016462,0.022394
4,footprint,0.002153,0.002219
...,...,...,...
1261,biogeography,0.001048,0.000605
1262,iucn,0.000254,0.000183
1263,birdlife,0.000000,0.000000
1264,v0.1,0.001162,0.000825


In [170]:
freq["ratio"] = freq["female"]/freq["male"]

In [171]:
# male dominated

freq.sort_values("ratio", ascending=True).head(10)

Unnamed: 0,noun,male,female,ratio
1253,ssc,8e-06,0.0,0.0
1224,lidar,0.001293,0.000238,0.184348
1122,cop27,0.000753,0.000147,0.194829
1180,so2,0.000696,0.000147,0.210874
890,stata,0.001965,0.000422,0.214718
1005,turbulence,0.000737,0.000165,0.224054
1236,beech,0.0009,0.00022,0.244422
1036,hadley,0.000671,0.000165,0.245913
759,clouds,0.000884,0.00022,0.248948
558,arabia,0.000737,0.000202,0.273843


In [173]:
# female dominated

freq.sort_values("ratio", ascending=False).head(10)

Unnamed: 0,noun,male,female,ratio
1132,nursing,0.000229,0.002036,8.882126
500,student,0.000172,0.0011,6.401532
1138,pm2.5,0.000196,0.001082,5.507985
436,gender,0.001097,0.005704,5.200051
1173,anxiety,0.000409,0.001834,4.481073
1164,youth,0.000393,0.001742,4.434395
1158,pregnancy,0.000262,0.001155,4.411056
393,scoping,0.000458,0.001981,4.321034
822,yangtze,0.000761,0.003045,3.999237
75,school,0.000434,0.001724,3.973781
