# Code for exploring Russian gender roles in the Russian National Corpus

In [8]:
import re #importing for cleaning corpus
import os #importing for working with directories
import nltk #importing for corpus manipulations
import string #importing for string manipulations in the corpus
from bs4 import BeautifulSoup #importing for dealing with the HTML format of the corpus files
from nltk.tokenize import sent_tokenize #importing for tokenizing sentences later on
from nltk.corpus import stopwords #importing for later cleaning stopwords for more efficiency in word2vec training
from pymystem3 import Mystem #a package for Russian lemmatisation

# Characters stripped from both ends of a token during cleaning: ASCII
# punctuation plus the Russian guillemets and the space character.
punct = "".join((string.punctuation, '«» '))

# Part 1: Training whole corpus

In [314]:
# The Russian National Corpus sample cannot be redistributed: ask the corpus
# maintainers for permission to download it, then replace the placeholder
# below with the local folder that holds one sub-folder of .xhtml files per
# text category.
directory = "path/to/RNC/TEXTS"  # placeholder path -- must be edited before running

rnc = []  # one text string per corpus file
for folder in os.listdir(directory):
    folder_path = os.path.join(directory, folder)
    # Only descend into real folders; this skips macOS's ".DS_Store" and any
    # other stray file that would make os.listdir() raise.
    if not os.path.isdir(folder_path):
        continue
    for filename in os.listdir(folder_path):
        if not filename.endswith('.xhtml'):
            continue
        with open(os.path.join(folder_path, filename), encoding='windows-1251', mode='r') as f:
            content = f.read()
        # NOTE(review): no parser is named, so BeautifulSoup picks one itself --
        # confirm the choice is the same on every machine the analysis runs on.
        data = BeautifulSoup(content).get_text()
        # Drop backticks and turn line breaks into spaces.
        sent_text = data.replace("`", "").replace("\n", " ")
        rnc.append(sent_text)

rnc_sent = " ".join(rnc)  # the whole corpus as a single string

In [5]:
# Length of the joined corpus string, i.e. a character count (not a word count).
len(rnc_sent)

7706139

In [316]:
#inspired by: https://www.kaggle.com/alxmamaev/how-to-easy-preprocess-russian-text 

# Module-level helpers read by clean_lemm_text:
#   stem              - a single Mystem lemmatiser instance, created once and reused
#   russian_stopwords - NLTK's Russian stop-word list (presumably needs the NLTK
#                       "stopwords" data to be downloaded beforehand)
stem = Mystem() 
russian_stopwords = stopwords.words("russian")

def clean_lemm_text(corpus):
    """Split *corpus* into sentences, lemmatise them and filter the tokens.

    The text is sentence-tokenised with NLTK, lower-cased, and each sentence
    is lemmatised with the module-level Mystem instance ``stem``. A lemma is
    kept only if, after the characters in ``punct`` are stripped from both of
    its ends, it

    * starts with a character in the range ``а-я``,
    * is longer than two characters,
    * is not in ``russian_stopwords``, and
    * does not contain ``-то`` (words like "кто-то", which the stop-word
      list does not catch).

    Args:
        corpus: The raw text as one string.

    Returns:
        A list with one entry per sentence, in order; each entry is the list
        of kept, stripped lemmas and may be empty.
    """
    # Set membership is O(1); the stop-word list would be scanned per token.
    stop_set = set(russian_stopwords)

    clean_text = []
    for sentence in sent_tokenize(corpus):
        kept = []
        for lemma in stem.lemmatize(sentence.lower()):
            token = lemma.strip(punct)  # strip once and reuse the result
            # A stripped token can never equal " " because the space is part
            # of punct, so no separate blank check is needed.
            if (
                len(token) > 2
                and re.match(r"[а-я]", token)
                and token not in stop_set
                and not re.match(r".*(-то)", token)
            ):
                kept.append(token)
        clean_text.append(kept)

    return clean_text

In [317]:
# Clean and lemmatise the whole corpus: a list of sentences, each a list of lemmas.
ready_rnc = clean_lemm_text(rnc_sent)

In [12]:
len(ready_rnc) #number of sentences (one inner list per sentence, including sentences left empty by the filtering)

96037

In [14]:
# Total number of lemmas kept across all cleaned sentences.
i = sum(len(sentence) for sentence in ready_rnc)

i

652987

# Training the model and obtaining similarity scores

In [34]:
import gensim #for training
from gensim.models import Word2Vec #for the model
from gensim.models import KeyedVectors #for later comparing similarities

In [52]:
# Train word2vec on the cleaned sentences (200-dimensional vectors, context
# window of 30 tokens); every other hyper-parameter is left at gensim's default.
# NOTE(review): no seed is fixed here, so similarity scores will presumably
# differ between runs -- confirm before comparing numbers across sessions.
model = Word2Vec(sentences=ready_rnc, vector_size=200, window=30) #training model
model.save("word2vec.model") #saving the model (relative path, i.e. the current working directory)
model = Word2Vec.load("word2vec.model") #loading the model back from disk; replaces the in-memory object

In [231]:
# Attribute words for the two gender groups (8 each), read as globals by the
# similarity functions in this file.
# English glosses: boy, man, son, father, grandfather, uncle, brother, dad
male_attributes = ["мальчик", "мужчина", "сын", "отец", "дедушка", "дядя", "брат", "папа"]
# English glosses: woman, girl, sister, daughter, mother, mom, aunt, grandmother
# NOTE(review): the two lists are not ordered as counterparts (index 0 is
# "boy" vs. "woman"), which matters wherever scores are paired by position.
female_attributes = ["женщина", "девочка", "сестра", "дочь", "мать", "мама", "тетя", "бабушка"]

In [88]:
# Target words for the two domains (8 each).
# English glosses: work, professional, management, strength, salary, office, business, career
career_targets = ["работа", "профессионал", "управление", "сила", "зарплата", "офис", "бизнес", "карьера"]
# English glosses: home, parent, child, family, wedding, marriage, relative, to tidy up
family_targets = ["дом", "родитель", "ребенок", "семья", "свадьба", "брак", "родственник", "убирать"]

In [240]:
def male_female_att_target(targets, mod, male_attrs=None, female_attrs=None):
    """Describe attribute/target similarities as human-readable strings.

    For every target word, the similarity between that word and each male
    and each female attribute word is looked up with ``mod.wv.similarity``
    and formatted as ``"<attribute> <target>: <score>"``.

    A target word contributes to both result lists or to neither: if any
    lookup for it raises ``KeyError`` (a word is missing from the model's
    vocabulary), the error is printed and the whole word is skipped, so the
    two lists never get out of step with each other.

    Args:
        targets: Iterable of target words.
        mod: A model exposing ``mod.wv.similarity(word_a, word_b)``.
        male_attrs: Male attribute words; defaults to the module-level
            ``male_attributes``.
        female_attrs: Female attribute words; defaults to the module-level
            ``female_attributes``.

    Returns:
        A tuple ``(male_strings, female_strings)`` of two lists, ordered by
        target word and then by attribute word.
    """
    if male_attrs is None:
        male_attrs = male_attributes
    if female_attrs is None:
        female_attrs = female_attributes

    sim_word_male = []
    sim_word_female = []
    for word in targets:
        try:
            # Build both per-word lists first; commit them only if every
            # lookup for this word succeeded.
            male_rows = [
                attribute + " " + word + ": " + str(mod.wv.similarity(attribute, word))
                for attribute in male_attrs
            ]
            female_rows = [
                attribute_f + " " + word + ": " + str(mod.wv.similarity(attribute_f, word))
                for attribute_f in female_attrs
            ]
        except KeyError as e:
            print(e)
            continue
        sim_word_male.extend(male_rows)
        sim_word_female.extend(female_rows)

    return sim_word_male, sim_word_female
    
  

In [241]:
# Show the labelled similarity strings (male list, female list) for the career targets.
male_female_att_target(career_targets, model)

(['мальчик работа: 0.34076083',
  'мужчина работа: 0.25905728',
  'сын работа: 0.021256575',
  'отец работа: 0.09984532',
  'дедушка работа: 0.16657081',
  'дядя работа: 0.39386773',
  'брат работа: 0.3252747',
  'папа работа: 0.26648343',
  'мальчик профессионал: 0.91836226',
  'мужчина профессионал: 0.8598057',
  'сын профессионал: 0.7220744',
  'отец профессионал: 0.777825',
  'дедушка профессионал: 0.8476547',
  'дядя профессионал: 0.9456586',
  'брат профессионал: 0.91368634',
  'папа профессионал: 0.8936032',
  'мальчик управление: 0.4683205',
  'мужчина управление: 0.3621707',
  'сын управление: 0.13216874',
  'отец управление: 0.22359382',
  'дедушка управление: 0.30581433',
  'дядя управление: 0.52606297',
  'брат управление: 0.44368625',
  'папа управление: 0.3964939',
  'мальчик сила: 0.6899024',
  'мужчина сила: 0.6270487',
  'сын сила: 0.42129895',
  'отец сила: 0.4939908',
  'дедушка сила: 0.58378524',
  'дядя сила: 0.74126756',
  'брат сила: 0.70445913',
  'папа сила: 0.

In [243]:
def male_female_targets(targets, mod, male_attrs=None, female_attrs=None):
    """Collect similarity scores between gender attributes and target words.

    For every target word, the similarity between that word and each male
    and each female attribute word is looked up with ``mod.wv.similarity``.

    A target word contributes to both result lists or to neither: if any
    lookup for it raises ``KeyError`` (a word is missing from the model's
    vocabulary), the error is printed and the whole word is skipped. This
    keeps the two lists in step, which matters when they are later paired
    by position.

    Args:
        targets: Iterable of target words.
        mod: A model exposing ``mod.wv.similarity(word_a, word_b)``.
        male_attrs: Male attribute words; defaults to the module-level
            ``male_attributes``.
        female_attrs: Female attribute words; defaults to the module-level
            ``female_attributes``.

    Returns:
        A tuple ``(male_scores, female_scores)`` of two lists, ordered by
        target word and then by attribute word.
    """
    if male_attrs is None:
        male_attrs = male_attributes
    if female_attrs is None:
        female_attrs = female_attributes

    sim_word_male = []
    sim_word_female = []
    for word in targets:
        try:
            # Build both per-word lists first; commit them only if every
            # lookup for this word succeeded.
            male_scores = [mod.wv.similarity(attribute, word) for attribute in male_attrs]
            female_scores = [mod.wv.similarity(attribute_f, word) for attribute_f in female_attrs]
        except KeyError as e:
            print(e)
            continue
        sim_word_male.extend(male_scores)
        sim_word_female.extend(female_scores)

    return sim_word_male, sim_word_female

In [244]:
# Similarity scores from the whole-corpus model.
# male_female_targets returns a (male_scores, female_scores) pair.
career_sims = male_female_targets(career_targets, model)
fam_sims = male_female_targets(family_targets, model)

male_career, female_career = career_sims
male_fam, female_fam = fam_sims

## t-test career

In [295]:
from statistics import mean

In [276]:
from scipy.stats import ttest_ind
# Independent two-sample t-test (scipy defaults) of male vs. female career similarities.
t, p = ttest_ind(male_career, female_career)

# Prints: male mean, female mean, halved p-value, whether that is below 0.05, t statistic.
# NOTE(review): p/2 is a one-sided p-value only when t has the sign the
# hypothesis predicts (presumably t > 0 here, i.e. male > female) -- confirm.
print(mean(male_career), mean(female_career), p/2, p/2 < 0.05, t)


0.5886711 0.54245496 0.18881456238306776 False 0.8853970143673152


## t-test family

In [280]:
# Independent two-sample t-test (scipy defaults) of female vs. male family similarities.
t, p = ttest_ind(female_fam, male_fam)

# Prints: male mean, female mean (note: the opposite order to the test's
# arguments), halved p-value, whether that is below 0.05, t statistic.
# NOTE(review): p/2 is a one-sided p-value only when t has the sign the
# hypothesis predicts; a negative t here means the female mean is the lower one -- confirm.
print(mean(male_fam), mean(female_fam), p/2, p/2 < 0.05, t)



0.8721906 0.86063564 0.2381967060447348 False -0.7142504755770632


In [281]:
# Same family test with the two samples swapped: only the sign of t changes,
# the p-value is the same.
t, p = ttest_ind(male_fam, female_fam)

# Prints: male mean, female mean, halved p-value, whether that is below 0.05, t statistic.
# NOTE(review): p/2 is a one-sided p-value only when t has the predicted sign -- confirm.
print(mean(male_fam), mean(female_fam), p/2, p/2 < 0.05, t)




0.8721906 0.86063564 0.2381967060447348 False 0.7142504755770632


# Part 2: Training subsets of corpus

## Getting texts from the folders

In [318]:
def text_from_folder(directory):
    """Return the text of all ``.xhtml`` files directly inside *directory*.

    Each matching file is read as windows-1251, its markup is removed with
    BeautifulSoup, backticks are dropped and line breaks become spaces. The
    per-file texts are joined with single spaces. Sub-folders are not
    searched; a folder without ``.xhtml`` files yields an empty string.
    """
    pieces = []
    for entry in os.listdir(directory):
        if not entry.endswith('.xhtml'):
            continue
        path = os.path.join(directory, entry)
        with open(path, encoding='windows-1251', mode='r') as handle:
            markup = handle.read()
        plain = BeautifulSoup(markup).get_text()
        # Same result as splitting on "\n", cleaning each piece, joining with " ".
        pieces.append(plain.replace("`", "").replace("\n", " "))

    return " ".join(pieces)

In [319]:
# "Public" sub-corpus: every .xhtml file in the TEXTS/public folder, then
# cleaned and lemmatised.
# NOTE: the absolute path is specific to the author's machine.
public_sent = text_from_folder('/Users/sashakenjeeva/Desktop/Masters/Spring 2021/Machine Learning for NLP/Report/sample_ar/TEXTS/public')
public_clean = clean_lemm_text(public_sent)

In [297]:
# Sentence counts of the two cleaned sub-corpora.
# NOTE(review): private_clean is assigned in a later cell of this file, so this
# line only works once that cell has been run.
print(len(public_clean), len(private_clean))

20346 26614


In [320]:
# "Private" sub-corpus: blog posts plus fiction, then cleaned and lemmatised.
# The two folder texts are joined with a space so that the last word of the
# blogs text is not fused with the first word of the fiction text.
# NOTE: the absolute paths are specific to the author's machine.
private_sent = " ".join([
    text_from_folder('/Users/sashakenjeeva/Desktop/Masters/Spring 2021/Machine Learning for NLP/Report/sample_ar/TEXTS/blogs_2013'),
    text_from_folder('/Users/sashakenjeeva/Desktop/Masters/Spring 2021/Machine Learning for NLP/Report/sample_ar/TEXTS/fiction'),
])
private_clean = clean_lemm_text(private_sent)

## Training public + private

In [203]:
# word2vec model for the public sub-corpus, with the same vector size and
# window as the whole-corpus model.
# NOTE(review): no seed is fixed, so scores presumably vary between runs -- confirm.
model_pub = Word2Vec(sentences=public_clean, vector_size=200, window=30) #training model
model_pub.save("word2vec_pub.model") #saving the model (relative path, i.e. the current working directory)
model_pub = Word2Vec.load("word2vec_pub.model") #loading the model back from disk; replaces the in-memory object

In [132]:
# word2vec model for the private sub-corpus, with the same vector size and
# window as the whole-corpus model.
# NOTE(review): no seed is fixed, so scores presumably vary between runs -- confirm.
model_priv = Word2Vec(sentences=private_clean, vector_size=200, window=30) #training model
model_priv.save("word2vec_priv.model") #saving the model (relative path, i.e. the current working directory)
model_priv = Word2Vec.load("word2vec_priv.model") #loading the model back from disk; replaces the in-memory object

In [249]:
# Similarity scores from the model trained on the public sub-corpus.
# male_female_targets returns a (male_scores, female_scores) pair.
career_sims_pub = male_female_targets(career_targets, model_pub)
fam_sims_pub = male_female_targets(family_targets, model_pub)

male_career_pub, female_career_pub = career_sims_pub
male_fam_pub, female_fam_pub = fam_sims_pub

In [250]:
# Similarity scores from the model trained on the private sub-corpus.
# male_female_targets returns a (male_scores, female_scores) pair and prints
# the KeyError for any word the model does not know.
career_sims_priv = male_female_targets(career_targets, model_priv)
fam_sims_priv = male_female_targets(family_targets, model_priv)

male_career_priv, female_career_priv = career_sims_priv
male_fam_priv, female_fam_priv = fam_sims_priv

"Key 'офис' not present"


In [251]:
# Private sub-corpus: independent two-sample t-test of male vs. female career similarities.
t, p = ttest_ind(male_career_priv, female_career_priv)

# Prints: male mean, female mean, halved p-value, whether that is below 0.05.
# NOTE(review): p/2 is a one-sided p-value only when t has the predicted sign -- confirm.
print(mean(male_career_priv), mean(female_career_priv),p/2, p/2 < 0.05)

0.9990602 0.99900645 0.3860187429034055 False


In [252]:
# Private sub-corpus: independent two-sample t-test of male vs. female family similarities.
t, p = ttest_ind(male_fam_priv, female_fam_priv)

# Prints: male mean, female mean, halved p-value, whether that is below 0.05.
# NOTE(review): p/2 is a one-sided p-value only when t has the predicted sign -- confirm.
print(mean(male_fam_priv), mean(female_fam_priv), p/2, p/2 < 0.05)

0.99945027 0.99939835 0.3190647695902723 False


In [253]:
# Public sub-corpus: independent two-sample t-test of male vs. female career similarities.
t, p = ttest_ind(male_career_pub, female_career_pub)

# Prints: male mean, female mean, halved p-value, whether that is below 0.05.
# NOTE(review): t is not printed here; the recorded output shows the female
# mean above the male mean, so a small p/2 supports female > male rather than
# male > female -- confirm the intended direction.
print(mean(male_career_pub), mean(female_career_pub), p/2, p/2 < 0.05)

0.9987115 0.9991314 0.0031950275981921746 True


## Calculating differences for career and family, private and public

In [254]:
# Male-minus-female similarity gaps for career targets, public model.
# zip pairs the two score lists by position and stops at the shorter one.
career_diff_pub = [male - female for male, female in zip(male_career_pub, female_career_pub)]
career_diff_pub

[-8.499622e-05,
 0.00028955936,
 -0.0001861453,
 -9.3102455e-05,
 -0.0024679303,
 -0.0006877184,
 0.00028306246,
 -0.0004760027,
 -0.0001899004,
 0.00033426285,
 -8.159876e-05,
 -5.6028366e-05,
 -0.0028017163,
 -0.00048154593,
 0.0001450777,
 -0.00071656704,
 -5.1140785e-05,
 0.00026386976,
 -0.00030750036,
 -8.940697e-05,
 -0.0022284985,
 -0.0007647276,
 0.00022816658,
 -0.00054204464,
 -0.00010174513,
 0.00031113625,
 -0.00014066696,
 -1.6450882e-05,
 -0.0023688078,
 -0.00065916777,
 0.00027126074,
 -0.00038301945,
 -9.8347664e-05,
 0.00021201372,
 -0.00014746189,
 -7.0512295e-05,
 -0.002319038,
 -0.00059229136,
 0.00034552813,
 -0.0003874302,
 -8.755922e-05,
 -7.1525574e-06,
 -0.00042527914,
 -0.00010704994,
 -0.0015781522,
 -0.0008929372,
 0.0005311966,
 -0.0011210442,
 -5.042553e-05,
 0.0003234148,
 -0.00018000603,
 -2.8073788e-05,
 -0.002355516,
 -0.00064218044,
 0.00024843216,
 -0.0003465414,
 -0.00011527538,
 0.00029093027,
 -3.5703182e-05,
 -4.7683716e-06,
 -0.0025588274,
 -0.

In [255]:
# Male-minus-female similarity gaps for career targets, private model.
# zip pairs the two score lists by position and stops at the shorter one.
career_diff_priv = [male - female for male, female in zip(male_career_priv, female_career_priv)]
career_diff_priv

[0.0001949668,
 5.8233738e-05,
 -3.3140182e-05,
 0.0001655817,
 -2.6404858e-05,
 -3.516674e-05,
 0.00017613173,
 -6.592274e-05,
 0.00025200844,
 6.2704086e-05,
 -0.00010848045,
 9.3340874e-05,
 3.582239e-05,
 -0.00013643503,
 0.00025129318,
 -4.7683716e-05,
 0.0002439022,
 -3.0100346e-05,
 -4.6491623e-05,
 9.614229e-05,
 0.000114917755,
 5.1677227e-05,
 3.7789345e-05,
 1.7285347e-05,
 0.0001783967,
 4.1604042e-05,
 -6.991625e-05,
 0.000182271,
 6.93202e-05,
 -7.086992e-05,
 0.0001347661,
 -7.861853e-05,
 0.00025838614,
 1.847744e-06,
 -7.671118e-05,
 0.00013720989,
 1.9788742e-05,
 -4.696846e-05,
 9.75728e-05,
 -5.5491924e-05,
 0.0002142787,
 1.5974045e-05,
 -0.000106573105,
 0.00018841028,
 7.098913e-05,
 -2.4974346e-05,
 0.00010532141,
 -2.8192997e-05,
 0.00020134449,
 -0.00024038553,
 1.0728836e-05,
 0.00023543835,
 8.5532665e-05,
 0.00033676624,
 0.00019097328,
 -0.00028657913]

In [265]:
# Male-minus-female similarity gaps for family targets, public model.
# zip pairs the two score lists by position and stops at the shorter one.
fam_diff_pub = [male - female for male, female in zip(male_fam_pub, female_fam_pub)]
fam_diff_pub

[-0.00013452768,
 0.00031095743,
 -7.045269e-05,
 1.0728836e-05,
 -0.0025051832,
 -0.0006942153,
 0.0003232956,
 -0.00026857853,
 -0.00017899275,
 0.0003182292,
 -0.00010794401,
 3.33786e-05,
 -0.002310276,
 -0.0006733537,
 0.00036221743,
 -0.000207901,
 -0.00014096498,
 0.00031346083,
 -9.23872e-05,
 2.5689602e-05,
 -0.0024131536,
 -0.00073575974,
 0.00036799908,
 -0.00027883053,
 -0.00013750792,
 0.00032258034,
 -0.00011277199,
 -2.527237e-05,
 -0.002439499,
 -0.00068181753,
 0.0003169179,
 -0.0003924966,
 8.404255e-06,
 0.00027680397,
 9.4890594e-05,
 0.00010728836,
 -0.002147019,
 -0.0011420846,
 0.00025093555,
 -0.0003273487,
 -0.0001810193,
 0.000236094,
 -0.00016760826,
 0.00015717745,
 -0.0018885136,
 -0.0006695986,
 0.0005329251,
 -0.0008326173,
 -0.000115811825,
 0.00033056736,
 -1.9073486e-05,
 -4.3988228e-05,
 -0.0020424128,
 -0.0004464388,
 0.00032109022,
 -0.00014203787,
 -8.648634e-05,
 0.00032293797,
 1.835823e-05,
 6.0260296e-05,
 -0.0026510358,
 -0.00043404102,
 0.000

In [263]:
# Male-minus-female similarity gaps for family targets, private model.
# zip pairs the two score lists by position and stops at the shorter one.
fam_diff_priv = [male - female for male, female in zip(male_fam_priv, female_fam_priv)]
fam_diff_priv

[0.0002644062,
 4.208088e-05,
 -2.2172928e-05,
 0.00010585785,
 -0.00012624264,
 5.4597855e-05,
 0.00020211935,
 -0.00010102987,
 0.00022995472,
 3.85046e-05,
 -1.9073486e-05,
 0.00015819073,
 2.3961067e-05,
 -4.3213367e-05,
 0.00019705296,
 -8.0645084e-05,
 0.00013434887,
 0.00013792515,
 4.041195e-05,
 0.00016659498,
 -0.000119149685,
 -0.0001424551,
 0.0002322793,
 -3.6895275e-05,
 0.00019574165,
 1.9073486e-05,
 1.6748905e-05,
 0.00010627508,
 2.1398067e-05,
 -5.90086e-06,
 0.0001424551,
 -6.66976e-05,
 0.00022530556,
 -3.373623e-05,
 -6.0915947e-05,
 0.00015044212,
 6.0737133e-05,
 5.429983e-05,
 0.00012785196,
 -8.1181526e-05,
 0.00021231174,
 0.00010448694,
 8.058548e-05,
 0.00020623207,
 -9.4771385e-06,
 -0.0002065897,
 4.285574e-05,
 -1.603365e-05,
 0.00030475855,
 -8.106232e-06,
 -7.355213e-05,
 0.00013178587,
 -1.001358e-05,
 -7.867813e-06,
 9.441376e-05,
 -0.00010848045,
 0.00025629997,
 -0.00010442734,
 -7.6293945e-05,
 0.00019568205,
 4.3034554e-05,
 1.5079975e-05,
 9.810

## t-test differences male-female in family private vs. public

In [288]:
# Do the male-minus-female gaps for family targets differ between the private
# and the public model? Independent two-sample t-test with scipy defaults.
t, p = ttest_ind(fam_diff_priv, fam_diff_pub)

# Prints: private mean gap (8 decimals), public mean gap, p-value (not halved), t statistic.
print('%.08f' % mean(fam_diff_priv), mean(fam_diff_pub), p, t)

0.00005193 -0.00035688933 0.0001289791186606279 3.950692428561948


## t-test differences male-female in career private vs. public

In [287]:
# Do the male-minus-female gaps for career targets differ between the private
# and the public model? Independent two-sample t-test with scipy defaults.
t, p = ttest_ind(career_diff_priv, career_diff_pub)

# Prints: private mean gap (8 decimals), public mean gap, p-value (8 decimals, not halved), t statistic.
print('%.08f' % mean(career_diff_priv), mean(career_diff_pub), '%.08f' %p, t)

0.00005381 -0.00041985326 0.00003376 4.3119469227922895


## t-test male career private vs. public

In [271]:
# Male career similarities: private vs. public model (independent two-sample t-test).
t, p = ttest_ind(male_career_priv, male_career_pub)

# Prints: private mean, public mean, p-value (8 decimals, not halved).
print(mean(male_career_priv), mean(male_career_pub), '%.08f' % p)

0.9990602 0.9987115 0.05747032


## t-test female career private vs. public

In [274]:
# Female career similarities: private vs. public model (independent two-sample t-test).
t, p = ttest_ind(female_career_priv, female_career_pub)

# Prints: private mean, public mean, p-value (13 decimals, not halved), whether it is below 0.05.
print(mean(female_career_priv), mean(female_career_pub), '%.013f' % p, p<0.05)

0.99900645 0.9991314 0.4135224025513 False


## t-test female family private vs. public

In [286]:
# Female family similarities: private vs. public model (independent two-sample t-test).
t, p = ttest_ind(female_fam_priv, female_fam_pub)

# Prints: private mean, public mean, p-value (11 decimals, not halved), t statistic.
print(mean(female_fam_priv), mean(female_fam_pub), '%.011f' % p, t)

0.99939835 0.9985871 0.00005873679 4.158709185885311


## t-test male family private vs. public

In [282]:
# Male family similarities: private vs. public model (independent two-sample t-test).
t, p = ttest_ind(male_fam_priv, male_fam_pub)

# Prints: private mean, public mean, p-value (8 decimals, not halved), t statistic.
print(mean(male_fam_priv), mean(male_fam_pub),'%.08f' % p, t)

0.99945027 0.9982302 0.00000007 5.716142911132387
