In [122]:
import pandas as pd
import MeCab
import json
from os import listdir
from os.path import isfile, join
import neologdn
import unicodedata
import re

In [146]:
# Load the masculine- and feminine-coded word lists.
# The files are opened in `with` blocks so the handles are closed as soon as
# the JSON has been parsed, and the encoding is pinned to UTF-8 so the result
# does not depend on the platform's default locale encoding.
with open('./data/gendered_words_lists/ja/masculine.json', 'r', encoding='utf-8') as japaneseMasculineWordsList:
    japaneseMasculineWords = json.load(japaneseMasculineWordsList)
with open('./data/gendered_words_lists/ja/feminine.json', 'r', encoding='utf-8') as japaneseFeminineWordsList:
    japaneseFeminineWords = json.load(japaneseFeminineWordsList)

In [147]:
# Count the number of gendered words in each job advertisement
dataDirectory = './data/job_advertisements/ja/'
# Get the list of advertisement files, skipping anything whose name contains
# 'failed'. The list is sorted because os.listdir() returns entries in
# arbitrary order, which would otherwise make the row order of the output
# (and the running dataIndex values) differ between runs and machines.
files = sorted(
    f for f in listdir(dataDirectory)
    if isfile(join(dataDirectory, f)) and ('failed' not in f)
)

# Declare the dataframe objects used to save the results.
# NOTE: 'masclineWordCount' is misspelled, but it is the column name written to
# the result CSVs, so it is left unchanged here.
df = pd.DataFrame(index=[], columns=['dataIndex', 'occupation', 'masclineWordCount', 'feminineWordCount', 'wordCount'])
dfByOccupation = pd.DataFrame(index=[], columns=['occupation', 'masclineWordCount', 'feminineWordCount', 'wordCount'])
genderedWordDf = pd.DataFrame(index=[], columns=['word', 'occupation'])

# Initiate the tagger used to parse and clean Japanese sentences.
# NOTE(review): the dictionary path is specific to a Homebrew install; adjust
# it when running on another machine.
tagger = MeCab.Tagger('-d /opt/homebrew/lib/mecab/dic/mecab-ipadic-neologd')
# Presumably the usual mecab-python3 warm-up call made before parseToNode()
# so that node data stays valid -- TODO confirm it is still required.
tagger.parse('')
# Matches strings made up exclusively of hiragana in the range あ-ん.
re_hiragana = re.compile(r'^[あ-ん]+$')

# Running counter that starts at 1 and is incremented once per gendered-word
# match (see the token loop below). Its current value is stored in the
# 'dataIndex' column of each advertisement row and used as the row label of
# each per-occupation row.
# NOTE(review): the name suggests a per-advertisement index, but the original
# behaviour (increment per match) is preserved here -- confirm the intent.
dataIndex = 1

# Sets give constant-time membership tests instead of scanning both word lists
# for every token. Non-string entries can never equal a lemma, so they are
# dropped up front (this also keeps unhashable JSON values out of the set).
masculineWordSet = {w for w in japaneseMasculineWords if isinstance(w, str)}
feminineWordSet = {w for w in japaneseFeminineWords if isinstance(w, str)}

# Part-of-speech tags (first MeCab feature field) that are considered:
# nouns, adjectives and verbs.
targetPartsOfSpeech = ('名詞', '形容詞', '動詞')
# Advertisement fields whose text is analysed, in concatenation order.
textFields = ('title', 'description_summary', 'description_detail', 'desired_person')


def _clean_text(text):
    """Return *text* normalised for tokenisation.

    Spaces and newlines are removed, then the text is NFKC-normalised,
    passed through neologdn.normalize and lower-cased.

    The original code called ``str.replace(' |\\n', '')``, which looks for the
    literal three-character string "space, pipe, newline" and therefore
    removed nothing; ``re.sub`` applies the pattern as the alternation it was
    evidently meant to be.
    """
    withoutBreaks = re.sub(r' |\n', '', text)
    return neologdn.normalize(unicodedata.normalize("NFKC", withoutBreaks)).lower()


# Rows and row labels are accumulated in plain lists and turned into
# DataFrames once at the end; calling pd.concat inside the loops copies the
# whole frame on every append.
advertisementRows, advertisementLabels = [], []
occupationRows, occupationLabels = [], []
genderedWordRows, genderedWordLabels = [], []

for file in files:
    # Open the job advertisements file (closed again as soon as it is parsed).
    with open(join(dataDirectory, file), 'r', encoding='utf-8') as jsonOpen:
        jobAdvertisements = json.load(jsonOpen)
    occupation = file.replace('.json', '')

    masculineWordCounterPerOccupation = 0
    feminineWordCounterPerOccupation = 0
    wordCounterPerOccupation = 0

    # Each element is expected to hold its advertisement under a key equal to
    # its 1-based position; elements without a truthy value there are skipped.
    for getIndex, jobAdvertisement in enumerate(jobAdvertisements, start=1):
        advertisement = jobAdvertisement.get(str(getIndex))
        if not advertisement:
            continue

        # Clean each field and join them with single spaces.
        all_sentences = ' '.join(_clean_text(advertisement[field]) for field in textFields)

        masculineWordCounterPerAdvertisement = 0
        feminineWordCounterPerAdvertisement = 0
        # NOTE(review): counts every node returned by parseToNode(), which may
        # include sentinel begin/end nodes -- confirm that is intended.
        counter = 0

        node = tagger.parseToNode(all_sentences)
        while node:
            counter += 1
            features = node.feature.split(",")
            # Select specific word types; the length guard protects the
            # lookup of the seventh feature field (index 6).
            if features[0] in targetPartsOfSpeech and len(features) > 6:
                lemma = features[6]
                # Ignore single-character entries and hiragana-only entries.
                if len(lemma) != 1 and not re_hiragana.fullmatch(lemma):
                    # The two checks are independent: a lemma present in both
                    # lists is counted (and recorded) once for each.
                    if lemma in masculineWordSet:
                        dataIndex += 1
                        masculineWordCounterPerAdvertisement += 1
                        genderedWordRows.append({'word': lemma, 'occupation': occupation})
                        genderedWordLabels.append(str(counter))
                    if lemma in feminineWordSet:
                        dataIndex += 1
                        feminineWordCounterPerAdvertisement += 1
                        genderedWordRows.append({'word': lemma, 'occupation': occupation})
                        genderedWordLabels.append(str(counter))
            node = node.next

        # A per-advertisement gender-bias score (a sigmoid of the normalised
        # difference between masculine and feminine counts) was sketched here
        # in commented-out code; it is not computed.

        advertisementRows.append({
            'dataIndex': dataIndex,
            'occupation': occupation,
            'masclineWordCount': masculineWordCounterPerAdvertisement,
            'feminineWordCount': feminineWordCounterPerAdvertisement,
            'wordCount': counter,
        })
        advertisementLabels.append(str(getIndex))

        masculineWordCounterPerOccupation += masculineWordCounterPerAdvertisement
        feminineWordCounterPerOccupation += feminineWordCounterPerAdvertisement
        wordCounterPerOccupation += counter

    occupationRows.append({
        'occupation': occupation,
        'masclineWordCount': masculineWordCounterPerOccupation,
        'feminineWordCount': feminineWordCounterPerOccupation,
        'wordCount': wordCounterPerOccupation,
    })
    occupationLabels.append(str(dataIndex))

# Build each result frame once, with the same columns and row labels the
# incremental version produced.
df = pd.DataFrame(
    advertisementRows,
    index=advertisementLabels,
    columns=['dataIndex', 'occupation', 'masclineWordCount', 'feminineWordCount', 'wordCount'],
)
dfByOccupation = pd.DataFrame(
    occupationRows,
    index=occupationLabels,
    columns=['occupation', 'masclineWordCount', 'feminineWordCount', 'wordCount'],
)
genderedWordDf = pd.DataFrame(
    genderedWordRows,
    index=genderedWordLabels,
    columns=['word', 'occupation'],
)

df.to_csv('./result/japanese_result.csv')
dfByOccupation.to_csv('./result/japanese_result_per_occupation.csv')
genderedWordDf.to_csv('./result/japanese_gendered_words_in_advertisements.csv')
