In [17]:
import pandas as pd
import MeCab
import json
from os import listdir
from os.path import isfile, join
import re

In [18]:
# Load the Japanese masculine / feminine word lists from JSON.
# The `with` blocks close each file handle as soon as it has been parsed, and
# the explicit encoding keeps the Japanese text from being decoded with a
# platform-dependent default.
with open('./data/gendered_words_lists/ja/masculine.json', 'r', encoding='utf-8') as japaneseMasculineWordsList:
    japaneseMasculineWords = json.load(japaneseMasculineWordsList)
with open('./data/gendered_words_lists/ja/feminine.json', 'r', encoding='utf-8') as japaneseFeminineWordsList:
    japaneseFeminineWords = json.load(japaneseFeminineWordsList)

In [19]:
# Count the number of gendered words in each job advertisement
dataDirectory = './data/job_advertisements/ja/cleaned/'
# get the list of file names (files whose name contains 'failed' are skipped);
# sorted because os.listdir returns entries in arbitrary order, which would
# otherwise make the occupation numbering differ from run to run
files = sorted(
    f for f in listdir(dataDirectory)
    if isfile(join(dataDirectory, f)) and ('failed' not in f)
)

# column layouts of the three result tables
advertisementColumns = ['index', 'occupation', 'masculineWordCount', 'feminineWordCount', 'wordCount']
occupationColumns = ['index', 'occupation', 'masculineWordCount', 'feminineWordCount', 'wordCount', 'number']
genderedWordColumns = ['index', 'word', 'occupation']

# initiate the tagger to parse and clean Japanese sentences
tagger = MeCab.Tagger('-d /opt/homebrew/lib/mecab/dic/mecab-ipadic-neologd')
# NOTE(review): parsing an empty string first looks like the usual
# mecab-python warm-up workaround - confirm before removing it
tagger.parse('')
re_hiragana = re.compile(r'^[あ-ん]+$')

# only nouns, adjectives and verbs are compared against the word lists
contentWordClasses = ('名詞', '形容詞', '動詞')
# sets give constant-time lookups instead of scanning the lists for every token
masculineWordSet = set(japaneseMasculineWords)
feminineWordSet = set(japaneseFeminineWords)


def countGenderedWords(text):
    """Tokenise one advertisement text and match its content words against the word lists.

    Parameters
    ----------
    text : str
        The text handed to ``tagger.parseToNode``.

    Returns
    -------
    tuple
        ``(tokenCount, masculineCount, feminineCount, matchedWords)`` where
        ``matchedWords`` is a list of ``(tokenPosition, word)`` pairs in the
        order the matches were found (a word present in both lists is
        reported twice, masculine first).
    """
    tokenCount = 0
    masculineCount = 0
    feminineCount = 0
    matchedWords = []

    node = tagger.parseToNode(text)
    while node:
        # NOTE(review): every node is counted, which presumably includes
        # MeCab's BOS/EOS sentinel nodes - confirm that wordCount is meant
        # to include them
        tokenCount += 1
        features = node.feature.split(',')
        node = node.next

        if features[0] not in contentWordClasses:
            continue
        # field 6 is presumably the base (dictionary) form in the IPADIC
        # feature layout; tokens with fewer fields have nothing to compare
        if len(features) <= 6:
            continue
        baseForm = features[6]
        # skip single-character entries and words written purely in hiragana
        if len(baseForm) == 1 or re_hiragana.fullmatch(baseForm):
            continue

        if baseForm in masculineWordSet:
            masculineCount += 1
            matchedWords.append((tokenCount, baseForm))
        if baseForm in feminineWordSet:
            feminineCount += 1
            matchedWords.append((tokenCount, baseForm))

    return tokenCount, masculineCount, feminineCount, matchedWords


def buildResultFrame(records, columns):
    """Build a result table from a list of row dicts in a single step.

    The row labels are the string form of each row's 'index' value, as in the
    frames this cell produced before; an empty ``records`` list still yields
    a frame with the requested columns.
    """
    frame = pd.DataFrame(records, columns=columns)
    frame.index = [str(label) for label in frame['index']]
    return frame


# rows are collected in plain lists and turned into DataFrames once at the
# end; concatenating a DataFrame per row is quadratic in the number of rows
advertisementRecords = []
occupationRecords = []
genderedWordRecords = []

for occupationIndex, file in enumerate(files, start=1):
    # open job advertisements file (closed again as soon as it is parsed)
    with open(join(dataDirectory, file), 'r', encoding='utf-8') as jsonOpen:
        jobAdvertisements = json.load(jsonOpen)
    occupation = file.replace('.json', '')

    masculineWordCounterPerOccupation = 0
    feminineWordCounterPerOccupation = 0
    wordCounterPerOccupation = 0
    advertisementCount = 0

    for advertisementIndex, jobAdvertisement in enumerate(jobAdvertisements, start=1):
        counter, masculineWordCounterPerAdvertisement, feminineWordCounterPerAdvertisement, matchedWords = \
            countGenderedWords(jobAdvertisement['all_sentences'])

        for tokenPosition, word in matchedWords:
            genderedWordRecords.append({
                'index': tokenPosition,
                'word': word,
                'occupation': occupation,
            })

        advertisementRecords.append({
            'index': advertisementIndex,
            'occupation': occupation,
            'masculineWordCount': masculineWordCounterPerAdvertisement,
            'feminineWordCount': feminineWordCounterPerAdvertisement,
            'wordCount': counter,
        })
        masculineWordCounterPerOccupation += masculineWordCounterPerAdvertisement
        feminineWordCounterPerOccupation += feminineWordCounterPerAdvertisement
        wordCounterPerOccupation += counter
        advertisementCount = advertisementIndex

    occupationRecords.append({
        'index': occupationIndex,
        'occupation': occupation,
        'masculineWordCount': masculineWordCounterPerOccupation,
        'feminineWordCount': feminineWordCounterPerOccupation,
        'wordCount': wordCounterPerOccupation,
        'number': advertisementCount,
    })

# declare the dataframe objects that hold the results
df = buildResultFrame(advertisementRecords, advertisementColumns)
dfByOccupation = buildResultFrame(occupationRecords, occupationColumns)
genderedWordDf = buildResultFrame(genderedWordRecords, genderedWordColumns)

df.to_csv('./result/japanese_result.csv', index=False)
dfByOccupation.to_csv('./result/japanese_result_per_occupation.csv', index=False)
genderedWordDf.to_csv('./result/japanese_gendered_words_in_advertisements.csv', index=False)