In [40]:
import os
import pandas as pd
import string
from string import punctuation
import zhon.hanzi
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Load texts

In [547]:
luxun_directory = '../Final Project/Call to Arms'
luxun_files = os.listdir(luxun_directory)

luxun_corpus = []

for filename in luxun_files:
    full_path = os.path.join(luxun_directory,filename)
    with open(full_path,'r') as file_in:
        text_string = file_in.read()
    luxun_corpus.append(text_string)

In [10]:
luyin_directory = '../Final Project/Luyin Corpus'
luyin_files = os.listdir(luyin_directory)


luyin_corpus = []

for filename in luyin_files:
    full_path = os.path.join(luyin_directory,filename)
    with open(full_path,'r') as file_in:
        text_string = file_in.read()
    luyin_corpus.append(text_string)

## Preprocess and split into sentences

In [4]:
punctuation_list = punctuation + zhon.hanzi.punctuation
punctuation_list

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·．！？｡。'

In [5]:
luxun_sentence_list = []
for document in luxun_corpus:
    document = document.lower().replace("\n","")
    sentences = document.split("。")
    luxun_sentence_list = luxun_sentence_list + sentences

In [11]:
luyin_sentence_list = []
for document in luyin_corpus:
    document = document.lower().replace("\n","")
    sentences = document.split("。")
    luyin_sentence_list = luyin_sentence_list + sentences

## Inspect sentence lists

In [21]:
luxun_sentence_list[:5]

['俄國的盲詩人愛羅先珂君帶了他那六弦琴到北京之後不久，便向我訴苦說：「寂寞呀，寂寞呀，在沙漠上似的寂寞呀！」這應該是真實的，但在我卻未曾感得；我住得久了，「入芝蘭之室，久而不聞其香」，只以為很是嚷嚷罷了',
 '然而我之所謂嚷嚷，或者也就是他之所謂寂寞罷',
 '我可是覺得在北京彷彿沒有春和秋',
 '老於北京的人說，地氣北轉了，這裡在先是沒有這麼和暖',
 '只是我總以為沒有春和秋；冬末和夏初銜接起來，夏才去，冬又開始了']

In [22]:
luyin_sentence_list[:5]

['震動全上海市的炮聲，在天色黎明的時候又從新開始了',
 '一種恐怖和不知所措的情緒，正通過每一個人的心，尤其是那一雙抛家失業飄泊在上海的女兒，她們簡直連一分鍾都不能勉強鎮靜了',
 '她們睜開惺松的而帶惶惑的眼睛，向她們所借住的朋友的客堂間，默察了以後，那個身材瘦弱名叫畏如的轉過面孔長歎了一聲，兩顆亮晶晶的眼淚滴在枕上了',
 '她的同伴星若是一個肌肉豐潤的女郎，這正是兩個相反的人型而她們發生了愛情，已經共同生活了五六年',
 '這時星若溫柔的撫弄著畏如垂在枕下的絲發故意的歡笑道：“你這個傻瓜，又在發什麽神經病！”畏如哽咽著道：“不是喲！哦，我那裏發什麽神經病，我真的是感著痛心！……”“有什麽可痛心的，日本人的大炮使你痛心嗎？那也不只你一個人呵！”“你不要故意的氣我了，聽我告訴你，世界上的人都壞透了，尤其是那些男人，從前那樣熱烈的追逐著，懇求著，而到現在緊急的時候便想求他們幫幫忙就沒有一個人肯理睬了，你想怎麽不叫人傷心！”“從前是從前，現在是現在，……”星若說']

## Write a tokenizer function

In [30]:
def tokenize(text):
    for char in punctuation_list:
        text = text.replace(char, "")
    tokens = [char for char in text]
    return tokens

## Tokenize sentences

In [24]:
luxun_cv = CountVectorizer(tokenizer=tokenize)
luyin_cv = CountVectorizer(tokenizer=tokenize)

## Create a Sentence-Term Matrix

In [27]:
luxun_stm = luxun_cv.fit_transform(luxun_sentence_list).toarray()
luyin_stm = luyin_cv.fit_transform(luyin_sentence_list).toarray()



## Save in a human-readable format

In [31]:
luxun_df = pd.DataFrame(luxun_stm,
                            columns = luxun_cv.get_feature_names_out())
luyin_df = pd.DataFrame(luyin_stm,
                            columns = luyin_cv.get_feature_names_out())

## Inspect

In [32]:
luxun_df

Unnamed: 0,Unnamed: 1,d,e,i,k,n,o,q,s,u,...,鼕,鼻,鼾,齊,齒,齡,齣,龍,龔,龜
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1767,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1768,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1769,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1770,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
luyin_df

Unnamed: 0,一,七,丈,三,上,下,不,且,世,丟,...,黨,鼓,鼻,齊,齋,齡,齦,龍,龐,１
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,0,0,1,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,1,1,4,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,5,0,0,0,2,0,10,2,0,0,...,1,0,0,0,0,0,0,0,0,0
685,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
686,1,0,0,0,2,1,8,0,2,0,...,0,0,0,0,0,0,0,0,0,0
687,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Create a Term-Sentence Matrix

In [243]:
luxun_tdm = luxun_df.T
luxun_tdm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1762,1763,1764,1765,1766,1767,1768,1769,1770,1771
,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
i,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
k,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
齡,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
齣,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
龍,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
龔,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [308]:
luyin_tdm = luyin_df.T
luyin_tdm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,679,680,681,682,683,684,685,686,687,688
一,0,4,1,1,2,0,0,1,0,0,...,0,0,0,0,3,5,0,1,0,0
七,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
丈,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
三,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
上,1,1,1,0,1,0,0,0,0,0,...,2,0,0,0,4,2,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
齡,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
齦,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
龍,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
龐,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Create parameters for query

In [552]:
query_dictionary = {'女': 'woman 女',
                    '她': 'she 她',
                    '男': 'man 男',
                    '他': 'he 他',
                   }

subset_word_dictionary = {'國':'country 國',
                          '家':'family 家',
                          '新':'new 新',
                          '舊':'old 舊',
                          '母':'mother 母',
                          '父':'father 父',
                          '孩':'child 孩',
                          '嫁':'marriage (women) 嫁',
                          '婚':'marriage (not gendered) 婚',
                          '獨':'independence 獨',
                          '進':'progress 進',
                          '愛':'love 愛',
                          '情':'emotion 情',
                          '社':'society 社',
                          '女': 'woman 女',
                          '男': 'man 男',
                          '她': 'she 她',
                          '他': 'he 他',
                        }

query_list = list(query_dictionary.keys())
subset_word_list = list(subset_word_dictionary.keys())

## Write functions to query linguistic bias

In [553]:
def luxun_bias(query_list, subset_word_list):
    
    # creating a clause to remove words not in the matrix
    for item in subset_word_list:
        if item not in luxun_tdm.index:
            subset_word_list.remove(item)
    for item in query_list:
        if item not in luxun_tdm.index:
            query_list.remove(item)
    # creating an empty dataframe
    df = pd.DataFrame(0,index=subset_word_list, columns=query_list)
    
    # writing a for loop for each query term
    for item in query_list:
        query_string = item
        
        # calculating cosine similarity
        query_vector = luxun_tdm.loc[query_string].values.reshape(1, -1)
        subset_vectors = luxun_tdm.loc[subset_word_list].values
        query_sim_array = cosine_similarity(query_vector, subset_vectors).flatten()
        
        # updating dataframe
        df[item] = query_sim_array
     
    # sorting the dataframe by subset word frequency for '女' (woman)
    df = df.sort_values(by='女', ascending=False)
    
    # renaming columns for English readers
    new_df = df.rename(query_dictionary, axis='columns').rename(subset_word_dictionary, axis='index')
   
    return new_df   

def luyin_bias(query_list, subset_word_list):
    for item in subset_word_list:
        if item not in luxun_tdm.index:
            subset_word_list.remove(item)
    df = pd.DataFrame(0,index=subset_word_list, columns=query_list)
    for item in query_list:
        query_string = item
        query_vector = luyin_tdm.loc[query_string].values.reshape(1, -1)
        subset_vectors = luyin_tdm.loc[subset_word_list].values
        query_sim_array = cosine_similarity(query_vector, subset_vectors).flatten()
        df[item] = query_sim_array
    df = df.sort_values(by='女', ascending=False)
    new_df = df.rename(query_dictionary, axis='columns').rename(subset_word_dictionary, axis='index')
    return new_df

In [554]:
query_list = list(query_dictionary.keys())
subset_word_list = list(subset_word_dictionary.keys())  

luxun_bias(query_list, subset_word_list)

Unnamed: 0,woman 女,man 男,he 他
woman 女,1.0,0.293151,0.109705
man 男,0.293151,1.0,0.057174
marriage (women) 嫁,0.226134,0.0,0.0
he 他,0.109705,0.057174,1.0
family 家,0.101748,0.029827,0.189635
country 國,0.099716,0.033408,0.070289
emotion 情,0.098693,0.077152,0.112923
old 舊,0.094222,0.0,0.076813
child 孩,0.079333,0.031009,0.087935
progress 進,0.048656,0.022822,0.146138


In [555]:
query_list = list(query_dictionary.keys())
subset_word_list = list(subset_word_dictionary.keys())  

luyin_bias(query_list, subset_word_list)

Unnamed: 0,woman 女,she 她,man 男,he 他
woman 女,1.0,0.24258,0.416403,0.320192
man 男,0.416403,0.187771,1.0,0.322271
he 他,0.320192,0.218635,0.322271,1.0
love 愛,0.254417,0.273344,0.341092,0.326891
family 家,0.182829,0.317749,0.215538,0.174472
child 孩,0.164226,0.162137,0.02841,0.101284
society 社,0.125709,0.093447,0.243561,0.192961
country 國,0.113455,0.079067,0.0,0.0
progress 進,0.110961,0.144992,0.0,0.091245
father 父,0.110224,0.179236,0.050847,0.16785


## Write a function to explore terms associated with subjectivity

In [558]:
def subjectivity_calculator(subset_word_list):
    for item in subset_word_list:
        if item not in luxun_tdm.index:
            subset_word_list.remove(item)
    df = pd.DataFrame(0,index=subset_word_list, columns=['Lu Yin 廬隱','Lu Xun 魯迅'])
    query_string = '我'
    luyin_query_vector = luyin_tdm.loc[query_string].values.reshape(1, -1)
    luyin_subset_vectors = luyin_tdm.loc[subset_word_list].values
    luyin_query_sim_array = cosine_similarity(luyin_query_vector, luyin_subset_vectors).flatten()
    df['Lu Yin 廬隱'] = luyin_query_sim_array
    
    luxun_query_vector = luxun_tdm.loc[query_string].values.reshape(1, -1)
    luxun_subset_vectors = luxun_tdm.loc[subset_word_list].values
    luxun_query_sim_array = cosine_similarity(luxun_query_vector, luxun_subset_vectors).flatten()
    df['Lu Xun 魯迅'] = luxun_query_sim_array
        
    df = df.sort_values(by='Lu Yin 廬隱', ascending=False)
    new_df = df.rename(subset_word_dictionary, axis='index')
    return new_df

In [559]:
subjectivity_calculator(subset_word_list)

Unnamed: 0,Lu Yin 廬隱,Lu Xun 魯迅
he 他,0.443909,0.262123
love 愛,0.342744,0.041774
emotion 情,0.292135,0.072516
man 男,0.29007,0.019582
family 家,0.2723,0.169769
woman 女,0.232218,0.058447
progress 進,0.231612,0.114403
society 社,0.222768,0.019736
child 孩,0.188885,0.064768
mother 母,0.175377,0.165313
