In [1]:
import pandas as pd
import numpy as np
import nltk
import regex
from nltk.stem import PorterStemmer
ps = PorterStemmer() 

## Define Utility classes and meta info

In [2]:
class MyTokenizer:
    """Sentence-aware word tokenizer that lowercases and drops letter-free tokens."""

    def tokenize(self, text):
        """Return the lowercased word tokens of ``text`` as one flat list.

        The text is split into sentences with ``nltk.sent_tokenize``; each
        sentence is tokenised with NLTK's ``TreebankWordTokenizer``. A token is
        kept only when ``regex.search`` finds a ``\\p{letter}`` character in it.
        """
        treebank = nltk.tokenize.TreebankWordTokenizer()
        has_letter = r"\p{letter}"
        lowered = []
        for sentence in nltk.sent_tokenize(text):
            lowered.extend(
                tok.lower()
                for tok in treebank.tokenize(sentence)
                if regex.search(has_letter, tok)
            )
        return lowered


# Module-level tokenizer instance.
mytokenizer = MyTokenizer()

In [3]:
class MyDictionaryMapper():
    """Match a collection of texts against several word-list dictionaries.

    Parameters
    ----------
    dictionaries : sequence of ``(name, terms)`` pairs
        ``terms`` is the list of dictionary entries for that name.
    raw_text : iterable
        One entry per document. Entries that are not strings (e.g. NaN for a
        missing comment) are treated as empty documents.
    diversity : optional
        Stored on the instance as-is; not used by any method of this class.

    All result lists are ordered ``[dictionary][document]`` and keep one entry
    per element of ``raw_text``, so they stay row-aligned with the input.
    """

    def __init__(self, dictionaries, raw_text, diversity=None):
        self.dictionaries = dictionaries
        self.dictionaries_stemmed = [self.get_stemmed_dictionary(e) for e in self.dictionaries]

        self.raw_text = raw_text
        # Non-string entries (NaN / None) become an empty token list instead of
        # being dropped: dropping them would shift every later row and break
        # the alignment with the source data.
        self.tokenized = [mytokenizer.tokenize(d) if isinstance(d, str) else []
                          for d in self.raw_text]
        self.tokenized_stemmed = [[ps.stem(i) for i in x] for x in self.tokenized]
        # Output column names, positionally matched to `dictionaries`.
        self.dict_names = [ 'Dict_Hostility_Ksiazek_2015',
                'Dict_Civility_Ksiazek_2015', 
                'Dict_GoogeProject_OffensiveWords', 
                'Dict_Incivility_Muddiman',
                'Dict_Swearwords_LIWC',
                'Dict_HatebaseVocabEN',
                'MFD1_conservative', 
                'MFD1_liberal',
                'MFD2_conservative', 
                'MFD2_liberal']
        self.diversity = diversity

    def get_stemmed_dictionary(self, e):
        '''Return ``(name, stemmed_terms)`` for a ``(name, terms)`` dictionary entry.'''
        return (e[0], [ps.stem(x) for x in e[1]])

    def return_token_counts(self, text, d):
        '''Sum the substring occurrences (``str.count``) of every term of ``d`` in ``text``.

        Returns 0 when ``text`` is not a string.
        '''
        if not isinstance(text, str):
            return 0
        return sum(text.count(f) for f in d)

    def count_tokens(self):

        ''' returns counts based on simple string method .count; probably least sophisticated approach. '''

        simple_counts = []
        for d in self.dictionaries:
            simple_counts.append([ self.return_token_counts(text, d[1]) for text in self.raw_text])
        return simple_counts

    def apply_dictionary(self, tokens, dictionary):
        '''Return the tokens whose lowercased form is contained in ``dictionary``.'''
        hits = [ w for w in tokens if w.lower() in dictionary ]
        return hits

    def _match_all(self, token_lists, dictionaries):
        '''Compute hits, hit counts and hit ratios for every dictionary/document pair.

        Each document is matched once per dictionary and the counts and ratios
        are derived from that single pass.
        '''
        matches, len_matches, ratio_matches = [], [], []
        for d in dictionaries:
            terms = d[1]
            if isinstance(terms, (list, tuple)):
                try:
                    # Hash-based lookup; membership results are the same as
                    # for the list, only faster.
                    terms = frozenset(terms)
                except TypeError:
                    # Unhashable entries: fall back to the original container.
                    pass
            hits = [self.apply_dictionary(tokens, terms) for tokens in token_lists]
            matches.append(hits)
            len_matches.append([len(h) for h in hits])
            ratio_matches.append([self.zero_div(len(h), len(tokens))
                                  for h, tokens in zip(hits, token_lists)])
        return matches, len_matches, ratio_matches

    def get_matches(self):
        '''Match the unstemmed tokens against the unstemmed dictionaries.

        Returns ``(matches, len_matches, ratio_matches)``, each indexed
        ``[dictionary][document]``: the matching tokens, their number, and
        their number divided by the document's token count (0 for an empty
        document).
        '''
        return self._match_all(self.tokenized, self.dictionaries)
    
    def zero_div(self,x,y):
        '''Return ``x / y``, or 0 when ``y`` is 0.'''
        if y == 0:
            return 0
        else:
            return x/y
        
    def get_matches_stemmed(self):
        '''Same as ``get_matches`` but on stemmed tokens and stemmed dictionaries.'''
        return self._match_all(self.tokenized_stemmed, self.dictionaries_stemmed)

    def presence_concept(self, x):
    
        '''consider concept presence if keywords occur more than N times. WE MAY HAVE TO CHECK WHICH N WORKS BEST!'''
        
        N = 0

        if x > N:
            return 1 
        else:
            return 0

    def dummy_code_results(self, type="notstemmed"):
        '''Return 0/1 presence codes, indexed ``[dictionary][document]``.

        ``type`` must be ``"stemmed"`` or ``"notstemmed"``; any other value
        raises ``ValueError``.
        '''

        if type=="stemmed":
            _, len_matches, _ = self.get_matches_stemmed()

        elif type =="notstemmed":
            _, len_matches, _ = self.get_matches()

        else:
            raise ValueError(
                f'type must be "stemmed" or "notstemmed", got {type!r}')

        dummy_presence = [[self.presence_concept(n) for n in counts] for counts in len_matches ]

        return dummy_presence



In [4]:
# Directory that holds every word-list dictionary file.
path_to_dicts = 'data/dictionaries/'

# Plain-text dictionaries (one term per line), followed by the Hatebase CSV,
# which is parsed separately below.
incivil_dicts = [ 'Dict_Hostility_Ksiazek_2015.txt',
                'Dict_Civility_Ksiazek_2015.txt', 
                'Dict_GoogeProject_OffensiveWords.txt', 
                'Dict_Incivility_Muddiman.txt',
                'Dict_Swearwords_LIWC.txt',
                '2020-04-02_HatebaseVocabEN.csv'] 

# List of (name, terms) pairs; the name is the file name up to its first '.'.
dict_set = []

for d in incivil_dicts[:-1]:
    with open(f'{path_to_dicts}{d}', mode = 'r') as fi:
        # Skip blank lines so no dictionary ends up with an empty-string term
        # (an empty term would inflate substring counts via str.count).
        data = [line.strip() for line in fi if line.strip()]
    fname = d.split('.')[0]
    dict_set.append((fname,data))

# The Hatebase vocabulary is a ';'-separated CSV; only its 'term' column is used.
# The path is built from the same directory / file-name variables as above.
temp = pd.read_csv(f'{path_to_dicts}{incivil_dicts[-1]}',sep=';')
HB = temp['term'].dropna().tolist()
dict_set.append((incivil_dicts[-1].split('.')[0],HB))

In [5]:
def read_MFD(fn = 'Dict_MFD_02.txt'):
    """Read a tab-separated dictionary file and split its terms by category code.

    Each line is ``term<TAB>code[<TAB>code...]``. Only the first three code
    columns are inspected and empty columns are skipped. For every code
    ``<= 4`` the term is appended to the liberal list; for every code ``> 4``
    it is appended to the conservative list. A term with several codes is
    therefore appended once per code and may appear in both lists.

    Parameters
    ----------
    fn : str
        File name inside ``data/dictionaries/``.

    Returns
    -------
    tuple of (list, list)
        ``(conservative_terms, liberal_terms)``.
    """
    with open('data/dictionaries/'+fn,'r') as handle:
        content = handle.read()

    conservative_terms, liberal_terms = [], []
    for line in content.split('\n'):
        fields = line.split('\t')
        term = fields[0]
        # fields[1:4] = at most the first three code columns.
        for code in fields[1:4]:
            if not code:
                continue
            bucket = liberal_terms if int(code) <= 4 else conservative_terms
            bucket.append(term)
    return conservative_terms, liberal_terms

In [6]:
# Load both MFD files; read_MFD returns (conservative_terms, liberal_terms).
MFD1_conservative, MFD1_liberal = read_MFD(fn='Dict_MFD_01.txt')
MFD2_conservative, MFD2_liberal = read_MFD(fn='Dict_MFD_02.txt')

In [7]:
# Write the MFD2 liberal terms to disk, one per line. The context manager
# closes the file even if a write raises.
with open('data/dictionaries/MFD2_liberal.txt','w') as file:
    for item in MFD2_liberal:
        file.write(item+"\n")

In [8]:
# The MFD term lists were already loaded by the read_MFD calls in cell In [6],
# so they are not read from disk again here.
mfd_dicts = [('MFD1_conservative', MFD1_conservative),('MFD1_liberal', MFD1_liberal),
             ('MFD2_conservative', MFD2_conservative),('MFD2_liberal', MFD2_liberal)]

# Only add entries that are not in dict_set yet, so re-running this cell does
# not append the MFD dictionaries a second time.
already_present = {name for name, _terms in dict_set}
dict_set.extend(entry for entry in mfd_dicts if entry[0] not in already_present)

## Initialize MyDictionaryMapper and run the dictionaries

In [9]:
def get_results(file_name):
    """Run all dictionaries in ``dict_set`` over the comments of a CSV file.

    Parameters
    ----------
    file_name : str
        CSV file with at least the columns ``ID`` and ``commentText``.

    Returns
    -------
    pandas.DataFrame
        One row per input row: a 0/1 presence column for each of the ten
        dictionaries (stemmed matching), the four ``MFD*_ratio`` columns
        (matched tokens / total tokens, stemmed), then ``ID`` and
        ``commentText`` as read from the file.

    The first five rows of the presence and ratio tables are shown with
    ``display`` as a side effect.
    """
    select = pd.read_csv(file_name)[['ID','commentText']]
    # Missing comments are analysed as empty text so the results keep exactly
    # one row per input row; `select` itself is left untouched.
    text_clean = select['commentText'].fillna('')

    mydictionarymapper = MyDictionaryMapper(dict_set, text_clean)

    # One stemmed matching pass yields both the hit counts (turned into 0/1
    # presence codes below) and the hit ratios.
    _, len_matches, ratio_matches = mydictionarymapper.get_matches_stemmed()
    get_dummy_presence_stemmed = [[mydictionarymapper.presence_concept(n) for n in counts]
                                  for counts in len_matches]

    stemmed_dummy_variables = [ 'Dict_Hostility_Ksiazek_2015', 'Dict_Civility_Ksiazek_2015', 'Dict_GoogeProject_OffensiveWords', 
                'Dict_Incivility_Muddiman', 'Dict_Swearwords_LIWC', 'Dict_HatebaseVocabEN','MFD1_conservative', 'MFD1_liberal', 'MFD2_conservative', 'MFD2_liberal']
    stemmed_ratio_variables = ['MFD1_conservative_ratio', 'MFD1_liberal_ratio', 'MFD2_conservative_ratio', 'MFD2_liberal_ratio']

    # Lists are [dictionary][document]; transpose to one row per document.
    dummy_stemmed = pd.DataFrame(get_dummy_presence_stemmed).T
    dummy_stemmed.columns = mydictionarymapper.dict_names
    dummy_stemmed = dummy_stemmed[stemmed_dummy_variables]
    assert len(dummy_stemmed) == len(select) ## make sure these are of equal length

    display(dummy_stemmed.head(5))

    ratio_stemmed = pd.DataFrame(ratio_matches).T
    ratio_stemmed.columns = [name+'_ratio' for name in mydictionarymapper.dict_names]
    ratio_stemmed = ratio_stemmed[stemmed_ratio_variables]
    assert len(ratio_stemmed) == len(select) ## make sure these are of equal length

    display(ratio_stemmed.head(5))

    # Combine presence codes and ratios, then attach ID / commentText.
    dict_merge = pd.concat([dummy_stemmed, ratio_stemmed], axis = 1)
    df = pd.concat([dict_merge, select], axis = 1)
    return df

In [10]:
# Run every dictionary over the full data set and export the merged table.
results = get_results(file_name='data/full_data.csv')
results.to_csv(
    'outputs/automated_results/Incivility&Diversity.csv',
    index=False,
)

Unnamed: 0,Dict_Hostility_Ksiazek_2015,Dict_Civility_Ksiazek_2015,Dict_GoogeProject_OffensiveWords,Dict_Incivility_Muddiman,Dict_Swearwords_LIWC,Dict_HatebaseVocabEN,MFD1_conservative,MFD1_liberal,MFD2_conservative,MFD2_liberal
0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,1,1,1
2,0,1,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,1,0,1,0
4,0,1,0,0,0,0,0,0,1,1


Unnamed: 0,MFD1_conservative_ratio,MFD1_liberal_ratio,MFD2_conservative_ratio,MFD2_liberal_ratio
0,0.0,0.0,0.0,0.0
1,0.0,0.060606,0.030303,0.060606
2,0.0,0.0,0.0,0.0
3,0.027778,0.0,0.027778,0.0
4,0.0,0.0,0.142857,0.142857
