# Distributed Dictionary Representation (DDR)

Reference: Garten, J., Hoover, J., Johnson, K. M., Boghrati, R., Iskiwitch, C., & Dehghani, M. (2018). Dictionaries and distributions: Combining expert knowledge and large scale textual data content analysis: Distributed dictionary representation. *Behavior Research Methods, 50*, 344–361.

In [1]:
import os
import sys

import pandas as pd

# The DDR sources are not an installed package: they are looked up under
# ./DDR/DDR-master relative to the current working directory, so that
# directory is pushed to the front of the module search path first.
path = os.getcwd()
ddr_source_dir = f"{path}/DDR/DDR-master"
sys.path.insert(0, ddr_source_dir)

import ddr  # resolved through the sys.path entry added just above

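Download `GoogleNews-vectors-negative300.bin` from <https://github.com/mmihaltz/word2vec-GoogleNews-vectors/tree/master> and move it into the `DDR/` folder before running the next cell; the loader expects the file at `DDR/GoogleNews-vectors-negative300.bin`. If the download is a compressed archive (e.g. `.bin.gz`), decompress it first.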

In [2]:
# Location of the pre-trained word2vec binary (see the download note above).
word2vec_path = 'DDR/GoogleNews-vectors-negative300.bin'

# load_model returns three values; the cells below pass them on to the DDR
# helpers as `model`, `num_features` and `model_word_set`.
model, num_features, index2word_set = ddr.load_model(model_path=word2vec_path)

2023-08-16 04:07:15,855 : INFO : loading projection weights from DDR/GoogleNews-vectors-negative300.bin
2023-08-16 04:07:35,780 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from DDR/GoogleNews-vectors-negative300.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-08-16T04:07:35.780461', 'gensim': '4.1.2', 'python': '3.9.7 (default, Sep 16 2021, 08:50:36) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'load_word2vec_format'}


Finished loading model


## Run DDR model

In [3]:
# Read the term lists stored under DDR/dictionaries, turn them into
# dictionary vectors with the loaded embedding model, and write the result to
# CSV so the later cells can read it back from disk.
dictionary_dir = 'DDR/dictionaries'
dic_vecs_csv = 'outputs/automated_results/Incivility&Diversity_ddr.csv'

dic_terms = ddr.load_terms.terms_from_txt(input_path=dictionary_dir)
agg_dic_vecs = ddr.get_vecs.dic_vecs(
    dic_terms=dic_terms,
    model=model,
    num_features=num_features,
    model_word_set=index2word_set,
)
ddr.load_terms.terms_to_csv(agg_dic_vecs, output_path=dic_vecs_csv, delimiter=',')

In [4]:
# Vectorise the comments in data/test.csv: the text is taken from the
# `commentText` column, rows are identified by `ID`, and the resulting
# document vectors are written to ddr_result.csv.
doc_vec_options = dict(
    input_path='data/test.csv',
    output_path='outputs/automated_results/ddr_result.csv',
    model=model,
    num_features=num_features,
    model_word_set=index2word_set,
    text_col='commentText',
    delimiter=',',
    header=True,
    id_col='ID',
)
agg_doc_vecs = ddr.get_vecs.doc_vecs_from_csv(**doc_vec_options)


['StartDate', 'RecordedDate', 'IPAddress', 'Finished', 'Coder', 'ID', 'Mark_ID', 'Genre', 'topiccode', 'Platform', 'Anonymity', 'Anonymity_9_TEXT', 'codable', 'Interaction', 'Acknowledgement', 'TopicRelevance', 'Reasoning', 'BackgroundInfo', 'ExternalEvidence', 'ExternalEvidence_1_TEXT', 'Opinion', 'disagreement', 'Ideologicaldirection', 'Name_calling', 'Vulgarity', 'Attack_reputation', 'Question_Intelligenc', 'All_caps_function', 'Sarcasm_to_criticize', 'Individual_right', 'discrimination', 'Invoke_violence', 'Tone', 'INTERACTIVITY_DUMMY', 'RATIONALITY_DUMMY', 'HAS_OPINION_DUMMY', 'LIBERAL_NEUTRAL_CONSERVATIVE', 'LIBERAL_DUMMY', 'CONSERVATIVE_DUMMY', 'NAMECALLING_DUMMY', 'VULGAR_DUMMY', 'NAMECALLING_VULGAR_DUMMY', 'INCIVILITY_ORDINAL', 'INCIVILITY_DUMMY', 'INTOLERANCE_DUMMY', 'filter_$', 'IMPOLITENESS_DUMMY', 'commentText', 'showName', 'genre', 'Time_comment', 'likeCount_comment', 'entities', 'user_info_Twitter', 'user_id', 'place', 'retweet_count', 'platform', 'retweeted', 'language'

  avg_feature_vec = feature_vec / nwords


In [5]:
# Sanity check: pairwise correlations between the columns of the saved
# dictionary-vector file (one column per dictionary in the output below).
dic_vecs_saved = pd.read_csv('outputs/automated_results/Incivility&Diversity_ddr.csv')
dic_vecs_saved.corr()

Unnamed: 0,Dict_Hostility_Ksiazek_2015.txt,MFD2_conservative.txt,Dict_Civility_Ksiazek_2015.txt,Dict_Incivility_Muddiman.txt,Dict_GoogeProject_OffensiveWords.txt,Dict_Swearwords_LIWC.txt,MFD1_conservative.txt,MFD1_liberal.txt,2020-04-02_HatebaseVocabEN.csv,MFD2_liberal.txt
Dict_Hostility_Ksiazek_2015.txt,1.0,0.779299,0.663929,0.783592,0.86586,0.899173,0.736892,0.666667,0.608243,0.760052
MFD2_conservative.txt,0.779299,1.0,0.66049,0.714375,0.646382,0.598701,0.932003,0.750189,0.638501,0.829174
Dict_Civility_Ksiazek_2015.txt,0.663929,0.66049,1.0,0.541031,0.560979,0.572167,0.690406,0.665192,0.667632,0.681108
Dict_Incivility_Muddiman.txt,0.783592,0.714375,0.541031,1.0,0.539499,0.57779,0.757989,0.747515,0.525744,0.76629
Dict_GoogeProject_OffensiveWords.txt,0.86586,0.646382,0.560979,0.539499,1.0,0.949099,0.545419,0.444047,0.512594,0.551155
Dict_Swearwords_LIWC.txt,0.899173,0.598701,0.572167,0.57779,0.949099,1.0,0.52391,0.441013,0.507798,0.522287
MFD1_conservative.txt,0.736892,0.932003,0.690406,0.757989,0.545419,0.52391,1.0,0.793935,0.620627,0.819691
MFD1_liberal.txt,0.666667,0.750189,0.665192,0.747515,0.444047,0.441013,0.793935,1.0,0.615312,0.880053
2020-04-02_HatebaseVocabEN.csv,0.608243,0.638501,0.667632,0.525744,0.512594,0.507798,0.620627,0.615312,1.0,0.631034
MFD2_liberal.txt,0.760052,0.829174,0.681108,0.76629,0.551155,0.522287,0.819691,0.880053,0.631034,1.0


In [6]:
# Compute document-to-dictionary loadings from the two vector files written by
# the previous cells and store them in ddr_similarity.csv.
#
# `num_features` is intentionally NOT re-assigned to a literal 300 here: the
# value returned by `ddr.load_model` is reused, so the vector width passed to
# `get_loadings` always matches the embedding that produced the CSV files,
# even if a different model file is loaded. This cell therefore requires the
# model-loading cell to have been run (as in Restart Kernel -> Run All).
ddr.get_loadings(
    agg_doc_vecs_path='outputs/automated_results/ddr_result.csv',
    agg_dic_vecs_path='outputs/automated_results/Incivility&Diversity_ddr.csv',
    out_path='outputs/automated_results/ddr_similarity.csv',
    num_features=num_features,
    delimiter=',',
)

Percent: [##########] 97.28331177231566%  Failed to calculate 17 loadings due to missing values.
IDs for documents with missing values:

 ['UgxkUT3_s0uuwTQqBzt4AaABAg', 'UgyoBfV6rTg1SVYCXHp4AaABAg.8voprNsQx1_8voq4kNW6ao', 'Ugj9AC9DkzSW23gCoAEC.7-H0Z7-RgnU7-H1adLhYvH', 'Ugiddu2cd17RhHgCoAEC', 'UgxyFqIgmY4LrzPFkqB4AaABAg', 'UgwudzFskyOUx6io1y14AaABAg', 'UgyCUpQV3uOnxksrT7V4AaABAg.8sC9R4S_sAu8sCAoRN4vSB', '1175070616472470000', '1177208417213210000', '1176277960988530000', '1119429467166970000', '1187518291499200000', '1144790875463210000', 'UgjbZ-5j3DDFUngCoAEC.8HbxmNs46fn8HcPHwSYDkP', 'Ugxl9LogfcJR6_Kf6o94AaABAg', 'UgxQTFuRGd3oqHnGOEV4AaABAg', 'UgxeMleFdE5yT5uc_OB4AaABAg.8qLukX7LinM8qPLi9Mw_sJ']


In [7]:
# Manually coded indicator columns, grouped by construct. Both lists are used
# to select the label columns that get merged onto the similarity scores.
Incivility = [
    'HATELIST_FOCUSED_DUMMY',
    'INCIVILITY_DUMMY',
]
Diversity = [
    'HAS_OPINION_DUMMY',
    'LIBERAL_DUMMY',
    'CONSERVATIVE_DUMMY',
]

In [8]:
# Attach the comment text and the manually coded labels to the DDR similarity
# scores. The left join on `ID` keeps every row of the similarity file.
manual_coding = pd.read_csv('data/test.csv')
label_columns = ['ID', 'commentText'] + Incivility + Diversity

similarity = pd.merge(
    pd.read_csv('outputs/automated_results/ddr_similarity.csv'),
    manual_coding[label_columns],
    on='ID',
    how='left',
)
similarity.to_csv('outputs/automated_results/ddr_merged_similarity.csv', index=False)
similarity

Unnamed: 0,ID,Dict_Hostility_Ksiazek_2015.txt,MFD2_conservative.txt,Dict_Civility_Ksiazek_2015.txt,Dict_Incivility_Muddiman.txt,Dict_GoogeProject_OffensiveWords.txt,Dict_Swearwords_LIWC.txt,MFD1_conservative.txt,MFD1_liberal.txt,2020-04-02_HatebaseVocabEN.csv,MFD2_liberal.txt,commentText,HATELIST_FOCUSED_DUMMY,INCIVILITY_DUMMY,HAS_OPINION_DUMMY,LIBERAL_DUMMY,CONSERVATIVE_DUMMY
0,Ugx2WXq9UdV8mPPjejJ4AaABAg.8yHCKV0Boe58yYRxEQEF45,0.582558,0.552389,0.672019,0.540005,0.482691,0.509313,0.552224,0.595486,0.716762,0.569487,That's a vicious insult!!! What did a box of r...,0,1,0,0,0
1,UgwUPFScjJ0MCeaP2F54AaABAg.8lvp3fc9Euf8lvvgsUgEgV,0.499794,0.512552,0.653917,0.453743,0.384375,0.392038,0.506694,0.509908,0.715731,0.542373,Goya Solidar. So there are a few of us left. ...,0,0,0,0,0
2,UgwRZv_F4VDm_-Z-f6l4AaABAg,0.539589,0.440574,0.447391,0.469250,0.422268,0.475109,0.474650,0.422857,0.452075,0.467921,Trump is a Traitor! Lock that coward up!,1,1,1,1,0
3,UgyCwfMxdJiV_HEQR754AaABAg,0.588588,0.564070,0.682493,0.481848,0.476167,0.495299,0.545204,0.586969,0.791690,0.600203,america a country of idiots ran by the nra its...,1,1,1,1,0
4,UgxjeqM4xT2JgZHv4Mh4AaABAg,0.606171,0.455719,0.668502,0.427126,0.555055,0.628139,0.428654,0.444361,0.622406,0.461243,How sad are you guys gonna be when you still l...,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,Ugwu_8FyrPOPsiUFOwJ4AaABAg.8v37B9a3NuP8vXbaTvXU6l,0.586742,0.477756,0.713276,0.462311,0.507896,0.563385,0.477141,0.454090,0.660786,0.480637,@WanderfalkeAT I don't think they run a conspi...,0,0,1,0,0
752,UgzAKozZbWD42SwHjpN4AaABAg.8f-pKP_hEMZ8f1GHmo0Swp,0.589071,0.490093,0.689943,0.427199,0.490643,0.536689,0.487290,0.470565,0.652202,0.492991,usually schadenfreude isn't a good thing to fe...,0,0,1,1,0
753,UgzM7j1HgzgvgtjeWVN4AaABAg.8hTpR_HFok08hUod9kdzkz,0.778621,0.495752,0.487112,0.495029,0.783421,0.870256,0.439611,0.360962,0.459861,0.422452,you're a fucking moron,1,1,0,0,0
754,UgzYBOtnvf4HdbtbDol4AaABAg.8wRfggpcK3y8wZrYsJVJtQ,0.453290,0.433955,0.575924,0.357956,0.349724,0.375992,0.436485,0.429068,0.726150,0.478631,He used to be the white guy at baseball games ...,0,0,0,0,0


## Correlation

In [9]:
from scipy.stats import pearsonr
import numpy as np

In [10]:
# Correlation table between the DDR similarity scores and the manual codes,
# with significance stars appended (*** p<=.001, ** p<=.01, * p<=.05).

# `similarity` also holds string columns (`ID`, `commentText`). Recent pandas
# versions no longer drop those silently in DataFrame.corr, so the numeric
# (and boolean) columns are selected explicitly; this works on old and new
# pandas alike and yields the same columns the implicit filtering used to.
numeric_similarity = similarity.select_dtypes(include=['number', 'bool'])

rho = numeric_similarity.corr()

# With a callable `method`, DataFrame.corr puts 1.0 on the diagonal regardless
# of what the callable returns; subtracting the identity matrix turns those
# entries into p = 0 (so the diagonal also receives three stars).
pval = numeric_similarity.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)


def _significance_stars(p_value, levels=(0.001, 0.01, 0.05)):
    """Return one '*' for every threshold in `levels` that `p_value` does not exceed.

    NaN compares as False against every threshold and therefore gets no stars.
    """
    return '*' * int(sum(p_value <= level for level in levels))


# Column-wise Series.map replaces DataFrame.applymap, which is deprecated in
# recent pandas releases.
p = pval.apply(lambda column: column.map(_significance_stars))

matrix = rho.round(2).astype(str) + p
display(matrix)

Unnamed: 0,Dict_Hostility_Ksiazek_2015.txt,MFD2_conservative.txt,Dict_Civility_Ksiazek_2015.txt,Dict_Incivility_Muddiman.txt,Dict_GoogeProject_OffensiveWords.txt,Dict_Swearwords_LIWC.txt,MFD1_conservative.txt,MFD1_liberal.txt,2020-04-02_HatebaseVocabEN.csv,MFD2_liberal.txt,HATELIST_FOCUSED_DUMMY,INCIVILITY_DUMMY,HAS_OPINION_DUMMY,LIBERAL_DUMMY,CONSERVATIVE_DUMMY
Dict_Hostility_Ksiazek_2015.txt,1.0***,0.84***,0.79***,0.88***,0.91***,0.92***,0.8***,0.71***,0.64***,0.78***,0.23***,0.29***,0.23***,0.15***,0.08*
MFD2_conservative.txt,0.84***,1.0***,0.82***,0.91***,0.61***,0.6***,0.98***,0.92***,0.84***,0.96***,0.19***,0.27***,0.33***,0.16***,0.11**
Dict_Civility_Ksiazek_2015.txt,0.79***,0.82***,1.0***,0.78***,0.6***,0.62***,0.84***,0.84***,0.87***,0.85***,0.06,0.1**,0.21***,0.1**,0.08*
Dict_Incivility_Muddiman.txt,0.88***,0.91***,0.78***,1.0***,0.65***,0.67***,0.92***,0.88***,0.74***,0.9***,0.18***,0.3***,0.35***,0.18***,0.14***
Dict_GoogeProject_OffensiveWords.txt,0.91***,0.61***,0.6***,0.65***,1.0***,0.99***,0.52***,0.41***,0.38***,0.5***,0.23***,0.26***,0.09*,0.08*,0.04
Dict_Swearwords_LIWC.txt,0.92***,0.6***,0.62***,0.67***,0.99***,1.0***,0.52***,0.42***,0.39***,0.5***,0.22***,0.25***,0.1**,0.09*,0.05
MFD1_conservative.txt,0.8***,0.98***,0.84***,0.92***,0.52***,0.52***,1.0***,0.95***,0.85***,0.97***,0.17***,0.24***,0.35***,0.17***,0.12***
MFD1_liberal.txt,0.71***,0.92***,0.84***,0.88***,0.41***,0.42***,0.95***,1.0***,0.89***,0.97***,0.11**,0.21***,0.36***,0.14***,0.12***
2020-04-02_HatebaseVocabEN.csv,0.64***,0.84***,0.87***,0.74***,0.38***,0.39***,0.85***,0.89***,1.0***,0.88***,0.1**,0.16***,0.27***,0.1**,0.08*
MFD2_liberal.txt,0.78***,0.96***,0.85***,0.9***,0.5***,0.5***,0.97***,0.97***,0.88***,1.0***,0.15***,0.25***,0.34***,0.15***,0.13***
