In [2]:
import os
import gzip
import multiprocessing
import pickle
import numpy as np
from wordcloud import WordCloud
from collections import Counter
import nltk 
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import time

# One-time setup: uncomment and run once to download the NLTK data packages.
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')

In [3]:
# Load the comment corpus. The cells below index it by subreddit name and
# treat each value as a sized sequence of comment strings.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
# The context manager closes the gzip handle, which the bare
# `pickle.load(gzip.open(...))` form left open.
with gzip.open("../../Results/comments_corpus_alt.pickle.gz", "rb") as corpus_file:
    comments_corpus = pickle.load(corpus_file)

In [4]:
import string

# Translation table that deletes every ASCII punctuation character.
# Built once here instead of once per word inside the inner loop.
punctuation_table = str.maketrans('', '', string.punctuation)

# words_frequency[subreddit] is a Counter mapping normalised token -> occurrence count.
words_frequency = {}

for subreddit in comments_corpus:
    comments = comments_corpus[subreddit]
    subreddit_counts = Counter()
    words_frequency[subreddit] = subreddit_counts
    print("Working on {} subreddit".format(subreddit))
    print("The subreddit contains {} comments".format(len(comments)))
    for comment_index, comment in enumerate(comments):
        # Newlines become spaces and the text is split on single spaces, so
        # consecutive spaces (and punctuation-only words) end up counted
        # under the empty token ''.
        for word in comment.replace('\n', ' ').split(' '):
            # Normalise: lowercase, keep only the part before the first
            # apostrophe, then strip punctuation.
            filtered_word = word.lower().split("'")[0].translate(punctuation_table)
            subreddit_counts[filtered_word] += 1
        # Progress checkpoint every 100000 comments (raw epoch timestamp).
        if comment_index % 100000 == 0:
            print(time.time())
            print("{} comments finished for {}".format(comment_index, subreddit))

Working on The_Farage subreddit
The subreddit contains 24191 comments
1554622479.5766973
0 comments finished for The_Farage
Working on Le_Pen subreddit
The subreddit contains 31256 comments
1554622480.8347533
0 comments finished for Le_Pen
Working on altright subreddit
The subreddit contains 166436 comments
1554622482.4315522
0 comments finished for altright
1554622488.933078
100000 comments finished for altright
Working on progressive subreddit
The subreddit contains 87694 comments
1554622492.7311969
0 comments finished for progressive
Working on Conservative subreddit
The subreddit contains 1732239 comments
1554622499.3372216
0 comments finished for Conservative
1554622505.4718003
100000 comments finished for Conservative
1554622512.0654147
200000 comments finished for Conservative
1554622519.2255223
300000 comments finished for Conservative
1554622526.6080554
400000 comments finished for Conservative
1554622533.6017041
500000 comments finished for Conservative
1554622541.3295531
600

In [5]:
# Persist the raw per-subreddit word counts. The `with` block guarantees the
# file is flushed and closed even if pickling raises.
with open('../../Results/words_frequency_alt.pickle', 'wb') as pickle_out:
    pickle.dump(words_frequency, pickle_out)

In [6]:
# words_frequency_no_stopwords[subreddit]: counts per token with English
#   stopwords dropped.
# words_frequency_lemmatized[subreddit]: the same counts, merged under the
#   verb lemma of each token (several tokens can share one lemma, hence `+=`).
words_frequency_lemmatized = {}
words_frequency_no_stopwords = {}


stop_words = set(stopwords.words('english'))
lem = WordNetLemmatizer()

for subreddit in comments_corpus:
    lemmatized_counts = Counter()
    no_stopword_counts = Counter()
    words_frequency_lemmatized[subreddit] = lemmatized_counts
    words_frequency_no_stopwords[subreddit] = no_stopword_counts
    print("Working on {} subreddit".format(subreddit))
    print("The subreddit contains {} unique words".format(len(words_frequency[subreddit])))
    for key, (word, count) in enumerate(words_frequency[subreddit].items()):
        if word not in stop_words:
            word_lem = lem.lemmatize(word, "v")
            no_stopword_counts[word] = count
            lemmatized_counts[word_lem] += count
        # Progress checkpoint. This used to sit inside the stopword branch, so
        # a checkpoint was silently skipped whenever the word at that index
        # happened to be a stopword.
        if key % 1000000 == 0:
            print("{} words finished for {}".format(key, subreddit))
            print(time.time())

Working on The_Farage subreddit
The subreddit contains 34159 unique words
0 words finished for The_Farage
1554624043.0816793
Working on Le_Pen subreddit
The subreddit contains 41118 unique words
0 words finished for Le_Pen
1554624043.2162862
Working on altright subreddit
The subreddit contains 98552 unique words
0 words finished for altright
1554624043.374463
Working on progressive subreddit
The subreddit contains 66692 unique words
Working on Conservative subreddit
The subreddit contains 405881 unique words
Working on ukpolitics subreddit
The subreddit contains 1054448 unique words
1000000 words finished for ukpolitics
1554624049.5962791
Working on LateStageCapitalism subreddit
The subreddit contains 367570 unique words
Working on Libertarian subreddit
The subreddit contains 641981 unique words


In [7]:
# Show the 250 most frequent lemmas of every subreddit.
for subreddit, lemma_counts in words_frequency_lemmatized.items():
    print(subreddit)
    print(lemma_counts.most_common(250))

The_Farage
[('', 29024), ('people', 2933), ('get', 2503), ('like', 2362), ('say', 2214), ('would', 2015), ('eu', 1947), ('go', 1933), ('think', 1837), ('make', 1811), ('vote', 1738), ('leave', 1662), ('want', 1518), ('one', 1469), ('fuck', 1465), ('delete', 1432), ('uk', 1367), ('farage', 1321), ('know', 1286), ('see', 1265), ('right', 1246), ('even', 1185), ('take', 1158), ('us', 1157), ('good', 1122), ('brexit', 1108), ('need', 1103), ('gt', 1015), ('come', 991), ('time', 978), ('country', 913), ('look', 891), ('could', 877), ('also', 851), ('well', 817), ('back', 798), ('much', 789), ('work', 784), ('way', 771), ('party', 761), ('trump', 744), ('give', 744), ('desu', 744), ('british', 735), ('mean', 722), ('really', 721), ('use', 695), ('still', 681), ('try', 679), ('happen', 658), ('call', 657), ('let', 648), ('britain', 639), ('guy', 632), ('shit', 616), ('years', 611), ('tell', 596), ('europe', 593), ('may', 589), ('world', 581), ('never', 579), ('ukip', 574), ('many', 573), ('po

[('', 9110117), ('people', 1183494), ('would', 927708), ('say', 828657), ('think', 803814), ('get', 777519), ('like', 712677), ('go', 680298), ('make', 669496), ('gt', 612048), ('eu', 607714), ('one', 550328), ('vote', 514052), ('want', 487581), ('uk', 470009), ('leave', 456340), ('see', 427035), ('know', 417391), ('right', 405753), ('even', 399162), ('time', 387097), ('work', 377153), ('mean', 364045), ('point', 355813), ('need', 346161), ('take', 344763), ('much', 332917), ('labour', 328626), ('also', 325946), ('really', 319274), ('could', 314603), ('well', 312727), ('brexit', 312193), ('use', 308077), ('party', 305445), ('government', 304423), ('give', 302201), ('way', 294833), ('us', 292469), ('good', 290319), ('delete', 274665), ('come', 273077), ('still', 261748), ('look', 252960), ('country', 248778), ('actually', 245116), ('pay', 240497), ('may', 232218), ('try', 230912), ('seem', 228406), ('years', 227566), ('fuck', 223297), ('many', 223286), ('things', 214953), ('though', 208

In [8]:
# Tokens to drop from the frequency tables: the empty token, link/markup
# residue left after punctuation stripping, and a few short filler tokens
# and bare numbers.
# One entry per line with a trailing comma: the previous layout was missing
# two commas, which made Python concatenate adjacent string literals so the
# "energy..." and "form...docs.google..." tokens were never actually listed.
remove = [
    '',
    'permittedhttpswwwredditcomrpoliticswikirulesandregswikinopersonalattacks',
    'discussionhttpswwwredditcomrpoliticswikirulesandregswikipleasebecivil',
    'httpimgurcom0cf3yty',
    'subredditmessagecomposetorpolitics',
    'delete',
    "╚═███═╝",
    "つ",
    "◕◕",
    "ubotsbyliam",
    "rbotsbyliam",
    "༽つ",
    "subredditmessagecomposetorthedonald",
    "༼",
    "formhttpbotsbyliamcom",
    "utonysesek556",
    '͡°',
    "energyhttpswwwyoutubecomwatchvqligom24qqc",
    "formhttpsdocsgooglecomformsd1hdofbq85c6qhks8ydg0le6uiffbfj5sabm6qtp3zavqviewformresponses",
    "lightyears\r",
    "httpiimgurcomy2t4j8ajpg",
    "pagehttpsawswariomissedcomthewallgrows",
    "suggestionshttpsdocsgooglecomformsd1nbt77un01wxb3opvwc8llesd1e8jqj5mnkhitd4qqqkviewform",
    "httpwwwarchiveis",
    "pepehttpsslimgufbmzs",
    "brigadinghttparchiveisi4col",
    "moderatorshttpswwwredditcommessagecomposetorpoliticsampsubjectquestion",
    "im",
    "gt",
    "etc",
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
    "subredditmessagecomposetorneutralpolitics",
    "ruleshttpwwwredditcomrneutralpoliticswikiguidelineswikis",
    "guidelineshttpswwwredditcomrsandersforpresidentwikirules",
    "rsandersforpresident",
    "alonehttpswwwredditcomrmetarepublicancomments5t017athissubisforrepublicansifyoudonot",
    "subredditmessagecomposetordemocrats",
]

# Strip every unwanted token from both frequency tables of each subreddit.
# Counter's `del` is a no-op for absent keys, so tokens that never occurred
# in a given subreddit are harmless.
for subreddit in comments_corpus:
    no_stopword_counts = words_frequency_no_stopwords[subreddit]
    lemmatized_counts = words_frequency_lemmatized[subreddit]
    for unwanted in remove:
        del no_stopword_counts[unwanted]
        del lemmatized_counts[unwanted]

In [13]:
# Persist the cleaned tables. Each `with` block closes its file even if
# pickling fails part-way through.

# Stopword-free counts keyed by token.
with open('../../Results/com_word_freq_no_stopwords_alt.pickle', 'wb') as pickle_out:
    pickle.dump(words_frequency_no_stopwords, pickle_out)

# Stopword-free counts keyed by verb lemma.
with open('../../Results/com_word_freq_alt.pickle', 'wb') as pickle_out:
    pickle.dump(words_frequency_lemmatized, pickle_out)

In [10]:
# words_count_total[subreddit]: sum of all lemma counts in that subreddit.
# word_percentages[subreddit]: Counter of lemma -> percentage of that total.
words_count_total = {}
word_percentages = {}

for subreddit, lemma_counts in words_frequency_lemmatized.items():
    total = sum(lemma_counts.values())
    words_count_total[subreddit] = total
    shares = Counter()
    word_percentages[subreddit] = shares
    for word, count in lemma_counts.items():
        shares[word] = (100 * count) / float(total)

In [12]:
# Show the 250 lemmas with the largest percentage share in each subreddit.
for subreddit, shares in word_percentages.items():
    print(subreddit)
    print(shares.most_common(250))

The_Farage
[('people', 0.8675692384144159), ('get', 0.7403770213949141), ('like', 0.6986698060466588), ('say', 0.6548920197236675), ('would', 0.5960286448704563), ('eu', 0.5759145268301629), ('go', 0.5717733848806907), ('think', 0.5433769829414531), ('make', 0.5356862907495763), ('vote', 0.5140931934416143), ('leave', 0.4916127085730512), ('want', 0.4490181056641948), ('one', 0.43452410884104226), ('fuck', 0.4333409254269074), ('uk', 0.4043529317806023), ('farage', 0.3907463225180509), ('know', 0.38039346764437054), ('see', 0.37418175472016235), ('right', 0.36856163350302157), ('even', 0.3505180864374643), ('take', 0.3425315983920537), ('us', 0.34223580253852004), ('good', 0.3318829476648396), ('brexit', 0.3277418057153675), ('need', 0.3262628264476989), ('come', 0.29313369085192165), ('time', 0.2892883447559832), ('country', 0.27006161427629105), ('look', 0.26355410549854913), ('could', 0.25941296354907695), ('also', 0.2517222713572001), ('well', 0.24166521233705346), ('back', 0.23604

[('please', 1.0051527090263088), ('people', 0.9684983018302366), ('socialism', 0.9501186491622141), ('remove', 0.8244993801831154), ('work', 0.7292855863474471), ('get', 0.7161733188508963), ('like', 0.640343202737272), ('make', 0.5874295838454766), ('check', 0.5149936718323881), ('know', 0.5095876712616673), ('would', 0.5066130597210211), ('think', 0.4935307631216053), ('go', 0.45139168174983285), ('say', 0.4325062701926578), ('mean', 0.4272501241076119), ('slur', 0.41773436426725785), ('also', 0.4104851535227362), ('want', 0.4056411072732961), ('automatically', 0.4040414106387169), ('one', 0.3984031356152001), ('take', 0.38880870216986674), ('use', 0.37274430130552105), ('ban', 0.369387560826404), ('see', 0.3592349194219318), ('meet', 0.35711822481177435), ('question', 0.3562116051734414), ('post', 0.3561741415520227), ('pay', 0.33416051760638477), ('even', 0.3339095113428794), ('immediate', 0.33044787272378995), ('engels', 0.3267015105819183), ('friedrich', 0.3248021049759894), ('am

In [11]:
# Sanity check: the shares of each subreddit should add up to ~100
# (up to floating-point rounding).
for subreddit, shares in word_percentages.items():
    print(subreddit)
    print(sum(shares.values()))

The_Farage
99.99999999996076
Le_Pen
100.00000000005375
altright
100.00000000013179
progressive
100.00000000013843
Conservative
100.00000000121392
ukpolitics
100.00000000161782
LateStageCapitalism
99.999999999249
Libertarian
99.99999999850088


In [14]:
# Persist the per-subreddit percentage tables; `with` closes the file even
# if pickling raises.
with open('../../Results/com_word_freq_perc_alt.pickle', 'wb') as pickle_out:
    pickle.dump(word_percentages, pickle_out)

In [48]:
# NOTE(review): this rebinds `word_percentages` from a *different* file
# (com_word_freq_perc.pickle.gz) than the com_word_freq_perc_alt.pickle
# written earlier in this notebook -- confirm that is intended.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
# The context manager closes the gzip handle that the one-liner left open.
with gzip.open("../../Results/com_word_freq_perc.pickle.gz", "rb") as percentages_file:
    word_percentages = pickle.load(percentages_file)

In [49]:
# For every subreddit other than NeutralPolitics, store under the key
# "<subreddit>-NeutralPolitics" the subreddit's share of each word minus
# NeutralPolitics' share of that word (words absent from NeutralPolitics
# keep their full share).
word_percentages_filterd = {}

for subreddit, shares in word_percentages.items():
    if subreddit == 'NeutralPolitics':
        continue
    difference = Counter()
    word_percentages_filterd[subreddit + "-NeutralPolitics"] = difference
    for word, share in shares.items():
        if word in word_percentages['NeutralPolitics']:
            share -= word_percentages['NeutralPolitics'][word]
        difference[word] = share

In [50]:
# Tokens to drop from the difference tables: the empty token, link/markup
# residue left after punctuation stripping, and a few short filler tokens
# and bare numbers.
# One entry per line with a trailing comma: the previous layout was missing
# two commas, which made Python concatenate adjacent string literals so the
# "energy..." and "form...docs.google..." tokens were never actually listed.
remove = [
    '',
    'permittedhttpswwwredditcomrpoliticswikirulesandregswikinopersonalattacks',
    'discussionhttpswwwredditcomrpoliticswikirulesandregswikipleasebecivil',
    'httpimgurcom0cf3yty',
    'subredditmessagecomposetorpolitics',
    'delete',
    "╚═███═╝",
    "つ",
    "◕◕",
    "ubotsbyliam",
    "rbotsbyliam",
    "༽つ",
    "subredditmessagecomposetorthedonald",
    "༼",
    "formhttpbotsbyliamcom",
    "utonysesek556",
    '͡°',
    "energyhttpswwwyoutubecomwatchvqligom24qqc",
    "formhttpsdocsgooglecomformsd1hdofbq85c6qhks8ydg0le6uiffbfj5sabm6qtp3zavqviewformresponses",
    "lightyears\r",
    "httpiimgurcomy2t4j8ajpg",
    "pagehttpsawswariomissedcomthewallgrows",
    "suggestionshttpsdocsgooglecomformsd1nbt77un01wxb3opvwc8llesd1e8jqj5mnkhitd4qqqkviewform",
    "httpwwwarchiveis",
    "pepehttpsslimgufbmzs",
    "brigadinghttparchiveisi4col",
    "moderatorshttpswwwredditcommessagecomposetorpoliticsampsubjectquestion",
    "im",
    "gt",
    "etc",
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
    "subredditmessagecomposetorneutralpolitics",
    "ruleshttpwwwredditcomrneutralpoliticswikiguidelineswikis",
    "guidelineshttpswwwredditcomrsandersforpresidentwikirules",
    "rsandersforpresident",
    "alonehttpswwwredditcomrmetarepublicancomments5t017athissubisforrepublicansifyoudonot",
    "subredditmessagecomposetordemocrats",
]

# Also discard any token longer than 20 characters that appears among the
# 5000 highest-valued entries of any difference table.
for difference in word_percentages_filterd.values():
    remove.extend(
        word for word, _ in difference.most_common(5000) if len(word) > 20
    )

# Delete every collected token from every table. Counter's `del` ignores
# keys that are not present.
for difference in word_percentages_filterd.values():
    for unwanted in remove:
        del difference[unwanted]

In [51]:
# Reverse direction: under the key "NeutralPolitics-<subreddit>" store
# NeutralPolitics' share of each word minus the other subreddit's share
# (words absent from the other subreddit keep their full share).
# NOTE(review): these tables are created after the token-removal cell
# above, so that clean-up is not applied to them -- confirm this is intended.
for other_subreddit, other_shares in word_percentages.items():
    if other_subreddit == 'NeutralPolitics':
        continue
    key = 'NeutralPolitics-' + other_subreddit
    difference = Counter()
    word_percentages_filterd[key] = difference
    for word, share in word_percentages['NeutralPolitics'].items():
        if word in other_shares:
            share -= other_shares[word]
        difference[word] = share

In [42]:
# Show the 25 words with the largest positive share difference for each pair.
for pair_name, difference in word_percentages_filterd.items():
    print(pair_name)
    print(difference.most_common(25))

The_Farage-politics
[('eu', 0.5694430896395359), ('uk', 0.39492403591815933), ('farage', 0.3902673018476886), ('leave', 0.34445907381319174), ('brexit', 0.32414305927284204), ('desu', 0.22005727040476955), ('british', 0.21199023119569335), ('britain', 0.18659582180123765), ('ukip', 0.1695788928111311), ('barrage', 0.1648806896200584), ('europe', 0.16085644513853453), ('fuck', 0.15106937017729033), ('european', 0.14597838259150248), ('nigel', 0.13788225745238677), ('remain', 0.12977179376858156), ('muslims', 0.1234531248599251), ('labour', 0.1207086071957162), ('muslim', 0.11372216520227293), ('referendum', 0.11321458068192569), ('islam', 0.10145810127695684), ('country', 0.09677058877748637), ('london', 0.0966268323907357), ('police', 0.08776629493615354), ('racist', 0.08521067475157422), ('immigration', 0.08339987173106857)]
Le_Pen-politics
[('cocu', 5.821209050503136), ('france', 1.4851777910794846), ('de', 1.0670347576214543), ('cuck', 1.0383658629421455), ('le', 0.9902274579204378)

[('trump', 0.7822701904767171), ('vote', 0.36971346107824615), ('please', 0.244557716327777), ('clinton', 0.20241564125912592), ('president', 0.2006090939780289), ('hillary', 0.19462682446306565), ('say', 0.1789414366448916), ('comment', 0.17530320200006905), ('party', 0.16818646501989548), ('report', 0.15758695067879372), ('question', 0.15576585834748902), ('republicans', 0.13992414690620494), ('bernie', 0.1398074889193976), ('users', 0.1388468325939359), ('obama', 0.1369113174139317), ('state', 0.1335270979183112), ('attack', 0.12382480606957962), ('tax', 0.12295338358993255), ('pay', 0.12100833060736117), ('election', 0.11862895561133142), ('action', 0.11808198249955174), ('bot', 0.11490623520044585), ('automatically', 0.11308554683327518), ('campaign', 0.10908284150393484), ('news', 0.1080377527171119)]
politics-progressive
[('trump', 0.2647264354362957), ('please', 0.23461831632729313), ('question', 0.1725973965731029), ('comment', 0.1660093598765222), ('users', 0.1409705725489623

In [52]:
# Persist the pairwise difference tables; `with` closes the file even if
# pickling raises.
with open('../../Results/com_word_freq_unique.pickle', 'wb') as pickle_out:
    pickle.dump(word_percentages_filterd, pickle_out)