### Import Required Libraries

In [None]:
import pickle
import string
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Fetch the NLTK data packages: the stop-word lists and WordNet are used by
# the stop-word filter and the lemmatizer further down.
# NOTE(review): 'punkt' is downloaded as well, but no visible cell uses a
# punkt tokenizer -- confirm whether it is still needed.
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

### Load comments corpus 

In [None]:
# Load the pickled corpus. Later cells index it as a mapping of
# subreddit -> sequence of comment strings.
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load pickle files that were produced by a trusted step of this project.
# The context manager guarantees the file handle is closed after loading.
with open("../../Results/comments_corpus.pickle", "rb") as corpus_file:
    comments_corpus = pickle.load(corpus_file)

### Simple preprocessing (lowercase, strip punctuation, `\n` and apostrophe suffixes) and word counting

In [None]:
# Per-subreddit raw token counts: {subreddit: Counter({token: occurrences})}.
words_frequency = {}

# The punctuation-stripping table is identical for every word, so build it
# once instead of once per token.
punctuation_table = str.maketrans('', '', string.punctuation)

for subreddit in comments_corpus:
    subreddit_comments = comments_corpus[subreddit]
    subreddit_counts = Counter()
    words_frequency[subreddit] = subreddit_counts
    print("Working on {} subreddit".format(subreddit))
    print("The subreddit contains {} comments".format(len(subreddit_comments)))
    for key, comment in enumerate(subreddit_comments):
        # Normalisation per token: lowercase, keep only the part before the
        # first apostrophe ("don't" -> "don"), then strip ASCII punctuation.
        # Splitting on a single space means consecutive spaces produce empty
        # tokens, which are counted under '' here.
        # NOTE(review): only '\n' is turned into a space, so '\r' and tabs
        # stay attached to tokens (e.g. "lightyears\r").
        subreddit_counts.update(
            word.lower().split("'")[0].translate(punctuation_table)
            for word in comment.replace('\n', ' ').split(' ')
        )
        # Progress report (also fires for the very first comment, key == 0).
        if key % 100000 == 0:
            print("{} comments finished for {}".format(key, subreddit))

### Lemmatizing words and removing Stopwords

In [None]:
# Per-subreddit counts keyed by lemma, with English stop words dropped:
# {subreddit: Counter({lemma: occurrences})}.
words_frequency_lemmatized = {}

stop_words = set(stopwords.words('english'))
lem = WordNetLemmatizer()

for subreddit in comments_corpus:
    subreddit_counts = words_frequency[subreddit]
    lemma_counts = Counter()
    words_frequency_lemmatized[subreddit] = lemma_counts
    print("Working on {} subreddit".format(subreddit))
    print("The subreddit contains {} unique words".format(len(subreddit_counts)))
    for key, (word, count) in enumerate(subreddit_counts.items()):
        # Stop words are filtered on the surface form, before lemmatizing.
        if word not in stop_words:
            # "v" lemmatizes every token as a verb; different surface forms
            # that share a lemma have their counts merged.
            lemma_counts[lem.lemmatize(word, "v")] += count
        # Report progress at loop level so a milestone is not skipped just
        # because the word at that index happens to be a stop word.
        if key % 1000000 == 0:
            print("{} words finished for {}".format(key, subreddit))

### Filter more words after manual inspection (Reddit-specific tokens)

In [None]:
# Tokens to drop after manual inspection of the counts: the empty string,
# flattened URLs / bot boilerplate, emoticon fragments, and bare numbers.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python, which would turn two entries into one
# token that never matches anything.
remove = [
    '',
    'permittedhttpswwwredditcomrpoliticswikirulesandregswikinopersonalattacks',
    'discussionhttpswwwredditcomrpoliticswikirulesandregswikipleasebecivil',
    'httpimgurcom0cf3yty',
    'subredditmessagecomposetorpolitics',
    'delete',
    "╚═███═╝", "つ", "◕◕", "ubotsbyliam",
    "rbotsbyliam", "༽つ", "subredditmessagecomposetorthedonald",
    "༼", "formhttpbotsbyliamcom", "utonysesek556", '͡°',
    "energyhttpswwwyoutubecomwatchvqligom24qqc",
    "formhttpsdocsgooglecomformsd1hdofbq85c6qhks8ydg0le6uiffbfj5sabm6qtp3zavqviewformresponses",
    "lightyears\r", "httpiimgurcomy2t4j8ajpg", "pagehttpsawswariomissedcomthewallgrows",
    "suggestionshttpsdocsgooglecomformsd1nbt77un01wxb3opvwc8llesd1e8jqj5mnkhitd4qqqkviewform",
    "httpwwwarchiveis", "pepehttpsslimgufbmzs", "brigadinghttparchiveisi4col",
    "moderatorshttpswwwredditcommessagecomposetorpoliticsampsubjectquestion", "im", "gt",
    "etc", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
    "subredditmessagecomposetorneutralpolitics",
    "ruleshttpwwwredditcomrneutralpoliticswikiguidelineswikis",
    "guidelineshttpswwwredditcomrsandersforpresidentwikirules", "rsandersforpresident",
    "alonehttpswwwredditcomrmetarepublicancomments5t017athissubisforrepublicansifyoudonot",
    "subredditmessagecomposetordemocrats",
]

# Strip every blacklisted token from each subreddit's lemmatized counts.
# Deleting a key that is absent from a Counter is a silent no-op, so tokens
# that never occurred in a given subreddit are harmless here.
for subreddit in comments_corpus:
    lemma_counts = words_frequency_lemmatized[subreddit]
    for unwanted_token in remove:
        del lemma_counts[unwanted_token]

### Transform absolute word counts into relative counts

In [None]:
# words_count_total: {subreddit: total number of remaining word occurrences}
# word_percentages:  {subreddit: Counter({word: share of that total, in %})}
words_count_total = {}
word_percentages = {}

for subreddit, lemma_counts in words_frequency_lemmatized.items():
    total = sum(lemma_counts.values())
    words_count_total[subreddit] = total
    # Each word's count expressed as a percentage of the subreddit total.
    word_percentages[subreddit] = Counter({
        word: (100 * count) / float(total)
        for word, count in lemma_counts.items()
    })
        
# Persist the per-subreddit percentage counters. The context manager closes
# the file even if pickling raises part-way through.
with open('../../Results/WordCounts/word_freq.pickle', 'wb') as pickle_out:
    pickle.dump(word_percentages, pickle_out)