In [1]:
import os
import re
import string
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from pymongo import MongoClient

from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF

In [2]:
# Connection settings can be overridden through environment variables so the
# notebook does not need to be edited (or secrets committed) to point at a
# different deployment. The fallbacks keep the original hard-coded values so
# existing runs behave exactly as before.
# NOTE(review): the fallback credentials are visible to anyone who can read
# this notebook -- rotate them and drop the defaults once the environment
# variables are in place.
MONGO_HOST = os.environ.get("REDDIT_MONGO_HOST", "3.16.43.190")
MONGO_USER = os.environ.get("REDDIT_MONGO_USER", "redundant")
MONGO_PASSWORD = os.environ.get("REDDIT_MONGO_PASSWORD", "monkey")

client = MongoClient(MONGO_HOST)
db = client.redditproject

# NOTE(review): Database.authenticate() was removed in PyMongo 4.0; on newer
# drivers pass username/password to MongoClient instead -- confirm which
# PyMongo version this notebook is pinned to.
db.authenticate(MONGO_USER, MONGO_PASSWORD)

True

In [3]:
def _comment_filter(sub=None):
    """Build a Mongo query that skips "[deleted]"/"[removed]" comment bodies.

    When ``sub`` is given the query is additionally restricted to documents
    whose "sub" field equals it. Every call returns a fresh dict.
    """
    query = {"body": {"$nin": ["[deleted]", "[removed]"]}}
    if sub is not None:
        query["sub"] = sub
    return query


# One query per subreddit under study, plus one spanning all of them.
everything_filter = _comment_filter()
td_filter = _comment_filter("the_donald")
fc_filter = _comment_filter("fullcommunism")
con_filter = _comment_filter("conservative")
lsc_filter = _comment_filter("latestagecapitalism")
pol_filter = _comment_filter("politics")

In [4]:
def _fetch_bodies(query):
    """Return the "body" field of every document in the ``redditproject``
    collection that matches ``query``, in cursor order."""
    cursor = db.redditproject.find(query, {"_id": 0, "body": 1})
    return [document["body"] for document in cursor]


# Pull the raw comment text for each subreddit into memory.
td_corpus = _fetch_bodies(td_filter)
fc_corpus = _fetch_bodies(fc_filter)
con_corpus = _fetch_bodies(con_filter)
lsc_corpus = _fetch_bodies(lsc_filter)
pol_corpus = _fetch_bodies(pol_filter)
#corpus = [(document["body"],document["sub"]) for document in db.redditproject.find({},{"_id":0,"body":1,"sub":1})]

In [5]:
def clean_string(input_string):
    """Normalise a raw Reddit comment body into lower-case, space-separated text.

    Parameters
    ----------
    input_string : str
        The comment body as stored in the database.

    Returns
    -------
    str
        The cleaned text. Boilerplate comments (the r/LateStageCapitalism
        welcome message and "Your post was removed" notices) yield "".

    Notes
    -----
    The steps run in a fixed order: link targets and subreddit/user prefixes
    are stripped, hyphens and newlines become spaces, punctuation and every
    remaining non-letter are removed, and a handful of spelling variants are
    folded together. The "isnt "/"was " removals happen after whitespace is
    collapsed, so the result may still contain consecutive spaces.
    """
    if '#Welcome to r/LateStageCapitalism\n***\n\n#' in input_string:
        return ""
    if "Your post was removed" in input_string:
        return ""

    # Drop markdown link targets "(http...)". The match stops at the first
    # closing parenthesis so text between two parenthesised groups on the same
    # line is no longer swallowed.
    clean_s = re.sub(r"\(http[^)\n]*\)", "", input_string)
    # "/r/", "/u/" and any other single character wrapped in slashes.
    clean_s = re.sub("/./", "", clean_s)
    # A bare "r/" prefix; the word boundary keeps words that merely end in
    # "r" before a slash (e.g. "other/") intact.
    clean_s = re.sub(r"\br/", "", clean_s)
    clean_s = re.sub("-", " ", clean_s)
    # Newlines become spaces; all ASCII punctuation is deleted.
    clean_s = clean_s.translate(str.maketrans("\n", " ", string.punctuation))
    # Collapse "REEEE"-style words to "ree". At least two e's and word
    # boundaries are required so ordinary words starting with "re"
    # ("Really", "Reddit") are left alone.
    clean_s = re.sub(r"\b[rR][eE]{2,}\b", "ree", clean_s)
    clean_s = clean_s.lower()
    clean_s = re.sub("[^a-z ]+", "", clean_s)
    clean_s = re.sub(" +", " ", clean_s)
    # With punctuation gone a URL is one long token starting with "http".
    clean_s = re.sub("http[a-zA-Z]*", "", clean_s)
    clean_s = re.sub("(maga)+", "maga", clean_s)
    clean_s = re.sub("isnt ", " ", clean_s)
    clean_s = re.sub("was ", " ", clean_s)
    clean_s = re.sub('labour', "labor", clean_s)
    clean_s = re.sub("hilary", "hillary", clean_s)
    clean_s = re.sub("uspez", "spez", clean_s)

    return clean_s

In [6]:
# Run every comment of each subreddit through clean_string; boilerplate
# comments come back as "" and are filtered out later by the processor.
td_clean_corpus = list(map(clean_string, td_corpus))
con_clean_corpus = list(map(clean_string, con_corpus))
pol_clean_corpus = list(map(clean_string, pol_corpus))
lsc_clean_corpus = list(map(clean_string, lsc_corpus))
fc_clean_corpus = list(map(clean_string, fc_corpus))

In [7]:
# Combined corpus across all five subreddits; used to fit one shared vocabulary.
clean_corpus = [
    *td_clean_corpus,
    *con_clean_corpus,
    *pol_clean_corpus,
    *lsc_clean_corpus,
    *fc_clean_corpus,
]

In [8]:
class RedditCommentProcessor:
    """Lemmatise cleaned comments and feed them to a bag-of-words vectorizer.

    Parameters
    ----------
    stemmer
        Any object exposing ``lemmatize(word) -> str`` (despite the name, the
        ``lemmatize`` method is what gets called).
    vectorizer
        Any object exposing ``fit(docs)``, ``transform(docs)`` and either
        ``get_feature_names_out()`` or ``get_feature_names()``.

    Notes
    -----
    Empty and whitespace-only documents are dropped before fitting and before
    transforming, so the rows returned by ``transform`` do NOT line up
    one-to-one with the input corpus whenever it contains such documents.
    """

    def __init__(self,stemmer,vectorizer):
        self.stemmer = stemmer
        self.vectorizer = vectorizer

    def fit(self, corpus):
        """Learn the vectorizer's vocabulary from ``corpus``."""
        self.vectorizer.fit(self._lemmatize_corpus(corpus))

    def __space_or_empty_filter__(self, string):
        """Return True for documents that contain at least one non-space character."""
        # "".isspace() is False, so the empty string needs its own check.
        if string.isspace():
            return False
        if string == "":
            return False
        return True

    def _lemmatize_corpus(self, corpus):
        """Drop blank documents, then lemmatise each space-separated word."""
        kept_documents = filter(self.__space_or_empty_filter__, corpus)
        return [
            " ".join(self.stemmer.lemmatize(word) for word in document.split(" "))
            for document in kept_documents
        ]

    def get_feature_names(self):
        """Return the fitted vocabulary, one term per matrix column.

        scikit-learn deprecated ``get_feature_names`` in 1.0 and removed it in
        1.2 in favour of ``get_feature_names_out``; prefer the new method when
        the vectorizer has it and fall back to the old one otherwise.
        """
        names_out = getattr(self.vectorizer, "get_feature_names_out", None)
        if names_out is not None:
            # Convert to a list to keep the return type of the old API.
            return list(names_out())
        return self.vectorizer.get_feature_names()

    def transform(self, corpus):
        """Vectorise ``corpus`` with the fitted vocabulary (blank documents dropped)."""
        return self.vectorizer.transform(self._lemmatize_corpus(corpus))

In [9]:
# NLTK's English stop words plus corpus-specific filler. The extra entries are
# written in their cleaned form (no apostrophes); short ones such as "doe",
# "le" and "ha" are presumably lemmatiser output for "does"/"less"/"has" --
# TODO confirm.
stop_words = [
    *stopwords.words("english"),
    "tldr", "shit", "fuck", "wasnt", "using", "ago", "around", "fucking",
    "mr", "literally", "suck", "push", "time", "actually", "theyre", "think",
    "said", "never", "every", "long", "new", "day", "last", "back",
    "take", "first", "next", "need", "much", "white", "well", "mean",
    "could", "someone", "many", "make", "even", "know", "cant", "something",
    "say", "want", "way", "see", "doesnt", "point", "good", "hey",
    "like", "dont", "ive", "still", "doe", "yeah", "really", "lot",
    "one", "le", "would", "ha", "fact", "sure", "man", "didnt",
    "nd", "people", "link", "thing", "youre", "also", "thats", "im",
    "go", "year", "going", "get",
]

In [10]:
# Shared pipeline: WordNet lemmatisation followed by raw term counts. Terms
# must occur in at least 100 documents and in at most 20% of them.
proc = RedditCommentProcessor(
    stemmer=WordNetLemmatizer(),
    vectorizer=CountVectorizer(stop_words=stop_words, min_df=100, max_df=0.2),
)

In [11]:
# Learn a single vocabulary from the combined corpus so that every
# subreddit's matrix shares the same columns.
proc.fit(clean_corpus)

In [12]:
# Document-term matrices, one per subreddit, all over the shared vocabulary.
# Blank documents are dropped inside transform, so row i of a matrix is not
# necessarily document i of the corresponding clean corpus.
td_matrix, fc_matrix, pol_matrix, con_matrix, lsc_matrix = map(
    proc.transform,
    (td_clean_corpus, fc_clean_corpus, pol_clean_corpus, con_clean_corpus, lsc_clean_corpus),
)

In [13]:
# Vocabulary terms as reported by the fitted vectorizer; used below to turn
# topic component indices back into words.
feature_names = proc.get_feature_names()

In [14]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model
        Object with a ``components_`` attribute: one row of term weights per topic.
    feature_names
        Sequence mapping a column index to its term.
    no_top_words : int
        How many terms to list per topic, heaviest first.
    topic_names : sequence, optional
        Labels indexed by topic number; a falsy entry (or omitting the
        argument) falls back to printing the topic number.
    """
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_idx)
        # argsort is ascending, so reverse it and keep the leading entries.
        top_term_ids = weights.argsort()[::-1][:no_top_words]
        print(", ".join(feature_names[term_id] for term_id in top_term_ids))

# I tried LSA, but NMF seemed to give more coherent topics, along with the benefit of interpretability (no negative weights on words)

In [15]:
# The_Donald: factorise the term counts into 10 topics.
td_nmf = NMF(n_components=10, init="nndsvd")
td_topics = td_nmf.fit_transform(td_matrix)

In [16]:
# Conservative: factorise the term counts into 10 topics.
con_nmf = NMF(n_components=10, init="nndsvd")
con_topics = con_nmf.fit_transform(con_matrix)

In [17]:
# LateStageCapitalism: factorise the term counts into 10 topics.
lsc_nmf = NMF(n_components=10, init="nndsvd")
lsc_topics = lsc_nmf.fit_transform(lsc_matrix)

In [18]:
# FullCommunism: factorise the term counts into 10 topics.
fc_nmf = NMF(n_components=10, init="nndsvd")
fc_topics = fc_nmf.fit_transform(fc_matrix)

In [None]:
# Politics: factorise the term counts into 10 topics.
pol_nmf = NMF(n_components=10, init="nndsvd")
pol_topics = pol_nmf.fit_transform(pol_matrix)

In [None]:
# Ten heaviest terms of each The_Donald topic.
display_topics(td_nmf, feature_names, no_top_words=10)

In [None]:
# Ten heaviest terms of each FullCommunism topic.
display_topics(fc_nmf, feature_names, no_top_words=10)

In [None]:
# Ten heaviest terms of each LateStageCapitalism topic.
display_topics(lsc_nmf, feature_names, no_top_words=10)

In [None]:
# Ten heaviest terms of each Conservative topic.
display_topics(con_nmf, feature_names, no_top_words=10)

In [None]:
# Ten heaviest terms of each Politics topic.
display_topics(pol_nmf, feature_names, no_top_words=10)

# Now let's graphically look at the similarity of different topics between these communities

In [None]:
def cosine_similarity(top_1,top_2):
    """Return the cosine of the angle between two 1-D vectors.

    Parameters
    ----------
    top_1, top_2 : array-like
        Vectors of equal length (here: rows of an NMF ``components_`` matrix).

    Returns
    -------
    float
        ``dot(top_1, top_2) / (||top_1|| * ||top_2||)``. If either vector has
        zero norm the similarity is undefined; 0.0 is returned instead of
        letting 0/0 produce NaN and a RuntimeWarning.
    """
    norm_product = np.linalg.norm(top_1) * np.linalg.norm(top_2)
    if norm_product == 0:
        return 0.0
    return np.dot(top_1, top_2) / norm_product

In [None]:
def topic_similarity(sub_1,sub_2,same=False,threshold=1e-1):
    """Print every pair of topics whose cosine similarity exceeds ``threshold``.

    Parameters
    ----------
    sub_1, sub_2 : sequence of vectors
        Topic-term weights, one vector per topic (e.g. ``nmf.components_``).
    same : bool, default False
        Set to True when both arguments are the same topic set; pairs with
        ``i == j`` (a topic against itself) are then skipped, mirroring how
        ``heatmap_generator`` blanks the diagonal.
    threshold : float, default 0.1
        Only pairs with similarity strictly above this value are printed.
    """
    # Iterate over the topics actually present instead of assuming 10 x 10.
    for i, topic_a in enumerate(sub_1):
        for j, topic_b in enumerate(sub_2):
            if same and i == j:
                continue
            c_s = cosine_similarity(topic_a, topic_b)
            if c_s > threshold:
                print("Topics ({},{}) are {} similar".format(i,j,c_s))

In [None]:
def heatmap_generator(sub_1,sub_2, same=False):
    """Build the matrix of pairwise cosine similarities between two topic sets.

    Parameters
    ----------
    sub_1, sub_2 : sequence of vectors
        Topic-term weights, one vector per topic (e.g. ``nmf.components_``).
    same : bool, default False
        When True, entries with ``i == j`` are forced to 0 so that a topic
        set compared with itself does not show a saturated diagonal.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sub_1), len(sub_2))`` where entry ``[i][j]`` is
        the similarity between ``sub_1[i]`` and ``sub_2[j]``.
    """
    # Size the grid from the inputs rather than assuming 10 topics per model.
    a = np.zeros((len(sub_1), len(sub_2)))
    for i, topic_a in enumerate(sub_1):
        for j, topic_b in enumerate(sub_2):
            if same and i == j:
                continue  # the cell keeps its initial 0
            a[i][j] = cosine_similarity(topic_a, topic_b)
    return a

In [None]:
# Rows are The_Donald topics, columns are Conservative topics. Axis labels and
# a title are added so the figure can be read on its own, matching the other
# heatmap cells in this notebook.
sns.heatmap(heatmap_generator(td_nmf.components_,con_nmf.components_,False))
plt.xlabel("Conservative")
plt.ylabel("The_Donald")
plt.title("Topic Similarity (Brighter = Higher)")

## Interestingly, The_Donald and Conservative have next to no topics in common.

In [None]:
# Rows are Politics topics, columns are Conservative topics.
pol_con_similarity = heatmap_generator(pol_nmf.components_, con_nmf.components_, same=False)
sns.heatmap(pol_con_similarity)
plt.title("Topic Similarity (Brighter = Higher)")
plt.ylabel("Politics")
plt.xlabel("Conservative")
plt.savefig("pol_con.png", dpi=300)

## Here the two bright squares at 1,3 and 2,1 are interesting.

In [None]:
# Rows are Politics topics, columns are Late Stage Capitalism topics.
pol_lsc_similarity = heatmap_generator(pol_nmf.components_, lsc_nmf.components_, same=False)
sns.heatmap(pol_lsc_similarity)
plt.title("Topic Similarity (Brighter = Higher)")
plt.ylabel("Politics")
plt.xlabel("Late Stage Capitalism")
plt.savefig("pol_lsc.png", dpi=300)

Topic 0 of Late Stage Capitalism and topic 5 of Politics both show interesting features (i.e. bright lines running across the heatmap).

In [None]:
# Rows are Conservative topics, columns are Late Stage Capitalism topics.
con_lsc_similarity = heatmap_generator(con_nmf.components_, lsc_nmf.components_, same=False)
sns.heatmap(con_lsc_similarity)
plt.title("Topic Similarity (Brighter = Higher)")
plt.ylabel("Conservative")
plt.xlabel("Late Stage Capitalism")
plt.savefig("con_lsc.png", dpi=300)

Similarly here topic 0 and topic 7 overlap well.

In [None]:
# Rows are Late Stage Capitalism topics, columns are Full Communism topics.
# Axis labels and a title are added so the figure can be read on its own,
# matching the other heatmap cells in this notebook.
sns.heatmap(heatmap_generator(lsc_nmf.components_,fc_nmf.components_,False))
plt.xlabel("Full Communism")
plt.ylabel("Late Stage Capitalism")
plt.title("Topic Similarity (Brighter = Higher)")

## Hmm, it looks like Full Communism doesn't really overlap with Late Stage Capitalism. This is curious...