# 1.0 Initializations

In [35]:
!python -V

Python 3.7.10


In [2]:
#!pip install gensim

import pandas as pd
import numpy as np
import os
import timeit
import zipfile as zf
import pickle
import json
from gensim.models.coherencemodel import CoherenceModel
from gensim.test.utils import common_corpus, common_dictionary, common_texts
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import cross_val_score, cross_validate
from nltk.stem import PorterStemmer, WordNetLemmatizer
#snowball and lancaster stemmers also available
from nltk.tokenize import word_tokenize

Slow version of gensim.models.doc2vec is being used
Slow version of Fasttext is being used
[nltk_data] Downloading package wordnet to /opt/conda/nltk_data...
[nltk_data] Downloading package punkt to /opt/conda/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /opt/conda/nltk_data...


In [3]:
#Code to unzip files in the Jupyter directories


# files = zf.ZipFile("archive.zip", 'r')
# files.extractall()
# files.close()

os.getcwd()

'/home/jovyan/work/reddit_data'

In [4]:
#FOR PARAMETER TUNING, CREATE SAVEFILE OF DIFFERENT PARAMETERS.

# df_save = pd.DataFrame(columns = ['run_num', 'max_feat', 'min_df', 'max_df', 'ngram_range',
#        'n_components', 'random_state', 'top_terms', 'coherence_type', "num_features",
#        'coherence', 'perplexity', 'cohe_folds', 'perp_folds'])
# BE CAREFUL! IF YOU RUN THE BELOW, YOU MAY OVERWRITE VALUABLE DATA!
# df_save.to_csv('GME_record.csv', index = False)
# del df_save

# 2.0 Definitions

## 2.1 Preprocessing (Lemmatization, Build List of Documents)

In [5]:
#Builds a list of text documents, where "folders" parameter is the directories.
#Removes duplicates inside and between folders

#TAKES THE CSV'S FROM DIFFERENT SUBREDDITS AND COMBINES THEM. DROPS IRRELEVANT FIELDS.
def build_and_simplify_dataframe(folders):
    
    full_df = pd.DataFrame(columns = ['id', 'title', 'selftext'])
    for fold in folders:
        df = pd.read_csv(fold + "/submissions_reddit.csv")
        df = df[['id', 'title', 'selftext']] 
        full_df = full_df.append(df)
    return full_df

#LEMMATIZE BY BREAKING APART AND REBUILDING
def lemmatize_breakup(doc):
    lemmatizer = WordNetLemmatizer()
    toks = word_tokenize(doc)
    new = [lemmatizer.lemmatize(t.lower(), pos = "v") for t in toks]
    new = [lemmatizer.lemmatize(t.lower(), pos = "n") for t in new]
    new = [lemmatizer.lemmatize(t.lower(), pos = "a") for t in new]
    new = [lemmatizer.lemmatize(t.lower(), pos = "r") for t in new]
    new = [lemmatizer.lemmatize(t.lower(), pos = "s") for t in new]
    strang = " ".join(new)
    return strang

#REMOVES IRRELEVANT POSTS, SAVES NEW DF TO FILE, INCORPORATES LEMMATIZER ABOVE. 
def build_and_lemmatize_text_list(df, lemmatize = True, save_string = "GME_df"):
    doc_list = []
    index_list = []
    print("Dateframe size prior to dropping stuff: " + str(len(df)))
    df.dropna(axis = 0, how = 'any', subset = ['selftext'], inplace = True)
    df = df[df["selftext"] != "[deleted]"]
    df = df[df['selftext'] != "[removed]"]
    df.drop_duplicates(subset = 'selftext', keep = "first", inplace = True)
    
    if lemmatize == True:
        
        df['selftext'] = df['selftext'].apply(lemmatize_breakup)
        df.to_csv(save_string, index = False)
        print("New " + save_string + ".csv file saved to directory.")
        
    return df

## 2.2 Vectorization of Word Documents

In [6]:
#Returns the sparse vectors 'v' and the feature names for the sparse vectors. 

def get_vectors(doc_list, max_features = 20000, strip_accents = None, preprocessor = None,
                lowercase = True, min_df = 50, max_df = 0.90, ngram_range = (1,1), stop_words = 'english'):
    
    model = CountVectorizer(max_features = max_features, strip_accents = strip_accents, 
                            preprocessor = preprocessor, lowercase = lowercase, 
                            min_df = min_df, max_df = max_df, ngram_range = ngram_range, 
                            stop_words = stop_words)
    
    tf_matrix = model.fit_transform(doc_list)
    
    try:
        return {
            'matrix': tf_matrix, 
            'vectorizer': model, 
            'features': model.get_feature_names_out(), 
            'parameters': {'max_features': max_features, 
                           'min_df': min_df, 
                           'max_df': max_df, 
                           'ngram_range': ngram_range}}
    except:
        return {'matrix': tf_matrix, 'vectorizer': model, 'features': model.get_feature_names(), 
                'parameters': {'max_features': max_features, 
                           'min_df': min_df, 
                           'max_df': max_df, 
                           'ngram_range': ngram_range}}

## 2.3 Coherence

In [7]:
#DEFINITIONS

#Gets the 'topics' as input into the coherence algorithm. Is a nested list of top ten terms for each topic.
def get_top_terms(lda_model_components, tf_matrix_words, n_terms = 10):
    top_terms = []
    for index, term_weights in enumerate(lda_model_components):
        topic_top_terms = [tf_matrix_words[i] for i in term_weights.argsort()[::-1][:n_terms]]
#         print(term_weights.argsort()[::-1][:n_terms])
#         print(term_weights[term_weights.argsort()[::-1][:n_terms]])
#         print(term_weights.mean())
#         print(topic_top_terms)
        top_terms.append(topic_top_terms)       
    return top_terms  

#Gets specific dictionary object for use in GenSim - contains indexes of the words. 
def get_gensim_dict(tf_matrix_words):
    word_dict = Dictionary([tf_matrix_words])
    return word_dict
            
#Combines above functions with other measures to acquire coherence model. 
def cohe_score_func(estimator,
                    X,
                    feat_names,
                    #get_macro_score = False, 
                    n_terms = 10, 
                    coherence_type = "u_mass",
                    ):
    
    model_components = estimator.components_
    
    topics = get_top_terms(model_components, feat_names, n_terms = n_terms)
    
    dictionary = get_gensim_dict(feat_names)
    
    #Puts together a bag-of-words in the sparse format readable by GenSim. 
    corpus = Sparse2Corpus(X, documents_columns = False)
    
    cm = CoherenceModel(topics = topics, corpus = corpus, dictionary = dictionary, coherence = coherence_type)

    return cm.get_coherence(), cm.get_coherence_per_topic()

## 2.4 Cross Validation/Perplexity

In [8]:
#Custom score function for use in the cross validation scorer. 
def perp_score_func(estimator, X):
    perplexity = estimator.perplexity(X)
    return perplexity


#Get's the perplexity scores only without acquiring other info of the model folds. 
def lda_cross_val_perplexity(estimator, X, return_fold_scores = False):
    #perplexity = perp_score_func(estimator, X)
    scores = cross_val_score(estimator, X, scoring = perp_score_func)
    mean_score = scores.mean()
    if return_fold_scores == True:
        return (scores, mean_score)
    else:
        return mean_score
    
    
#Returns full stats including perplexity, but not including coherence. Returns estimator for coherence function. 
def lda_cross_val(X = None, feat_names = None, docs = None, return_fold_score = False, n_topics = 2):
    
    if X == None:
        try: 
            vectorizer = get_vectors(docs)
            X = vectorizer[0]
            feat_names = vectorizer[2]
        except:
            print("Need to input document list to acquire term frequency matrix ('doc_list' parameter).\
            \nAlternatively, can also enter matrix ('X' parameter) AND feature names ('feat_names' parameter) directly.")
    
    estimator = LatentDirichletAllocation(n_components = n_topics)
    
    scorer_dict = {"Perplexity": perp_score_func}
   
    stats = cross_validate(estimator, X, scoring = scorer_dict, return_estimator = True)

    return stats


#Gets coherence values for the estimators saved in the cross validation function above. 
def get_cross_val_coherence(estimator_list, matrix, feat_names, n_terms = 3):
    coherence_stats = []
    for est in estimator_list:
        cohe_, cohe_topics = cohe_score_func(est, matrix, feat_names, n_terms = n_terms)
        coherence_stats.append((cohe_, cohe_topics))
    return coherence_stats

## 2.5 Parameter Tuning

In [9]:
def get_parameters_df(p_d, record):
    #DETERMINE IF 'RECORD' ALEADY HAS RUNS LOGGED. DETERMINE RUN NUMBER.
    if np.isnan(record['run_num'].max()):
        run = 1
    else:
        run = record['run_num'].max() + 1
    #CREATE TESTING DATAFRAME
    df_cols = ['run_num', 'max_feat', 'min_df', 'max_df', 
               'ngram_range','n_components', 'random_state', 'top_terms', 'coherence_type']
    df = pd.DataFrame(columns = df_cols)
    for max_f in p_d['max_feat']:
        for mndf in p_d['min_df']:
            for mxdf in p_d['max_df']:
                for gram in p_d['ngram_range']:
                    for comp in p_d['n_components']:
                        for rs in p_d['random_state']:
                            for term in p_d['top_terms']:
                                df.loc[len(df.index)] = [run, max_f, mndf, mxdf, gram, 
                                                         comp, rs, term, p_d['coherence_type']]
    return df



def initialize_vectorizer(doc_input, initial_vectorizer):
    #DETERMINE IF INPUT IS A VECTORIZER OBJECT OR A DOCUMENT LIST. ASSIGN VARIABLES.
    #VECTORIZER OBJECT WILL COME FROM CUSTOM FUNCTION "get_vectorizer". 
    if type(doc_input) is list and type(doc_input[0]) is str:
        doc_list = doc_input
        vectorizer = initial_vectorizer
        try:
            existing_params = list(initial_vectorizer['parameters'].values())
            existing_features = initial_vectorizer['features']
            existing_vector = initial_vectorizer['matrix']
        except:
            print("There is an issue with the inputted initial vectorizer, or no vectorizer has been entered. \
A new vectorizer will be trained.")
            existing_params = []
            existing_features = None
            existing_vector = None
    elif type(doc_input) is dict and str(type(doc_input['matrix'])) == "<class 'scipy.sparse.csr.csr_matrix'>":
        vectorizer = doc_input
        existing_params = list(vectorizer['parameters'].values())
        existing_features = vectorizer['features']
        existing_vector = vectorizer['matrix']
        doc_list = None       
    else:
        print("Parameter 'doc_input' needs to be a vectorizer library or a document list.")
        return None
    
    return doc_list, existing_params, existing_features, existing_vector


def get_new_record_line(param_series, stats, coherence_stats): 
    summation = []
    for i in list(coherence_stats):
        summation.append(i[0])
    #Below is average coherence over the topics and over the folds. 
    cohe = np.array(summation).mean()

    perp = stats['test_Perplexity'].mean()
    cohe_folds = coherence_stats
    perp_folds = list(stats['test_Perplexity'])
    
    param_series['coherence'] = cohe
    param_series['perplexity'] = perp
    param_series['cohe_folds'] = cohe_folds
    param_series['perp_folds'] = perp_folds
    
    return param_series


def find_parameters(doc_input,
                    p_d,
                    #THE BELOW IS A LIBRARY!
                    initial_vectorizer = None, 
                    record_string = 'record.csv'):
    
    #OPENS RECORD CSV TO RECORD RESULTS
    record = pd.read_csv(record_string)
    #CREATE TESTING DATAFRAME
    df = get_parameters_df(p_d, record)

    #DETERMINE IF INPUT IS A VECTORIZER OBJECT OR A DOCUMENT LIST. ASSIGN VARIABLES. 
    doc_list, existing_params, existing_features, existing_vector = initialize_vectorizer(doc_input, initial_vectorizer)
    
    #RUN THROUGH TEST QUEUE.
    for i in list(df.index):
        tic = timeit.default_timer()
        param_series = df.loc[i]
        
        #CHECK TO SEE IF EXISTING VECTORIZER PARAMETERS NEED TO BE CHANGED. SKIP VECTORIZATION IF NOT.
        proposed_params = list(df.loc[i][['max_feat', 'min_df', 'max_df', 'ngram_range']])
        if proposed_params != existing_params:
            try:
                vectorizer = get_vectors(doc_list, 
                                         max_features = proposed_params[0], 
                                         min_df = proposed_params[1], 
                                         max_df = proposed_params[2], 
                                         ngram_range = proposed_params[3])
                existing_params = list(vectorizer['parameters'].values())
                existing_features = vectorizer['features']
                existing_vector = vectorizer['matrix']
            except:
                print("Function is trying to create a new term-frequency matrix, but does not have a document listed inputed.")
        
        #ADD NUMBER OF FEATURES TO SERIES (SO IT'S RECORDED IN THE SAVEFILE). 
        param_series['num_features'] = len(existing_features)
        
        #RUN CROSS VALIDATION LDA
        stats = lda_cross_val(X = existing_vector, 
                              feat_names = existing_features, 
                              docs = None, 
                              return_fold_score = False, 
                              n_topics = param_series['n_components'])
        
        #GET COHERENCE SCORES
        coherence_stats = get_cross_val_coherence(stats['estimator'], 
                                                   existing_vector, 
                                                   existing_features, 
                                                   n_terms = param_series['top_terms'])
        
        #CREATE NEW LINE IN RECORD DATAFRAME, AND SAVE TO DISK.
        param_series = get_new_record_line(param_series, stats, coherence_stats)
        record = record.append(param_series, ignore_index = True)
        record.to_csv(record_string, index = False)
        toc = timeit.default_timer()
        print("New record saved to " + record_string)
        print("Time to process: " + str((toc - tic)/60) + " minutes.")
    
    record.to_csv(('z_GME_backup/' + record_string), index = False)
    return record


In [10]:
pd.read_csv('GME_record.csv').tail(10)

Unnamed: 0,run_num,max_feat,min_df,max_df,ngram_range,n_components,random_state,top_terms,coherence_type,num_features,coherence,perplexity,cohe_folds,perp_folds
97,28,2000,25,0.9,"(1, 3)",30,,10,u_mass,2000,-1.613478,701.198114,"[(-1.681994367957814, [-1.5440949207642278, -1...","[699.8871014787168, 713.4140075542011, 708.125..."
98,28,2000,25,0.9,"(1, 3)",40,,10,u_mass,2000,-1.668558,708.835664,"[(-1.7319069657377444, [-2.134157983378171, -2...","[721.9857869915812, 727.837378756216, 725.3954..."
99,29,5000,25,0.9,"(1, 3)",3,,10,u_mass,5000,-1.01054,1440.806898,"[(-1.0453698646405527, [-1.0530402121555689, -...","[1384.1721391853207, 1404.3580567263928, 1488...."
100,29,5000,25,0.9,"(1, 3)",6,,10,u_mass,5000,-1.055711,1317.84061,"[(-1.075684503870544, [-0.35808804343104356, -...","[1295.6487029522536, 1291.9178808049646, 1318...."
101,29,5000,25,0.9,"(1, 3)",9,,10,u_mass,5000,-1.213906,1289.216347,"[(-1.1437053231712997, [-0.9178634505267752, -...","[1246.018339541864, 1268.1286730278662, 1287.0..."
102,29,5000,25,0.9,"(1, 3)",12,,10,u_mass,5000,-1.313875,1269.082053,"[(-1.3826327438814143, [-1.130950762375651, -1...","[1261.9101954802672, 1271.790242408108, 1267.2..."
103,29,5000,25,0.9,"(1, 3)",15,,10,u_mass,5000,-1.428041,1263.52317,"[(-1.3544770449921797, [-0.6017384438101522, -...","[1236.6454153102898, 1265.9256360283305, 1267...."
104,29,5000,25,0.9,"(1, 3)",20,,10,u_mass,5000,-1.456773,1268.237848,"[(-1.4512366231497817, [-1.423856902013662, -1...","[1267.0852759802863, 1280.2341195701515, 1267...."
105,29,5000,25,0.9,"(1, 3)",30,,10,u_mass,5000,-1.631729,1287.696451,"[(-1.6646458271124536, [-1.6244254120172201, -...","[1275.2662086224757, 1289.2456306405647, 1286...."
106,29,5000,25,0.9,"(1, 3)",40,,10,u_mass,5000,-1.695101,1314.825297,"[(-1.7278349322838182, [-1.6245292305320893, -...","[1324.7586559714816, 1301.6655214716618, 1324...."


# 3.0 Clean and Preprocess Data

Inputs: 
    'folder_list' - list of all the subreddit folder to process to create a word list object.
    
Outputs:
    'doc_list' - list of lemmatized documents

Save Files:
    dataframe (.csv, name provided in function) - stored so a new doc_list can be made quickly

In [11]:
#List of folders from which to pull text data - each folder name represents a sub-reddit. 

folder_list = [
#                'finance',
#                'financialindependence',
#                'forex',
               'gme',
#                'investing',
#                'options',
#                'pennystocks',
#                'personalfinance',
#                'robinhood',
#                'robinhoodpennystocks',
#                'securityanalysis',
#                'stockmarket',
#                'stocks',
#                'wallstreetbets'
              ]

In [12]:
#CREATES DF OF PERTINENT POSTS AND LEMMATIZES EACH MAIN BODY OF TEXT. 
#WARNING! LONG RUN TIME! (APPX. 20 MINUTES FOR ALL 300,000+ TEXT FILES)

# tic = timeit.default_timer()

# df = build_and_simplify_dataframe(folder_list)

# toc1 = timeit.default_timer()
# print("DF created, time to process: " + str(toc1 - tic) + " seconds")

# df = build_and_lemmatize_text_list(df, save_string = "GME_df.csv")

# toc2 = timeit.default_timer()
# print("Lists created, time to process: " + str(toc2 - toc1))

# df

In [13]:
df = pd.read_csv('GME_df.csv')
df.head(5)

Unnamed: 0,id,title,selftext
0,kqfajb,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,after watch this i take a position right away ...
1,kqvp7l,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,this guy explain exactly how to take a positio...
2,krnthg,ICR conference (11th Jan),any speculation or idea on what gamestop might...
3,kuo3w1,"GME is FINALLY going to the moon, this technic...","after some downwards movement , i think everyb..."
4,kv1w9e,"Holly f*ck, our GME rollercoaster will break o...",guy ... we retard have fantasize a long time a...


In [14]:
#CREATES LIST OF TEXT POSTS FROM DATAFRAME

doc_list = list(df['selftext'])
del df

In [15]:
#TEST
print('Number of docs: ')
print(len(doc_list))
doc_list[0]

Number of docs: 
94039


'after watch this i take a position right away 🚀 http : //youtu.be/mjhs9yg8kfe'

# 4.0 Parameter Tuning via Cross Validation

In [16]:
p_d = {"max_feat": [20000], 
       "min_df": [25], 
       "max_df": [0.9], 
       "ngram_range": [(1, 3)], 
       "n_components": [30, 40], 
       "random_state": [None],
       "top_terms": [10],
       "coherence_type": "u_mass", 
      }

In [17]:
record = pd.read_csv('GME_record.csv')
# record.append(series)
record

Unnamed: 0,run_num,max_feat,min_df,max_df,ngram_range,n_components,random_state,top_terms,coherence_type,num_features,coherence,perplexity,cohe_folds,perp_folds
0,1,10000,25,0.9,"(1, 1)",3,,10,u_mass,8818,-1.172181,1528.690373,"[(-1.2391441707465813, [-1.363810344020599, -1...","[1483.379826238543, 1438.6850387721222, 1523.1..."
1,1,10000,25,0.9,"(1, 1)",10,,10,u_mass,8818,-1.385576,1482.933114,"[(-1.3321428129677502, [-1.8733632817399162, -...","[1435.1529394856345, 1436.2164481286475, 1461...."
2,1,10000,25,0.9,"(1, 1)",20,,10,u_mass,8818,-1.563672,1552.835436,"[(-1.5901819655386853, [-1.969168522668718, -1...","[1525.3552503362255, 1528.414094872189, 1545.5..."
3,1,10000,25,0.9,"(1, 1)",30,,10,u_mass,8818,-1.727125,1625.043304,"[(-1.6743555429386885, [-2.830421254356006, -1...","[1626.0165103640497, 1594.8041236529816, 1649...."
4,1,10000,25,0.9,"(1, 1)",40,,10,u_mass,8818,-1.815742,1736.758371,"[(-1.8387481941871464, [-1.3055692832481556, -...","[1739.6433707081894, 1720.7549714181073, 1764...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,29,5000,25,0.9,"(1, 3)",12,,10,u_mass,5000,-1.313875,1269.082053,"[(-1.3826327438814143, [-1.130950762375651, -1...","[1261.9101954802672, 1271.790242408108, 1267.2..."
103,29,5000,25,0.9,"(1, 3)",15,,10,u_mass,5000,-1.428041,1263.523170,"[(-1.3544770449921797, [-0.6017384438101522, -...","[1236.6454153102898, 1265.9256360283305, 1267...."
104,29,5000,25,0.9,"(1, 3)",20,,10,u_mass,5000,-1.456773,1268.237848,"[(-1.4512366231497817, [-1.423856902013662, -1...","[1267.0852759802863, 1280.2341195701515, 1267...."
105,29,5000,25,0.9,"(1, 3)",30,,10,u_mass,5000,-1.631729,1287.696451,"[(-1.6646458271124536, [-1.6244254120172201, -...","[1275.2662086224757, 1289.2456306405647, 1286...."


In [18]:
#find_parameters(doc_list, p_d, record_string = "GME_record.csv")

In [24]:
df = pd.read_csv('GME_record.csv')

In [24]:
df.sort_values('n_components')

Unnamed: 0,run_num,max_feat,min_df,max_df,ngram_range,n_components,random_state,top_terms,coherence_type,num_features,coherence,perplexity,cohe_folds,perp_folds
0,1,10000,25,0.9,"(1, 1)",3,,10,u_mass,8818,-1.172181,1528.690373,"[(-1.2391441707465813, [-1.363810344020599, -1...","[1483.379826238543, 1438.6850387721222, 1523.1..."
15,5,10000,25,0.9,"(1, 2)",4,,10,u_mass,10000,-1.05043,2103.727534,"[(-1.0391821176514395, [-1.646679446023103, -0...","[2096.4966468691937, 2032.5507206959644, 2074...."
9,3,10000,25,0.9,"(1, 1)",5,,10,u_mass,8818,-1.221677,1480.097254,"[(-1.4201418351231556, [-1.1389884083412343, -...","[1385.6762160773076, 1419.7191744450888, 1468...."
16,5,10000,25,0.9,"(1, 2)",6,,10,u_mass,10000,-1.084535,2048.527071,"[(-0.971993963923734, [-0.8140977698945425, -0...","[1990.5583623212665, 2016.3623644603804, 2037...."
10,3,10000,25,0.9,"(1, 1)",6,,10,u_mass,8818,-1.244094,1482.244673,"[(-1.2939134199560203, [-2.100858650614365, -1...","[1410.2733332440187, 1418.3230734357796, 1477...."
11,3,10000,25,0.9,"(1, 1)",7,,10,u_mass,8818,-1.309148,1473.185606,"[(-1.264279204904817, [-2.307885338629059, -1....","[1402.198435690982, 1420.1859106179938, 1455.4..."
17,5,10000,25,0.9,"(1, 2)",8,,10,u_mass,10000,-1.195237,2027.97765,"[(-1.0964963038761097, [-1.524741566720634, -1...","[1966.7154051411824, 2059.875452538132, 2013.6..."
5,2,10000,25,0.9,"(1, 1)",8,,10,u_mass,8818,-1.3207,1462.675878,"[(-1.3554903209090683, [-1.1485249662482713, -...","[1406.7482394160625, 1427.3518778450332, 1451...."
12,3,10000,25,0.9,"(1, 1)",9,,10,u_mass,8818,-1.347258,1465.17776,"[(-1.2611901433901975, [-0.5692034185424099, -...","[1398.3253012799523, 1429.2095769504313, 1445...."
6,2,10000,25,0.9,"(1, 1)",10,,10,u_mass,8818,-1.3561,1467.694896,"[(-1.3354496607017083, [-1.1872173627436728, -...","[1423.3965773847935, 1425.8957347701153, 1455...."


# 4.0 Term Frequency Vectors

Parameter tuning via Cross Validation showed that coherence scores were best when using unigrams, bigrams, and trigrams consecutively. Coherence of the top 3 topics generally improved as more topics were added, but the rate of improvement decreased at about 10 topics. Furthermore, the rate of coherence improvement of the top topic rapidly decreased at about 10 topics. Number of features did not seem to make much of a difference in regards to coherence. 

For perplexity, the higher n-grams seemed to increase the perplexity. Since trigrams were to be used, the appropriate number of features was chosen on the best fit perplexity plots with trigrams at a topic count of 10. This cooresponds to 10,000 and 20,000 features. 

In [18]:
#Runs the vector model to get a tf matrix and associated features. Also outputs Parameters. 
# vectorizer_20k = get_vectors(doc_list, max_features = 20000, min_df = 25, max_df = 0.9, ngram_range = (1, 3))
# vectorizer_10k = get_vectors(doc_list, max_features = 10000, min_df = 25, max_df = 0.9, ngram_range = (1, 3))

In [23]:
#SAVE NEW MODELS

# with open('vectorizer_10k.pkl', 'wb') as f:
#     pickle.dump(vectorizer_10k, f)
    
with open('vectorizer_20k.pkl', 'wb') as f:
    pickle.dump(vectorizer_20k, f)

In [24]:
#LOAD THE TWO VECTORIZER MODELS ALREADY TRAINED
# with open('vectorizer_10k.pkl', 'rb') as f:
#     vectorizer_10k = pickle.load(f)
    
# with open('vectorizer_20k.pkl', 'rb') as f:
#     vectorizer_20k = pickle.load(f)

In [25]:
print("Vectorizer 20K:")
print(vectorizer_20k['matrix'].shape)
print("Number of documents: ", vectorizer_20k['matrix'].shape[0])
print("Number of words: ", vectorizer_20k['matrix'].shape[1])
print("\n")
print("Vectorizer 10K:")
print(vectorizer_10k['matrix'].shape)
print("Number of documents: ", vectorizer_10k['matrix'].shape[0])
print("Number of words: ", vectorizer_10k['matrix'].shape[1])

Vectorizer 20K:
(94039, 20000)
Number of documents:  94039
Number of words:  20000


Vectorizer 10K:
(94039, 10000)
Number of documents:  94039
Number of words:  10000


In [26]:
#TEST
print(vectorizer_20k['matrix'][80, 0:1000])

  (0, 902)	1


In [27]:
#TEST
vectorizer_20k['features'][5000:5010]

['diligence http',
 'diligent',
 'dilute',
 'dilute share',
 'dilution',
 'dime',
 'dimension',
 'dimensional',
 'diminish',
 'ding']

In [28]:
#TEST
print(vectorizer_20k['parameters'])

{'max_features': 20000, 'min_df': 25, 'max_df': 0.9, 'ngram_range': (1, 3)}


# 5.0 Latent Dirichlet Allocation Model

Train and save two models (for 20k and 10k features). 

## 5.1 Feature Count = 10k, Topic Count = 10

In [21]:
#Train the LDA model on the 
#WARING: Long process time 12-20 minutes with 300,000+ documents.

# tic = timeit.default_timer()

# lda = LatentDirichletAllocation(n_components = 10, random_state = None)
# lda.fit(vectorizer_10k['matrix'])

# toc = timeit.default_timer()
# print(str((toc - tic)/60) + " minutes")

5.208473869956409 minutes


In [22]:
#SAVES PICKLE OF ABOVE MODEL

# with open('lda_model_10k.pkl', 'wb') as f:
#     pickle.dump(lda, f)

In [35]:
#OPENS SAVED MODEL FROM PICKLE

del lda
with open('lda_model_10k.pkl', 'rb') as f:
    lda_10k = pickle.load(f)

In [36]:
print(len(lda_10k.components_[0]))
lda_10k.components_.shape

10000


(10, 10000)

## 5.2 Feature Count = 20k, Topic Count = 10

In [29]:
#Train the LDA model on the 
#WARING: Long process time 12-20 minutes with 300,000+ documents.

# tic = timeit.default_timer()

# lda = LatentDirichletAllocation(n_components = 10, random_state = None)
# lda.fit(vectorizer_20k['matrix'])

# toc = timeit.default_timer()
# print(str((toc - tic)/60) + " minutes")

5.450025906460359 minutes


In [31]:
#SAVES PICKLE OF ABOVE MODEL

# with open('lda_model_20k.pkl', 'wb') as f:
#     pickle.dump(lda, f)

In [33]:
with open('lda_model_20k.pkl', 'rb') as f:
    lda_20k = pickle.load(f)

In [34]:
print(len(lda_20k.components_[0]))
lda_20k.components_.shape

20000


(10, 20000)

# 6.0 Coherence

## 6.1 Coherence on 10K Word Model

In [37]:
#TIMED COHERENCE SCORES ON CURRENT MODEL (PER TOPIC):
tic = timeit.default_timer()

cohe_scores_10k = cohe_score_func(lda_10k, vectorizer_10k['matrix'], vectorizer_10k['features'])

toc = timeit.default_timer()
print("Time to process: " + str((toc - tic)/60) + " minutes")

Time to process: 0.03731844290159643 minutes


In [38]:
cohe_scores_10k

(-1.3200397814214684,
 [-1.654842282318023,
  -1.0572889302152175,
  -0.6017384438101522,
  -0.9406684796870864,
  -1.997324013178544,
  -0.9958162531509893,
  -2.1135252474382025,
  -0.3050544912208139,
  -1.940829085303355,
  -1.5933105878923015])

## 6.2 Coherence on 20K Word Model

In [40]:
#TIMED COHERENCE SCORES ON CURRENT MODEL (PER TOPIC):
tic = timeit.default_timer()

cohe_scores_20k = cohe_score_func(lda_20k, vectorizer_20k['matrix'], vectorizer_20k['features'])

toc = timeit.default_timer()
print("Time to process: " + str((toc - tic)/60) + " minutes")

Time to process: 0.03995933444239199 minutes


In [41]:
cohe_scores_20k

(-1.1993934294426005,
 [-1.0741288237920654,
  -1.0976291344830973,
  -1.4576883592867196,
  -1.4113043091836077,
  -2.7087796206458137,
  -1.6760128266425274,
  -1.1237839061285002,
  -0.43078392972505714,
  -0.6266980540902165,
  -0.38712533044840103])

# 7.0 Topic Exploration

In [52]:
#CREATES DFs OF PERTINENT POSTS WITHOUT LEMMATIZATION FOR APPENDING TOPIC INFO. 

df = build_and_simplify_dataframe(['gme'])

df_10k = build_and_lemmatize_text_list(df, lemmatize = False, save_string = None)

df_20k = df_10k

del df

Dateframe size prior to dropping stuff: 273327


## 7.1 Topic Exploration on 10K Model

In [43]:
print(df_10k.shape)
df_10k.head()

(94039, 3)


Unnamed: 0,id,title,selftext
1,kqfajb,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,After watching this I took a position RIGHT AW...
3,kqvp7l,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,This guy explained exactly how to take a posit...
5,krnthg,ICR conference (11th Jan),Any speculation or ideas on what Gamestop migh...
7,kuo3w1,"GME is FINALLY going to the moon, this technic...","After some downwards movement, I think everyb..."
9,kv1w9e,"Holly f*ck, our GME rollercoaster will break o...",Guysss... we retards have fantasized a long ti...


In [44]:
doc_topic_mat = lda_10k.transform(vectorizer_10k['matrix'])

In [45]:
print(doc_topic_mat.shape)

(94039, 10)


In [47]:
##ADD NEW COLUMNS TO DATAFRAME WITH TOPIC AND SCORES.

# df_10k.reset_index(inplace = True)
# df_10k['prime_topic'] = None
# df_10k['sec_topic'] = None
# df_10k['prime_score'] = None
# df_10k['sec_score'] = None
# df_10k['topic_scores'] = None
# for i in df_10k.index:
#     df_10k['prime_topic'][i] = np.argsort(doc_topic_mat[i])[::-1][0]
#     df_10k['sec_topic'][i] = np.argsort(doc_topic_mat[i])[::-1][1]
#     df_10k['prime_score'][i] = np.sort(doc_topic_mat[i])[::-1][0]
#     df_10k['sec_score'][i] = np.sort(doc_topic_mat[i])[::-1][1]
#     df_10k['topic_scores'][i] = json.dumps(list(doc_topic_mat[i]))
# df_10k.head()

In [5]:
#df_10k.to_csv('df_10k.csv', index = False)
df_10k = pd.read_csv('df_10k.csv')
df_10k

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
0,kqfajb,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,After watching this I took a position RIGHT AW...,6,Social Media Links (Many Reposts),5,0.493700,0.417392,"[0.011115580511175473, 0.011114949546913212, 0..."
1,kqvp7l,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,This guy explained exactly how to take a posit...,6,Social Media Links (Many Reposts),9,0.877783,0.090211,"[0.004000481503935025, 0.004001206442958103, 0..."
2,krnthg,ICR conference (11th Jan),Any speculation or ideas on what Gamestop migh...,8,News and Earnings Reports,3,0.579641,0.196707,"[0.0026319080133597192, 0.0026324819716669226,..."
3,kuo3w1,"GME is FINALLY going to the moon, this technic...","After some downwards movement, I think everyb...",3,"Opinions about Government, 'shills', troll pos...",6,0.459412,0.397844,"[0.004166976999316615, 0.004167743853592299, 0..."
4,kv1w9e,"Holly f*ck, our GME rollercoaster will break o...",Guysss... we retards have fantasized a long ti...,8,News and Earnings Reports,1,0.854413,0.137253,"[0.001041753250742906, 0.13725262819231285, 0...."
...,...,...,...,...,...,...,...,...,...
94034,rt0gj1,REVERSE REPO = APES RETIREMENT,This is basically excess cash being deposited ...,9,Trading Rules and Regulations,3,0.359998,0.257014,"[0.04491219609047314, 0.14042660237504867, 0.0..."
94035,rt21tk,"Last of year purchase, another XX at limit ord...",Figured GME is going to close sub-$150 for the...,4,Shorting and Sharing Financial Data,1,0.345670,0.317468,"[0.002941803834197981, 0.31746832378339757, 0...."
94036,rt3e78,"Nancy Pelosee Posts Are This Weekend's FUD, Ch...","Disclaimer: I am smoother than skippy, this is...",3,"Opinions about Government, 'shills', troll pos...",0,0.884826,0.108772,"[0.10877236673539148, 0.0008002279861051584, 0..."
94037,rt4thl,What was the best day for GME hodlers in 2021?...,"Today. December 31, 2021. Because today prov...",1,"'Diamond Hands', Buy and Hold",8,0.442506,0.338910,"[0.0037039429779218334, 0.4425056825872468, 0...."


In [65]:
top_terms = get_top_terms(lda_10k.components_, vectorizer_10k['features'])

In [140]:
topic_num = 0
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['share', 'account', 'transfer', 'broke', 'fidelity', 'drs', 'robinhood', 'gme', 'use', 'trade'] 

TITLES:
0 .      SAXO - (FOP) transfer irregularities + SAXO GME share ownership (or lack off)
1 .      WEBULL to FIDELITY Transfer...PLEASE READ IF YOU ARE ON WEBULL AND CONSIDERING TRANSFERRING
2 .      If you have Robinhood, make sure you have a cash account!
3 .      For all you RH users procrastinating from switching to Fidelity due to FOMO of that MOASS
4 .      The DRS list. For those who wish to direct register, apes salute you!
5 .      Transferring to computershare via DRS out of Drivewealth or other that use the Drivewealth platform.
6 .      Jan Ape finally feeding the Bot, going through another $100+ drop has made me more pissed than ever !
7 .      Everyone talks about „I have time, I could wait another 10 years for MOASS“ but…
8 .      Voting Update for Canadian Apes 🇨🇦
9 .      CALL WEBULL TO VOTE!! They “Already Sent” Proxy Info

TEXTS:
TEXT  0 :
 On September

Unnamed: 0,level_0,index,id,title,selftext,prime_topic,sec_topic,prime_score,sec_score,topic_scores
91666,91666,255710,qb9eg9,SAXO - (FOP) transfer irregularities + SAXO GM...,On September 27th I requested to SAXO a transf...,0,3,0.997187,0.000313,"[0.997186796482052, 0.00031255984182120233, 0...."
82220,82220,195051,n6h25o,WEBULL to FIDELITY Transfer...PLEASE READ IF Y...,**PLEASE READ IF YOU ARE ON WEBULL AND CONSIDE...,0,1,0.997049,0.000328,"[0.997048560630531, 0.0003279755034862047, 0.0..."
4648,4648,13689,lh77lb,"If you have Robinhood, make sure you have a ca...","Robinhood will lend out your shares, if you ha...",0,4,0.996051,0.000439,"[0.9960513781968555, 0.0004387061214401881, 0...."
78202,78202,182005,mvx6d7,For all you RH users procrastinating from swit...,I have put together a little guide on my exper...,0,8,0.995774,0.00047,"[0.995773798648836, 0.0004696147803255549, 0.0..."
90165,90165,245313,pn0n52,The DRS list. For those who wish to direct reg...,Inspired by Apes I'm trying to get a list toge...,0,4,0.995287,0.000524,"[0.9952870772587568, 0.0005236736306835357, 0...."
91144,91144,251463,q0bypm,Transferring to computershare via DRS out of D...,I've seen a few posts asking if it's possible ...,0,5,0.994609,0.000599,"[0.9946094589079846, 0.0005989869929108661, 0...."
93559,93559,270006,rgbn47,"Jan Ape finally feeding the Bot, going through...",This is just a text post to make sure I'm coun...,0,4,0.994116,0.000654,"[0.9941164034418929, 0.0006537252360340678, 0...."
94018,94018,273178,rshxpx,"Everyone talks about „I have time, I could wai...",I don‘t want Kenny to bathe in wealth for anot...,0,8,0.994078,0.000658,"[0.9940783181949642, 0.0006580127674348, 0.000..."
81786,81786,193795,n5kb6r,Voting Update for Canadian Apes 🇨🇦,Hello fellow Canadian Apes!!\n\nI posted a few...,0,3,0.993706,0.000699,"[0.9937055319157078, 0.0006994149293015755, 0...."
80185,80185,188709,n155kp,CALL WEBULL TO VOTE!! They “Already Sent” Prox...,Making this post for awareness. I called Webul...,0,8,0.993477,0.000725,"[0.9934766399966268, 0.0007248536381742045, 0...."


In [112]:
topic_dict = {}
topic_dict[0] = "Trading Accounts"

In [190]:
topic_num = 1
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['ape', 'just', 'hold', 'buy', 'fuck', 'like', 'gme', 'know', 'make', 'want'] 

TITLES:
0 .      when hedgies & cnbc say sell!!!!!!!!
1 .      Fug you all MODS !!! GME TO THE MOON
2 .      Why the head fucks will never understand...
3 .      Ya betta hold, b*tch
4 .      Just got off the phone with E-Trade on hold for 2 hours. GameStop shares on the way to CS!!! Will post an update when they arrive
5 .      Anyone redoing lyrics to songs to fit GME?
6 .      I say Diamond Hands You say Hold!
7 .      Don’t turn into what we’re fighting against
8 .      Ladies and gents, u/oaf_king posted about how we can prepare ourselves to have diamond hands while we watch our portfolios rise to amounts of money we’ve never seen in our lives and it gave me an idea.
9 .      buy, hold, shrug, yawn, sigh, buy, hold, shrug, yawn, sigh

TEXTS:
TEXT  0 :
 🙈🙉🙊
buy hold not financial advice
🙈🙉🙊
buy hold not financial advice
🙈🙉🙊
buy hold not financial advice
🙈🙉🙊
buy hold not financial advice
🙈🙉🙊


Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
24334,m0sa2x,when hedgies & cnbc say sell!!!!!!!!,🙈🙉🙊\nbuy hold not financial advice\n🙈🙉🙊\nbuy h...,1,"'Diamond Hands', Buy and Hold",5,0.99795,0.000228,"[0.00022779900671706776, 0.997949834275542, 0...."
74575,mkpbtm,Fug you all MODS !!! GME TO THE MOON,GME to the moon 🚀 GME to the moon 🚀 GME to the...,1,"'Diamond Hands', Buy and Hold",4,0.997,0.000333,"[0.0003333585319572217, 0.99699972356817, 0.00..."
62895,me56kd,Why the head fucks will never understand...,Because they cannot put themselves in our shoe...,1,"'Diamond Hands', Buy and Hold",0,0.996896,0.000345,"[0.0003449180642589487, 0.9968963438250638, 0...."
47026,m8v9qi,"Ya betta hold, b*tch",You want a diamond body? \n\nYou want a Bugatt...,1,"'Diamond Hands', Buy and Hold",9,0.996885,0.000346,"[0.0003460548598716875, 0.9968853128828876, 0...."
93222,rbbhoy,Just got off the phone with E-Trade on hold fo...,Diamond hands Diamond hands Diamond hands Diam...,1,"'Diamond Hands', Buy and Hold",5,0.996853,0.00035,"[0.0003496504772256189, 0.9968531303868823, 0...."
64019,meq5w9,Anyone redoing lyrics to songs to fit GME?,So I've recently found myself listening more c...,1,"'Diamond Hands', Buy and Hold",8,0.996564,0.000382,"[0.00038172775149846334, 0.9965641005654201, 0..."
11561,lrn7x7,I say Diamond Hands You say Hold!,DIAMOND HANDS! HOLD!!!! DIAMOND HANDS! HOLD!!!...,1,"'Diamond Hands', Buy and Hold",5,0.996471,0.000392,"[0.00039216042168949686, 0.9964705510829844, 0..."
43879,m82u08,Don’t turn into what we’re fighting against,I asked by sister what she’s gonna do with the...,1,"'Diamond Hands', Buy and Hold",8,0.995774,0.00047,"[0.00046956907896941486, 0.9957737388111695, 0..."
23627,m055lp,"Ladies and gents, u/oaf_king posted about how ...",One part of his post really stood out to me. H...,1,"'Diamond Hands', Buy and Hold",3,0.995754,0.000472,"[0.0004717989275821544, 0.995753916807501, 0.0..."
39516,m6dmsm,"buy, hold, shrug, yawn, sigh, buy, hold, shrug...","buy, hold, shrug, yawn, sigh, buy, hold, shrug...",1,"'Diamond Hands', Buy and Hold",5,0.995544,0.000495,"[0.0004950699325340571, 0.9955444689688675, 0...."


In [122]:
topic_dict[1] = "'Diamond Hands', Buy and Hold"

In [192]:
topic_num = 2
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['com', 'http', 'reddit', 'reddit com', 'www', 'http www', 'www reddit', 'www reddit com', 'http www reddit', 'comments'] 

TITLES:
0 .      My Honest Thoughts on GME
1 .      r/GME Megathread for Sunday - August 15, 2021
2 .      r/GME Megathread for Saturday - August 14, 2021
3 .      r/GME Megathread for Sunday - July 25, 2021
4 .      r/GME Megathread for Friday - August 13, 2021
5 .      r/GME Megathread for Monday - July 26, 2021
6 .      r/GME Megathread for Wednesday - July 28, 2021
7 .      drives me crazy
8 .      r/GME Megathread for Tuesday - May 25, 2021
9 .      r/GME Megathread for Thursday - September 23, 2021

TEXTS:
TEXT  0 :
 I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the 

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
63759,melyvr,My Honest Thoughts on GME,I like the stock.\nI like the stock.\nI like t...,2,Reddit MOD Announcement,1,0.999896,1.2e-05,"[1.157723200325687e-05, 1.157856146668522e-05,..."
89298,p4o20e,"r/GME Megathread for Sunday - August 15, 2021","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,0,0.99905,0.000106,"[0.00010552120978018204, 0.0001055028322966220..."
89275,p42jtt,"r/GME Megathread for Saturday - August 14, 2021","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,0,0.99905,0.000106,"[0.00010552120978018204, 0.0001055028322966220..."
88561,or64qi,"r/GME Megathread for Sunday - July 25, 2021","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,9,0.998953,0.000116,"[0.00011631150167035335, 0.0001163021507138491..."
89245,p3hke8,"r/GME Megathread for Friday - August 13, 2021","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,3,0.998917,0.00012,"[0.00012036383730022555, 0.0001203574202903596..."
88589,orsoft,"r/GME Megathread for Monday - July 26, 2021","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,0,0.998909,0.000121,"[0.00012127401555610394, 0.0001212328343481597..."
88643,ot400q,"r/GME Megathread for Wednesday - July 28, 2021","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,0,0.998898,0.000122,"[0.00012245874351186385, 0.0001224150558357531..."
9018,lob98y,drives me crazy,I have been trying to draw attention to the fa...,2,Reddit MOD Announcement,0,0.998723,0.000142,"[0.00014188164458064874, 0.0001418622441493064..."
84419,nkikkc,"r/GME Megathread for Tuesday - May 25, 2021","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,8,0.99856,0.00016,"[0.00016002143636936126, 0.0001600383470246453..."
90646,ptoude,"r/GME Megathread for Thursday - September 23, ...","This is a place to discuss technical analysis,...",2,Reddit MOD Announcement,0,0.998555,0.000161,"[0.00016056737365144515, 0.0001605290251453965..."


In [113]:
topic_dict[2] = "Reddit MOD Announcement"

In [194]:
topic_num = 3
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['post', 'people', 'know', 'ape', 'just', 'make', 'like', 'think', 'dd', 'gme'] 

TITLES:
0 .      DD (A.K.A Due Dilligence) information on how to better recognize bot/shill/troll comments and intentions.
1 .      I'm Stepping Down As Moderator & More
2 .      We must learn to encourage our politicians to do the right thing and stand behind them!
3 .      Can we please limit use of the word SHILL to actual shills?
4 .      Please take a moment to read this post. It might just make r/GME an even better place for all!
5 .      Shills have been adapting right in front of our faces and it seems many haven't yet caught on.
6 .      Proposal - PLAN OF COMBAT - SHILLS AND SH*T POSTS
7 .      Guide to attacking a subreddit community
8 .      My Message
9 .      Discussion on Government Intervention

TEXTS:
TEXT  0 :
 Howdy everyone,

I'm sure I don't have to be the one to tell you that we have seen a very large uptick in manipulative activity after Friday. Once again it seems like 

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
9100,lofp6k,DD (A.K.A Due Dilligence) information on how t...,"Howdy everyone,\n\nI'm sure I don't have to be...",3,"Opinions about Government, 'shills', troll pos...",0,0.998828,0.00013,"[0.00013024818447640406, 0.0001302390355845810..."
77323,msgp0z,I'm Stepping Down As Moderator & More,I joined GME in January like most of us after ...,3,"Opinions about Government, 'shills', troll pos...",9,0.998582,0.000158,"[0.00015752878589032963, 0.0001575166213045705..."
82192,n6fcab,We must learn to encourage our politicians to ...,LISTEN! For decades our politicians have been...,3,"Opinions about Government, 'shills', troll pos...",0,0.998398,0.000178,"[0.00017802817702353117, 0.0001779745426889831..."
66732,mgcsbx,Can we please limit use of the word SHILL to a...,TL;DR not everyone you disagree with is a shil...,3,"Opinions about Government, 'shills', troll pos...",5,0.998182,0.000202,"[0.00020208002393807506, 0.0002020625900147287..."
65726,mftlgt,Please take a moment to read this post. It mig...,Greetings apes! 🍌\n\nI'd like to start by sayi...,3,"Opinions about Government, 'shills', troll pos...",6,0.997897,0.000234,"[0.0002336972453290241, 0.00023370776056136607..."
19423,lwh29u,Shills have been adapting right in front of ou...,"Hello again everyone,\n\nI honestly thought th...",3,"Opinions about Government, 'shills', troll pos...",1,0.997841,0.00024,"[0.00023985552398235398, 0.0002398767810494489..."
66824,mgeh52,Proposal - PLAN OF COMBAT - SHILLS AND SH*T POSTS,"Dear Fellow Apes,\n\nFirst, I would like to st...",3,"Opinions about Government, 'shills', troll pos...",0,0.997457,0.000283,"[0.0002826436657128661, 0.00028259702257585817..."
75420,mm3c0p,Guide to attacking a subreddit community,# Technique #1 - 'TOPIC DILUTION'\n\n**Aim:**...,3,"Opinions about Government, 'shills', troll pos...",0,0.997289,0.000301,"[0.00030133298182698817, 0.0003012712590386408..."
75329,mlgofd,My Message,Hello everyone.\n\nThis is to clarify my perso...,3,"Opinions about Government, 'shills', troll pos...",1,0.997272,0.000303,"[0.0003030890802241014, 0.00030312837169076096..."
88273,omzvsf,Discussion on Government Intervention,"First of all, I am not a shill. I want this bo...",3,"Opinions about Government, 'shills', troll pos...",9,0.996928,0.000341,"[0.0003413874536410046, 0.0003413604194774321,..."


In [114]:
topic_dict[3] = "Opinions about Government, 'shills', troll posts, etc."

In [196]:
topic_num = 4
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['gme', 'share', 'short', 'volume', '000', 'day', 'etf', '2021', 'data', '10'] 

TITLES:
0 .      Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n kJoin Su p er s to n k
1 .      7/14 After hours after math
2 .      Shortable Stock Availability 3/15
3 .      JACKED TO THE TITS? Relieve some stress with bubble wrap
4 .      3-16-2021 - My wild stab at Short Sells
5 .      A GME FTD Price Model based on T+35
6 .      3-18-2021 -- Possible Pressure for the Big Squeeze
7 .      The short sale volume percent (not short interest) for GME is 66% on Aug 16, 2021 🦍💪🚀💎🙌
8 .      GME and ETF's that contain GME short volume through 3/1/2021.
9 .      Alright Apes make of this info as you will

TEXTS:
TEXT  0 :
 Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n kJoin Su p er s to n k J

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
74820,mktn6a,Join Su p er s to n k Join Su p er s to n k Jo...,Join Su p er s to n k Join Su p er s to n k Jo...,4,Shorting and Sharing Financial Data,3,0.999742,2.9e-05,"[2.8719171316163066e-05, 2.8719417656266848e-0..."
88178,olufb9,7/14 After hours after math,"Hi All, trying to gain a wrinkle here. feel so...",4,Shorting and Sharing Financial Data,1,0.999185,9.1e-05,"[9.058979278362576e-05, 9.063561299762814e-05,..."
38000,m5zs92,Shortable Stock Availability 3/15,"Hi everyone, after today's action in both GME ...",4,Shorting and Sharing Financial Data,0,0.998611,0.000154,"[0.0001544016927238291, 0.00015434011502002203..."
69420,mhuvt0,JACKED TO THE TITS? Relieve some stress with b...,HOW JACKED ARE YOU?\n\n>!pop!< >!pop!< >!pop!<...,4,Shorting and Sharing Financial Data,1,0.998594,0.000156,"[0.00015625081340502528, 0.0001562530897168915..."
40029,m6kb1o,3-16-2021 - My wild stab at Short Sells,"Not sure if this helps anyone, but this does k...",4,Shorting and Sharing Financial Data,0,0.997398,0.000289,"[0.00028912966374745875, 0.0002890760742125976..."
87655,oeqj2n,A GME FTD Price Model based on T+35,**TL;DR**\n\n&nbsp;\n\nFTDs will deliver the t...,4,Shorting and Sharing Financial Data,7,0.996559,0.00311,"[4.136008629031401e-05, 4.1362181745610106e-05..."
43910,m838os,3-18-2021 -- Possible Pressure for the Big Squ...,Yep. Ran the app again. Here are today's numbe...,4,Shorting and Sharing Financial Data,6,0.995813,0.000465,"[0.00046523249427634386, 0.0004651634690636653..."
89352,p5usob,The short sale volume percent (not short inter...,The short sale volume percent (not short inter...,4,Shorting and Sharing Financial Data,5,0.995755,0.000472,"[0.0004717067832894187, 0.0004717060994764702,..."
18498,lvqy8b,GME and ETF's that contain GME short volume th...,"Apes, \n\n\nHope you had a great start to Mar...",4,Shorting and Sharing Financial Data,2,0.995544,0.000495,"[0.0004951142671282871, 0.0004951579111292803,..."
72589,mk4htj,Alright Apes make of this info as you will,So here are the totals for calls and puts prov...,4,Shorting and Sharing Financial Data,1,0.995476,0.000503,"[0.0005026666960183959, 0.0005027114206634459,..."


In [115]:
topic_dict[4] = "Shorting and Sharing Financial Data"

In [198]:
topic_num = 5
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['share', 'price', 'short', 'sell', 'buy', 'stock', 'gme', 'squeeze', 'market', 'just'] 

TITLES:
0 .      BUY STOCKS, NOT OPTIONS
1 .      Correct me if I'm wrong, but I think you're just as likely to get $1B for your shares as you are to get $10K
2 .      Stay relax; there is no way that sudden dip not from short-selling.
3 .      Covering the real shorts to fake low SI
4 .      Retarded Ape helping fellow apes understand options
5 .      Alternate outcome to the 3/19 DD
6 .      AH Price Action Explained
7 .      ⛔ IMPORTANT ⛔ ABOUT SELLING ORDER TYPE
8 .      ELI5: How does a stock get bought back "multiple times"? How could the price go to 1 million?
9 .      What happens when there's a crazy imbalance in bid/ask?

TEXTS:
TEXT  0 :
 Before I start, I want to say that this is not financial advice and im not any kind of advisor. 

This is purely what I will do:

&#x200B;

BUY STOCKS, NOT OPTIONS BUY STOCKS, NOT OPTIONS BUY STOCKS, NOT OPTIONS BUY STOCKS, NOT OPTIONS BUY 

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
48295,m9k0mw,"BUY STOCKS, NOT OPTIONS","Before I start, I want to say that this is not...",5,General Advice/Questions about Stocks,1,0.99895,0.000117,"[0.0001166936979066522, 0.00011669922455722537..."
47746,m9al7b,"Correct me if I'm wrong, but I think you're ju...","**Obviously not financial advice, I'm pretty d...",5,General Advice/Questions about Stocks,3,0.997058,0.000327,"[0.0003268582101789805, 0.0003268865819613594,..."
37614,m5uei5,Stay relax; there is no way that sudden dip no...,"Not financial advice, only personal thought on...",5,General Advice/Questions about Stocks,1,0.996808,0.000355,"[0.000354642017525354, 0.0003547585463873848, ..."
32908,m3mnjp,Covering the real shorts to fake low SI,"I am not a ceasar-intellect level ape, but the...",5,General Advice/Questions about Stocks,4,0.995927,0.000453,"[0.0004525542047425369, 0.00045257204873899563..."
8923,lo5z3p,Retarded Ape helping fellow apes understand op...,"So here to a fun week, but I'd like to set the...",5,General Advice/Questions about Stocks,3,0.995693,0.000479,"[0.000478581407461485, 0.0004786356907423595, ..."
17361,lupbze,Alternate outcome to the 3/19 DD,There's an alternative that doesn't involve a ...,5,General Advice/Questions about Stocks,4,0.995609,0.000488,"[0.00048788024042523283, 0.0004879037556376666..."
53129,mboq01,AH Price Action Explained,Not a financial advisor. Do your own due dili...,5,General Advice/Questions about Stocks,1,0.995287,0.000524,"[0.000523652531126545, 0.000523688253456997, 0..."
34135,m49pmv,⛔ IMPORTANT ⛔ ABOUT SELLING ORDER TYPE,I DON'T CARE WHETHER YOU SELL AT 100K OR 500K ...,5,General Advice/Questions about Stocks,4,0.995212,0.000532,"[0.0005320162092596363, 0.0005320098542185532,..."
63411,meg4r9,"ELI5: How does a stock get bought back ""multip...",Let's say that hedge funds are on the hook for...,5,General Advice/Questions about Stocks,1,0.994971,0.000559,"[0.0005587689941135265, 0.0005588393259574535,..."
52322,mbhyho,What happens when there's a crazy imbalance in...,"Okay, so something I'm curious about in unders...",5,General Advice/Questions about Stocks,9,0.994971,0.000559,"[0.0005587835822653561, 0.000558855358635093, ..."


In [116]:
topic_dict[5] = "General Advice/Questions about Stocks"

In [200]:
topic_num = 6
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['http', 'com', 'www', 'http www', 'gamestop', 'html', 'twitter', 'video', 'twitter com', 'youtube'] 

TITLES:
0 .      My totally real positions for the algorithm
1 .      List of all GameStop social pages - One day to market open, time to jump on those social media pages and rate those mobile apps!
2 .      Bored on Saturday with market closed? Pump those social GameStop & Family channels while you wait to get them tendies!
3 .      While waiting for the market to open, make sure to drop into GameStop social channels and leave them likes, comments and buy some merchandise! Links for all the social pages inside
4 .      One day till we're back in game - till then, you can boost GameStop social pages, listed inside!
5 .      It's weekend! Time to boost these GameStop social medias while you wait for Monday to come 🚀
6 .      Great day today! Reminder to drop into GameStop social channels
7 .      WHAT EVERYONE NEEDS TO DO RIGHT NOW, full social media list.
8 .      Support 

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
50611,matoph,My totally real positions for the algorithm,"69,420@6969 in $CUM 12345@4958 in $FUK 19479@9...",6,Social Media Links (Many Reposts),4,0.999552,5e-05,"[4.9751259707176797e-05, 4.975203662759475e-05..."
64577,mf45mc,List of all GameStop social pages - One day to...,\n\nWill re-post this once in a while so that...,6,Social Media Links (Many Reposts),8,0.998788,0.000135,"[0.00013462344849898069, 0.0001346167635409512..."
63609,mejo5l,Bored on Saturday with market closed? Pump tho...,Will re-post this once in a while so that all ...,6,Social Media Links (Many Reposts),8,0.998788,0.000135,"[0.00013462344849898069, 0.0001346167635409512..."
36154,m5i2i1,"While waiting for the market to open, make sur...",Will re-post this once in a while so that all...,6,Social Media Links (Many Reposts),0,0.998661,0.000149,"[0.0001488486131805043, 0.00014883714837285174..."
35316,m5052p,"One day till we're back in game - till then, y...",Will re-post this once in a while so that all ...,6,Social Media Links (Many Reposts),0,0.998661,0.000149,"[0.0001488486131805043, 0.00014883714837285174..."
33905,m447w5,It's weekend! Time to boost these GameStop soc...,**The numbers barely moved / apps actually wen...,6,Social Media Links (Many Reposts),0,0.998658,0.000149,"[0.00014908626171832558, 0.0001490711064033014..."
29825,m2bb24,Great day today! Reminder to drop into GameSto...,**The numbers barely moved / apps actually wen...,6,Social Media Links (Many Reposts),0,0.998648,0.00015,"[0.0001502050462391299, 0.00015018971460856927..."
62642,me0f6q,"WHAT EVERYONE NEEDS TO DO RIGHT NOW, full soci...",This list was put together by u/Rabus I am jus...,6,Social Media Links (Many Reposts),0,0.998622,0.000153,"[0.00015318719935441283, 0.0001531750131048230..."
26100,m1k8e3,Support GameStop - drop in their Social media ...,Will re-post this once in a while so that all ...,6,Social Media Links (Many Reposts),0,0.998621,0.000153,"[0.00015320311179929533, 0.0001531772960722894..."
24780,m14qbj,GameStop Social media list / apps,"I see these popping up every now and then, but...",6,Social Media Links (Many Reposts),0,0.998105,0.000211,"[0.00021060252923848512, 0.0002105650636896353..."


In [117]:
topic_dict[6] = "Social Media Links (Many Reposts)"

In [202]:
topic_num = 7
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['png', 'x200b', 'http', 'format', 'redd', 'auto', 'width', 'preview', 'webp', 'preview redd'] 

TITLES:
0 .      SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR
1 .      Taken from twitter! DD on the crash yesterday and the coming days/weeks/months
2 .      HOLY SMOKES HOLY MOLY
3 .      When GME hit 100k can we all go to Citadel & Merlin and drink champagne, like these guys.
4 .      PICTURES!!! 💎🙌 ARE NOT SELLING! During the last 10 minutes of the trading day on Thursday, March 4th, this was happening. The matching buy and sell increments shows us that WE are not selling. The price is 100% PSYCHOLOGICAL! 45, 46, 47, 51, 100, 300...this is irregular. This is to drop the price.
5 .      South Korea) I was with you even when it was 40 dollars.
6 .      GME BT DUMP - 3/29
7 .      Look at this cute little thing...
8 .      GME Large Bull Fla

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
30750,m2p7ya,SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SS...,SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SS...,7,Posts of Memes/Images,4,0.998048,0.000217,"[0.00021691973969952176, 0.0002169230798379255..."
59010,md01x9,Taken from twitter! DD on the crash yesterday ...,"not my work, not financial advice, this ape li...",7,Posts of Memes/Images,1,0.996691,0.000368,"[0.0003676585733030681, 0.000367716809812659, ..."
92084,qkijx3,HOLY SMOKES HOLY MOLY,HOLY SMOKES HOLY MOLY HOLY SMOKES HOLY MOLY HO...,7,Posts of Memes/Images,1,0.99313,0.000763,"[0.0007633612660886593, 0.0007634824716446396,..."
24830,m15e9x,When GME hit 100k can we all go to Citadel & M...,&#x200B;\n\nhttps://preview.redd.it/bayo9fwswz...,7,Posts of Memes/Images,1,0.992857,0.000794,"[0.0007936692678393034, 0.0007939562190377511,..."
21722,ly7j3x,PICTURES!!! 💎🙌 ARE NOT SELLING! During the las...,**EDIT: Simplified.**\n\n**🦍🦍🦍's are💎🙌ing the ...,7,Posts of Memes/Images,1,0.992622,0.00082,"[0.0008196979911496042, 0.0008198998229979559,..."
79461,mze7el,South Korea) I was with you even when it was 4...,&#x200B;\n\nhttps://preview.redd.it/d4bpsuialm...,7,Posts of Memes/Images,1,0.992622,0.00082,"[0.0008197418173753819, 0.0008198537506973497,..."
66429,mg5rcq,GME BT DUMP - 3/29,Daily GME BT DUMP. Let me know if you want an...,7,Posts of Memes/Images,3,0.992562,0.000827,"[0.0008264952692626177, 0.0008265084914317405,..."
78601,mwxpbw,Look at this cute little thing...,&#x200B;\n\nhttps://preview.redd.it/kcom89nnxx...,7,Posts of Memes/Images,3,0.992561,0.000827,"[0.0008265030047434702, 0.0008266104560367176,..."
67241,mglc67,GME Large Bull Flag - Possible breakout,Just noticed there was a large bull flag and w...,7,Posts of Memes/Images,3,0.991587,0.000935,"[0.0009347249402753098, 0.0009347545265282976,..."
37197,m5rw2w,Today´s Price Action of GME explained in Memes...,**GME Opening Price: $277.52 / 221.30€**\n\nh...,7,Posts of Memes/Images,2,0.990898,0.005415,"[0.00046083690197170913, 0.0004608356109373631..."


In [118]:
topic_dict[7] = "Posts of Memes/Images"

In [204]:
topic_num = 8
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['gamestop', 'company', 'game', 'cohen', 'year', 'new', 'news', 'board', 'vote', 'ryan'] 

TITLES:
0 .      4 Additional Board Members Expected to leave in June. Possible additional changes to senior executives.
1 .      GameStop Releases 2021 Proxy Statement
2 .      GAMESTOP NEWS RELEASE
3 .      GameStop Appoints Chief Growth Officer Announces Two Additional Executive Hires to Support Transformation
4 .      GameStop appoints Chief Growth Officer Elliot Wilke 30th Mar 2021 plus 2 VPs 🚀🚀
5 .      RYAN COHEN FOR BOARD DIRECTOR! 9TH JUNE ANNUAL MEETING!
6 .      Earnings up on Gamestop Website - NEW ROCKSTAR COO!!!!
7 .      GameStop Provides Corporate Governance Update
8 .      GameStop CEO George Sherman: “Goal: Leading Global Omni-Channel Retailer For All Things Gaming and Entertainment” - March 11, 2021
9 .      GME Q2 Earnings Report

TEXTS:
TEXT  0 :
 [Buried in the 10-K that Gamestop released today, under ITEM 9B Other Information](https://news.gamestop.com/node/1866

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
53849,mbrjug,4 Additional Board Members Expected to leave i...,[Buried in the 10-K that Gamestop released tod...,8,News and Earnings Reports,4,0.997281,0.000302,"[0.00030214216530629294, 0.0003021353628825133..."
78487,mwok26,GameStop Releases 2021 Proxy Statement,GameStop has released their anticipated **2021...,8,News and Earnings Reports,4,0.996326,0.000408,"[0.00040825506513390803, 0.0004082122736642851..."
10625,lqtr5c,GAMESTOP NEWS RELEASE,[https://news.gamestop.com/news-releases/news-...,8,News and Earnings Reports,0,0.995336,0.000518,"[0.0005182607687988424, 0.0005181909883209347,..."
66749,mgd6n4,GameStop Appoints Chief Growth Officer Announc...,Thought I'd share...\n\nGLOBENEWSWIRE 3:45 AM ...,8,News and Earnings Reports,4,0.995134,0.000541,"[0.000540604481157613, 0.0005406445377260532, ..."
66821,mgeggn,GameStop appoints Chief Growth Officer Elliot ...,https://news.gamestop.com/news-releases/news-r...,8,News and Earnings Reports,1,0.994857,0.000572,"[0.0005714711943696559, 0.0005715205213696621,..."
75554,mmprm9,RYAN COHEN FOR BOARD DIRECTOR! 9TH JUNE ANNUAL...,Press release from GameStop Website:\n\n>GRAPE...,8,News and Earnings Reports,1,0.994797,0.000578,"[0.0005780685169041128, 0.0005781880871137295,..."
53060,mboebt,Earnings up on Gamestop Website - NEW ROCKSTAR...,Q4 EPS 1.34 Adjusted\n\nFY2020 EPS (-)2.14 Adj...,8,News and Earnings Reports,6,0.994512,0.00061,"[0.00060976876218356, 0.000609771630302634, 0...."
23785,m0fbta,GameStop Provides Corporate Governance Update,https://gamestop.gcs-web.com/news-releases/new...,8,News and Earnings Reports,6,0.994267,0.000637,"[0.0006369783447214565, 0.0006369983286139198,..."
35035,m4v9lt,GameStop CEO George Sherman: “Goal: Leading Gl...,"\n[CEO’s of GameStop, FansUnite, ESE and Draft...",8,News and Earnings Reports,2,0.993568,0.000716,"[0.0007144421373019484, 0.0007144363168906168,..."
90044,pkiupk,GME Q2 Earnings Report,https://investor.gamestop.com/news-releases/ne...,8,News and Earnings Reports,9,0.992306,0.000855,"[0.0008548626224087271, 0.0008548079534363822,..."


In [119]:
topic_dict[8] = "News and Earnings Reports"

In [206]:
topic_num = 9
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df_10k[df_10k['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['http', 'market', 'www', 'http www', 'short', 'sec', 'com', 'trade', 'fund', 'security'] 

TITLES:
0 .      It doesn't appear that DTCC has "insurance" (at least in the way we think of it); it has a "loss allocation waterfall"
1 .      Not on the moon, past the moon?
2 .      Self-Regulatory Organizations; Proposed Rule Changes: National Securities Clearing Corp.
3 .      Legality of Turning off the Buy Button
4 .      NSSC-002 and NSSC-801 Update
5 .      Today's hot shit 7/14: New rules SR-DTC-2021-013, SR-DTC-2021-011, SR-DTC-2021-010
6 .      Love it when new SEC filings have to update their Risk and Recover plan during these times
7 .      Ongoing Continuation of DTC’s “Prepare for the worst”, Recovery & Wind-down (R&W) Amendments.
8 .      Revise the Clearing Agency Investment Policy - New Rulings on DTCC?
9 .      New DTC-2021-003 - Can someone ELI5 it ?

TEXTS:
TEXT  0 :
 **TL;DR:** Someone who buys and hodls 💎🤲 *will* get the tendies, from *someone*, if the stock 

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
83637,ndytmh,"It doesn't appear that DTCC has ""insurance"" (a...",**TL;DR:** Someone who buys and hodls 💎🤲 *will...,9,Trading Rules and Regulations,0,0.999209,8.8e-05,"[8.791106743943505e-05, 8.789746867588811e-05,..."
52455,mbjeh2,"Not on the moon, past the moon?",Yeah? Yeah? Yeah? Yeah? Yeah? Yeah? Yeah? Yeah...,9,Trading Rules and Regulations,1,0.998831,0.00013,"[0.00012987158159667668, 0.0001298814364913912..."
54535,mbyfw6,Self-Regulatory Organizations; Proposed Rule C...,"Digging around the internet, and found this do...",9,Trading Rules and Regulations,4,0.998797,0.000134,"[0.00013372003142883996, 0.0001337104509609044..."
78267,mw5lhs,Legality of Turning off the Buy Button,**Disclaimer**: This is a post to attempt to i...,9,Trading Rules and Regulations,3,0.997935,0.000229,"[0.00022940716431495728, 0.0002294092130204513..."
52196,mbgn48,NSSC-002 and NSSC-801 Update,I did not see a new post yet on this. \n\nThe ...,9,Trading Rules and Regulations,0,0.99736,0.000293,"[0.0002933878860356113, 0.0002933472107628876,..."
87965,ojxw28,Today's hot shit 7/14: New rules SR-DTC-2021-0...,"[SR-DTC-2021-013, Notice of Filing and Immedia...",9,Trading Rules and Regulations,8,0.996808,0.000355,"[0.0003546452664825956, 0.0003546226234554297,..."
54553,mbyrgm,Love it when new SEC filings have to update th...,Seems like there were some updates to the Reco...,9,Trading Rules and Regulations,1,0.996414,0.000399,"[0.0003984868725969722, 0.00039858352541691136..."
55337,mc76fz,Ongoing Continuation of DTC’s “Prepare for the...,Newly submitted rule filings for larger brains...,9,Trading Rules and Regulations,6,0.996,0.000445,"[0.00044449095235565134, 0.0004444535298959044..."
24246,m0qa9d,Revise the Clearing Agency Investment Policy -...,[https://www.dtcc.com/legal/sec-rule-filings.a...,9,Trading Rules and Regulations,4,0.995714,0.000476,"[0.0004762976917749859, 0.0004762360814171751,..."
40983,m71xnn,New DTC-2021-003 - Can someone ELI5 it ?,Link :[https://www.dtcc.com/legal/sec-rule-fil...,9,Trading Rules and Regulations,6,0.995287,0.000524,"[0.0005236326080836791, 0.0005236071491312681,..."


In [120]:
topic_dict[9] = "Trading Rules and Regulations"

In [123]:
topic_dict

{0: 'Trading Accounts',
 2: 'Reddit MOD Announcement',
 3: "Opinions about Government, 'shills', troll posts, etc.",
 4: 'Shorting and Sharing Financial Data',
 5: 'General Advice/Questions about Stocks',
 6: 'Social Media Links (Many Reposts)',
 7: 'Posts of Memes/Images',
 8: 'News and Earnings Reports',
 9: 'Trading Rules and Regulations',
 1: "'Diamond Hands', Buy and Hold"}

In [220]:
for i, key in enumerate(topic_dict.keys()):
    top = topic_dict[key]
    cohe = cohe_scores_10k[1][i]
    avg = df_10k[df_10k['prime_topic'] == key]['prime_score'].mean()
    count = df_10k[df_10k['prime_topic'] == key]['prime_score'].count()
    print("Topic ", key, " is assigned to ", count, " documents - ", top)
    print("Avg. probability: ", avg, ", Coherence: ", cohe, "\n")

Topic  0  is assigned to  6391  documents -  Trading Accounts
Avg. probability:  0.6752539513960284 , Coherence:  -1.654842282318023 

Topic  1  is assigned to  33728  documents -  'Diamond Hands', Buy and Hold
Avg. probability:  0.7254267229448293 , Coherence:  -1.0572889302152175 

Topic  3  is assigned to  18697  documents -  Opinions about Government, 'shills', troll posts, etc.
Avg. probability:  0.6774471010436743 , Coherence:  -0.6017384438101522 

Topic  4  is assigned to  3457  documents -  Shorting and Sharing Financial Data
Avg. probability:  0.6134465289956622 , Coherence:  -0.9406684796870864 

Topic  5  is assigned to  14331  documents -  General Advice/Questions about Stocks
Avg. probability:  0.6572868122738977 , Coherence:  -1.997324013178544 

Topic  6  is assigned to  3770  documents -  Social Media Links (Many Reposts)
Avg. probability:  0.6906755610958889 , Coherence:  -0.9958162531509893 

Topic  7  is assigned to  4541  documents -  Posts of Memes/Images
Avg. pro

In [221]:
df_10k['topic'] = df_10k['prime_topic'].map(topic_dict)

In [179]:
df_10k = df_10k[['id', 'title', 'selftext', 'prime_topic', 'topic',
       'sec_topic', 'prime_score', 'sec_score', 'topic_scores']]

In [222]:
#df_10k.to_csv('df_10k.csv', index = False)
pd.read_csv('df_10k.csv')

## 7.2 Topic Exploration on 20K Model

In [19]:
print(df_20k.shape)
df_20k.head()

NameError: name 'df_20k' is not defined

In [57]:
doc_topic_mat = lda_20k.transform(vectorizer_20k['matrix'])

In [58]:
print(doc_topic_mat.shape)

(94039, 10)


In [61]:
##ADD NEW COLUMNS TO DATAFRAME WITH TOPIC AND SCORES.

# df_20k.reset_index(inplace = True)
# df_20k['prime_topic'] = None
# df_20k['sec_topic'] = None
# df_20k['prime_score'] = None
# df_20k['sec_score'] = None
# df_20k['topic_scores'] = None
# for i in df_20k.index:
#     df_20k['prime_topic'][i] = np.argsort(doc_topic_mat[i])[::-1][0]
#     df_20k['sec_topic'][i] = np.argsort(doc_topic_mat[i])[::-1][1]
#     df_20k['prime_score'][i] = np.sort(doc_topic_mat[i])[::-1][0]
#     df_20k['sec_score'][i] = np.sort(doc_topic_mat[i])[::-1][1]
#     df_20k['topic_scores'][i] = json.dumps(list(doc_topic_mat[i]))
# df_20k.head()

In [62]:
# df_20k.to_csv('df_20k.csv', index = False)
df_20k = pd.read_csv('df_20k.csv')
df_20k.head()

Unnamed: 0,index,id,title,selftext,prime_topic,sec_topic,prime_score,sec_score,topic_scores
0,1,kqfajb,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,After watching this I took a position RIGHT AW...,0,7,0.408516,0.402199,"[0.4085159025234814, 0.011113989904887907, 0.0..."
1,3,kqvp7l,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,This guy explained exactly how to take a posit...,5,0,0.963988,0.004003,"[0.004002516052168223, 0.004002464292135745, 0..."
2,5,krnthg,ICR conference (11th Jan),Any speculation or ideas on what Gamestop migh...,0,5,0.396185,0.371816,"[0.39618512853474486, 0.10163854957411877, 0.0..."
3,7,kuo3w1,"GME is FINALLY going to the moon, this technic...","After some downwards movement, I think everyb...",7,1,0.497566,0.469094,"[0.0041677727694955, 0.469093585637319, 0.0041..."
4,9,kv1w9e,"Holly f*ck, our GME rollercoaster will break o...",Guysss... we retards have fantasized a long ti...,5,0,0.793692,0.150852,"[0.15085249345911897, 0.0010311857052622362, 0..."


In [63]:
df_10k.head()

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
0,kqfajb,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,After watching this I took a position RIGHT AW...,6,Social Media Links (Many Reposts),5,0.4937,0.417392,"[0.011115580511175473, 0.011114949546913212, 0..."
1,kqvp7l,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,This guy explained exactly how to take a posit...,6,Social Media Links (Many Reposts),9,0.877783,0.090211,"[0.004000481503935025, 0.004001206442958103, 0..."
2,krnthg,ICR conference (11th Jan),Any speculation or ideas on what Gamestop migh...,8,News and Earnings Reports,3,0.579641,0.196707,"[0.0026319080133597192, 0.0026324819716669226,..."
3,kuo3w1,"GME is FINALLY going to the moon, this technic...","After some downwards movement, I think everyb...",3,"Opinions about Government, 'shills', troll pos...",6,0.459412,0.397844,"[0.004166976999316615, 0.004167743853592299, 0..."
4,kv1w9e,"Holly f*ck, our GME rollercoaster will break o...",Guysss... we retards have fantasized a long ti...,8,News and Earnings Reports,1,0.854413,0.137253,"[0.001041753250742906, 0.13725262819231285, 0...."


In [67]:
for l in top_terms:
    print(l)

['share', 'account', 'transfer', 'broke', 'fidelity', 'drs', 'robinhood', 'gme', 'use', 'trade']
['ape', 'just', 'hold', 'buy', 'fuck', 'like', 'gme', 'know', 'make', 'want']
['com', 'http', 'reddit', 'reddit com', 'www', 'http www', 'www reddit', 'www reddit com', 'http www reddit', 'comments']
['post', 'people', 'know', 'ape', 'just', 'make', 'like', 'think', 'dd', 'gme']
['gme', 'share', 'short', 'volume', '000', 'day', 'etf', '2021', 'data', '10']
['share', 'price', 'short', 'sell', 'buy', 'stock', 'gme', 'squeeze', 'market', 'just']
['http', 'com', 'www', 'http www', 'gamestop', 'html', 'twitter', 'video', 'twitter com', 'youtube']
['png', 'x200b', 'http', 'format', 'redd', 'auto', 'width', 'preview', 'webp', 'preview redd']
['gamestop', 'company', 'game', 'cohen', 'year', 'new', 'news', 'board', 'vote', 'ryan']
['http', 'market', 'www', 'http www', 'short', 'sec', 'com', 'trade', 'fund', 'security']


In [68]:
for l in get_top_terms(lda_20k.components_, vectorizer_20k['features']):
    print(l)

['ape', 'just', 'like', 'money', 'make', 'fuck', 'hold', 'buy', 'know', 'time']
['just', 'ape', 'know', 'post', 'like', 'gme', 'think', 'people', 'dd', 'make']
['share', 'account', 'transfer', 'broke', 'fidelity', 'gme', 'buy', 'just', 'drs', 'ape']
['short', 'market', 'company', 'sec', 'share', 'fund', 'stock', 'gamestop', 'trade', 'investor']
['http', 'com', 'www', 'http www', 'html', 'poll', 'org', 'amp', 'html http', 'video']
['com', 'http', 'www', 'http www', 'gamestop', 'gme', '000', 'stock', '2021', 'youtube']
['share', 'price', 'short', 'buy', 'sell', 'gme', 'stock', 'squeeze', 'market', 'day']
['png', 'http', 'x200b', 'redd', 'format', 'width', 'png width', 'format png', 'auto', 'png auto webp']
['com', 'http', 'reddit', 'www', 'http www', 'reddit com', 'www reddit', 'www reddit com', 'http www reddit', 'comments']
['http', 'jpg', 'x200b', 'format', 'auto', 'redd', 'width', 'preview', 'webp', 'preview redd']


# 8.0 LDA on train/test split

In [69]:
df = pd.read_csv("gme/submissions_reddit.csv")
print(df.columns)
df = df[['id', 'created', 'edited', 'title', 'selftext']] 
df.head()

Index(['id', 'author', 'created', 'retrieved', 'edited', 'pinned', 'archived',
       'locked', 'removed', 'deleted', 'is_self', 'is_video',
       'is_original_content', 'title', 'link_flair_text', 'upvote_ratio',
       'score', 'gilded', 'total_awards_received', 'num_comments',
       'num_crossposts', 'selftext', 'thumbnail', 'shortlink'],
      dtype='object')


Unnamed: 0,id,created,edited,title,selftext
0,ko4pii,2021-01-01 04:08:51,1970-01-01 00:00:00,GME to the moon 🚀🚀,[deleted]
1,kqfajb,2021-01-04 19:02:26,1970-01-01 00:00:00,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,After watching this I took a position RIGHT AW...
2,kqjh2t,2021-01-04 22:17:23,1970-01-01 00:00:00,Short Squeeze Incoming 🚀🚀🚀🚀🚀🚀🚀,[deleted]
3,kqvp7l,2021-01-05 10:19:59,1970-01-01 00:00:00,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,This guy explained exactly how to take a posit...
4,krcwch,2021-01-06 01:19:17,1970-01-01 00:00:00,You already know what we must do brothers and ...,[deleted]


In [70]:
len(np.unique(df['edited']))

17908

In [71]:
df['edited'] = pd.to_datetime(df['edited'])
df['created'] = pd.to_datetime(df['created'])

In [72]:
np.unique(df['edited'])

array(['1970-01-01T00:00:00.000000000', '2021-01-06T13:28:54.000000000',
       '2021-01-15T21:04:24.000000000', ...,
       '2021-12-31T21:15:20.000000000', '2021-12-31T22:27:35.000000000',
       '2022-01-01T00:10:16.000000000'], dtype='datetime64[ns]')

In [73]:
df['datetime'] = df[['created', 'edited']].max(axis = 1)
df.sort_values(by = 'datetime')
df['month'] = df['datetime'].dt.month
df = df[['id', 'title', 'selftext', 'datetime', 'month']]
df

Unnamed: 0,id,title,selftext,datetime,month
0,ko4pii,GME to the moon 🚀🚀,[deleted],2021-01-01 04:08:51,1
1,kqfajb,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,After watching this I took a position RIGHT AW...,2021-01-04 19:02:26,1
2,kqjh2t,Short Squeeze Incoming 🚀🚀🚀🚀🚀🚀🚀,[deleted],2021-01-04 22:17:23,1
3,kqvp7l,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,This guy explained exactly how to take a posit...,2021-01-05 10:19:59,1
4,krcwch,You already know what we must do brothers and ...,[deleted],2021-01-06 01:19:17,1
...,...,...,...,...,...
273322,rt69m9,Longest erection I have ever had....hedgies ar...,,2021-12-31 23:37:25,12
273323,rt6eqg,Putting my good friend on. He’s a dope artist ...,,2021-12-31 23:45:00,12
273324,rt6i5s,A reminder of when the confirmation became real,,2021-12-31 23:50:23,12
273325,rt6mnj,"Ryan Kagy just minted this NFT for 69,420.1337...",,2021-12-31 23:57:04,12


Unnamed: 0,id,title,selftext,datetime,month
0,ko4pii,GME to the moon 🚀🚀,[deleted],2021-01-01 04:08:51,1
1,kqfajb,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,After watching this I took a position RIGHT AW...,2021-01-04 19:02:26,1
2,kqjh2t,Short Squeeze Incoming 🚀🚀🚀🚀🚀🚀🚀,[deleted],2021-01-04 22:17:23,1
3,kqvp7l,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,This guy explained exactly how to take a posit...,2021-01-05 10:19:59,1
4,krcwch,You already know what we must do brothers and ...,[deleted],2021-01-06 01:19:17,1
...,...,...,...,...,...
273322,rt69m9,Longest erection I have ever had....hedgies ar...,,2021-12-31 23:37:25,12
273323,rt6eqg,Putting my good friend on. He’s a dope artist ...,,2021-12-31 23:45:00,12
273324,rt6i5s,A reminder of when the confirmation became real,,2021-12-31 23:50:23,12
273325,rt6mnj,"Ryan Kagy just minted this NFT for 69,420.1337...",,2021-12-31 23:57:04,12


In [57]:
df = build_and_lemmatize_text_list(df, lemmatize = True, save_string = "df_train_split")

Dateframe size prior to dropping stuff: 273327
New df_train_split.csv file saved to directory.


In [74]:
# df.to_csv('GME_df_w_datetime', index = False)
df = pd.read_csv('GME_df_w_datetime')

In [79]:
df.shape

(94039, 5)

In [76]:
df_train = df.iloc[:91041].dropna(how = 'all')
df_test = df.iloc[91041:].dropna(how = 'all')

In [77]:
df_train

Unnamed: 0,id,title,selftext,datetime,month
0,kqfajb,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,after watch this i take a position right away ...,2021-01-04 19:02:26,1
1,kqvp7l,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,this guy explain exactly how to take a positio...,2021-01-05 10:19:59,1
2,krnthg,ICR conference (11th Jan),any speculation or idea on what gamestop might...,2021-01-06 13:28:54,1
3,kuo3w1,"GME is FINALLY going to the moon, this technic...","after some downwards movement , i think everyb...",2021-01-10 21:59:17,1
4,kv1w9e,"Holly f*ck, our GME rollercoaster will break o...",guy ... we retard have fantasize a long time a...,2021-01-11 12:42:49,1
...,...,...,...,...,...
91036,pyu50m,So overstock is doing nft and gamestop is doin...,computershare say gamestop be do nft today . o...,2021-09-30 22:10:07,9
91037,pyudt6,Apes hodl down the fort while I go under the k...,go in for surgery tomorrow and i need for all ...,2021-09-30 22:23:22,9
91038,pyufxs,If brokers go bankrupt what will happen to our...,how will the broke pay u if they go bankrupt a...,2021-09-30 22:26:34,9
91039,pyv2ni,Be ZEN,i have notice that there be a post regard nft ...,2021-09-30 23:01:33,9


In [100]:
df_test

Unnamed: 0,id,title,selftext,datetime,month
91041,pywt8e,Am a smooth brain who needs help understanding...,new to gme and need help understand some thing...,2021-10-01 00:41:41,10
91042,pyxdyt,retarded but how do I transfer???,"hi guy , canadian ape here . i hold xx share i...",2021-10-01 01:14:49,10
91043,pyxl8w,Attention Canadian Apes: I need more informati...,"hello ape , i 'm a canadian ape that have be h...",2021-10-01 01:26:47,10
91044,pyxssq,QUESTION about NFT dividend. What happens to t...,i read [ this juicy post ] ( http : //www.redd...,2021-10-01 01:39:18,10
91045,pyxw79,What are we missing with Citadel tweets?,a company with their resource would n't allow ...,2021-10-01 01:44:50,10
...,...,...,...,...,...
94034,rt0gj1,REVERSE REPO = APES RETIREMENT,this be basically excess cash be deposit by ba...,2021-12-31 22:27:35,12
94035,rt21tk,"Last of year purchase, another XX at limit ord...",figure gme be go to close sub- $ 150 for the d...,2021-12-31 20:04:21,12
94036,rt3e78,"Nancy Pelosee Posts Are This Weekend's FUD, Ch...","disclaimer : i be smooth than skippy , this be...",2021-12-31 21:15:20,12
94037,rt4thl,What was the best day for GME hodlers in 2021?...,"today . december 31 , 2021 . because today pro...",2021-12-31 22:23:25,12


In [90]:
doc_list = list(df_train['selftext'])
doc_list_test = list(df_test['selftext'])

In [81]:
vectorizer = get_vectors(doc_list, max_features = 10000, strip_accents = None, preprocessor = None,
                lowercase = True, min_df = 25, max_df = 0.90, ngram_range = (1,3), stop_words = 'english')

In [86]:
print(vectorizer.keys())

dict_keys(['matrix', 'vectorizer', 'features', 'parameters'])


In [87]:
#Train the LDA model on the training data
#WARING: Long process time 12-20 minutes with 300,000+ documents.

tic = timeit.default_timer()

lda = LatentDirichletAllocation(n_components = 10, random_state = None)
lda.fit(vectorizer['matrix'])

toc = timeit.default_timer()
print(str((toc - tic)/60) + " minutes")

5.204736479697749 minutes


In [89]:
lda.components_.shape

(10, 10000)

In [93]:
tf_matrix_train = vectorizer['vectorizer'].transform(doc_list_test)

In [94]:
doc_weights_train = lda.transform(vectorizer['matrix'])
doc_weights_test = lda.transform(tf_matrix_train)

In [102]:
print(doc_weights_train.shape)
print(doc_weights_train[0])

(91041, 10)
[0.01111718 0.01111499 0.01111319 0.01111309 0.25166105 0.65941979
 0.01111613 0.01111676 0.01111523 0.01111259]


In [109]:
print(doc_weights_test.shape)
print(doc_weights_test[-1])

(2998, 10)
[0.0018871  0.00188694 0.00188732 0.09572157 0.00188698 0.00188711
 0.53409632 0.17706133 0.0018873  0.18179805]


In [108]:
doc_weights = np.vstack((doc_weights_train, doc_weights_test))
print(doc_weights.shape)
print(doc_weights[0])
print(doc_weights[-1])

(94039, 10)
[0.01111718 0.01111499 0.01111319 0.01111309 0.25166105 0.65941979
 0.01111613 0.01111676 0.01111523 0.01111259]
[0.0018871  0.00188694 0.00188732 0.09572157 0.00188698 0.00188711
 0.53409632 0.17706133 0.0018873  0.18179805]


In [97]:
df = build_and_simplify_dataframe(['gme'])

df = build_and_lemmatize_text_list(df, lemmatize = False, save_string = None)

Dateframe size prior to dropping stuff: 273327


In [110]:
##ADD NEW COLUMNS TO DATAFRAME WITH TOPIC AND SCORES.

df.reset_index(inplace = True)
df['prime_topic'] = None
df['sec_topic'] = None
df['prime_score'] = None
df['sec_score'] = None
df['topic_scores'] = None
for i in df.index:
    df['prime_topic'][i] = np.argsort(doc_weights[i])[::-1][0]
    df['sec_topic'][i] = np.argsort(doc_weights[i])[::-1][1]
    df['prime_score'][i] = np.sort(doc_weights[i])[::-1][0]
    df['sec_score'][i] = np.sort(doc_weights[i])[::-1][1]
    df['topic_scores'][i] = json.dumps(list(doc_weights[i]))
df.head()

Unnamed: 0,index,id,title,selftext,prime_topic,sec_topic,prime_score,sec_score,topic_scores
0,1,kqfajb,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,After watching this I took a position RIGHT AW...,5,4,0.65942,0.251661,"[0.011117176845150983, 0.011114990660403213, 0..."
1,3,kqvp7l,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,This guy explained exactly how to take a posit...,4,8,0.897184,0.070807,"[0.00400092457317314, 0.004000541141261369, 0...."
2,5,krnthg,ICR conference (11th Jan),Any speculation or ideas on what Gamestop migh...,9,6,0.762863,0.216079,"[0.0026321551501030114, 0.002631694763449787, ..."
3,7,kuo3w1,"GME is FINALLY going to the moon, this technic...","After some downwards movement, I think everyb...",6,1,0.539814,0.276462,"[0.004167448796068806, 0.2764616383772655, 0.0..."
4,9,kv1w9e,"Holly f*ck, our GME rollercoaster will break o...",Guysss... we retards have fantasized a long ti...,9,7,0.858623,0.133042,"[0.0010417494059179412, 0.0010418943881130608,..."


In [111]:
top_terms = get_top_terms(lda.components_, vectorizer['features'])

In [127]:
topic_num = 0
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df[df['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['market', 'money', 'short', 'fund', 'make', 'stock', 'hedge', 'pay', 'company', 'people'] 

TITLES:
0 .      Wall St corruption
1 .      SUGGESTION TO GOVERNMENT: Crowdsource/outsource SEC’s job. Give the retail investors a paycheck to do SECs job.
2 .      SmithonStocks.com 10 Part Series on DTCC, Cede&Co., MM's, Big Banks and Hedge Funds
3 .      The stock market reminds me of acupuncture.
4 .      What am I missing in the Everything Short = Demise of the USD thesis?
5 .      Alternate Movie Idea - Rogue Benevolent AI
6 .      Point 72 - possible theory
7 .      Mr Gensler This is a National Security Emergency! Our Financial System has been under attack by Hedge Funds and Money Makers! Where are the NSA and FBI Cyber Units?
8 .      POWER to the PEOPLE
9 .      Young people see the stock market as a (gasp) casino!

TEXTS:
TEXT  0 :
 tl;dr This is an excerpt from an anti-corruption blog. It outlines historical regulatory capture, online shilling, fake news etc with regard

Unnamed: 0,index,id,title,selftext,prime_topic,sec_topic,prime_score,sec_score,topic_scores
67794,153080,mgu9a6,Wall St corruption,tl;dr This is an excerpt from an anti-corrupti...,0,9,0.998898,0.000122,"[0.9988981347595073, 0.00012242899304859163, 0..."
76970,177211,mrc0fi,SUGGESTION TO GOVERNMENT: Crowdsource/outsourc...,Retail investors have rallied together and gat...,0,6,0.993524,0.00072,"[0.9935238431134056, 0.0007195363322507346, 0...."
27442,68696,m210i4,"SmithonStocks.com 10 Part Series on DTCC, Cede...",A must read. There is absolutely no way to kno...,0,8,0.993232,0.000752,"[0.9932318042775634, 0.0007519681814660336, 0...."
58151,132171,mcs7v1,The stock market reminds me of acupuncture.,The placebo effect is stronger when the rules ...,0,7,0.991508,0.000944,"[0.9915077114530454, 0.0009434382649054735, 0...."
69714,158131,mhzb1m,What am I missing in the Everything Short = De...,This is an honest question about something I d...,0,9,0.991426,0.000953,"[0.9914262672699152, 0.000952569012765224, 0.0..."
61110,138228,mdjufo,Alternate Movie Idea - Rogue Benevolent AI,"The ""Company""'s internal software developers w...",0,6,0.991087,0.00099,"[0.9910874166247122, 0.0009901432707584029, 0...."
88543,231883,oqt6fp,Point 72 - possible theory,"Can it be that Coin - Base, or the Coin - Base...",0,8,0.989771,0.001137,"[0.9897706913255876, 0.001136689188943921, 0.0..."
86411,214725,nv7tb9,Mr Gensler This is a National Security Emergen...,"Gary , I don’t think you can you handle this o...",0,6,0.98941,0.001177,"[0.989410288287364, 0.0011765584745700441, 0.0..."
90795,249063,pw0kbi,POWER to the PEOPLE,We the people are not allowed to make a capita...,0,7,0.987998,0.001334,"[0.9879978776462286, 0.0013334132680703447, 0...."
25264,63749,m19mby,Young people see the stock market as a (gasp) ...,Of course they do. Young people have seen at l...,0,6,0.987997,0.001334,"[0.9879974270878883, 0.0013334137232996737, 0...."


In [154]:
topics = {}
topics[0] = "Crime/Nefarious Use of Stock Market"

In [155]:
topic_num = 1
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df[df['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['http', 'com', 'www', 'http www', 'reddit', 'reddit com', 'html', 'www reddit', 'www reddit com', 'http www reddit'] 

TITLES:
0 .      Tomorrow at 2pm the federal reserve is going to announce interest rate predictions. This could be huge. If interest go up then shorting becomes more expensive. Treasury yields are up 1.62% today
1 .      Fails-to-Deliver data probably available this week
2 .      YOU BAN RENSOLE BUT NOT TOASTER?
3 .      Ring ring ring: margin calllllling !!! Citadel issued bonds in 2006 with a BBB+ vs BBB- in 2021
4 .      Since the section "Notable Former Personnel" in the SEC article on Wikipedia has been removed today right after I posted about it on here, I'll post another useful article from Wikipedia
5 .      She said: how $CUM do you have so much $CUM
6 .      DeepFuckingValue Testifed today in Massachetus hearing
7 .      when is earnings release today?
8 .      Maybe the HFs should have listened to Warren B
9 .      Credit Suisse takes hit from U

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
40279,m6ov56,Tomorrow at 2pm the federal reserve is going t...,[federal reserve article ](https://www.cnbc.co...,1,"New Post Links, Other Links",3,0.989284,0.001191,"[0.001190685324402378, 0.9892843427708264, 0.0..."
11186,lrl1ai,Fails-to-Deliver data probably available this ...,February 1st half FTD data could be published ...,1,"New Post Links, Other Links",2,0.988747,0.001251,"[0.00125023960049452, 0.9887471569899713, 0.00..."
74315,mkm6xq,YOU BAN RENSOLE BUT NOT TOASTER?,Toasterrr has a crypt0 wallet in his bio aswel...,1,"New Post Links, Other Links",6,0.988459,0.001283,"[0.0012822514324869448, 0.9884592696488622, 0...."
67299,mgm7ju,Ring ring ring: margin calllllling !!! Citadel...,Citadel issued 500M$ of BBB+ bonds in 2006 vs ...,1,"New Post Links, Other Links",0,0.987999,0.001334,"[0.001333662066433838, 0.9879986241345864, 0.0..."
77740,mubfuf,"Since the section ""Notable Former Personnel"" i...","Before you say anything about it, Wikipedia is...",1,"New Post Links, Other Links",3,0.987914,0.008726,"[0.00015065268445900817, 0.9879136211998955, 0..."
21460,lxzg1j,She said: how $CUM do you have so much $CUM,$CUM $CUM $CUM $CUM $CUM $CUM $CUM $CUM $CUM $...,1,"New Post Links, Other Links",3,0.987324,0.001409,"[0.0014084507042361016, 0.9873236216457285, 0...."
20282,lx5e0r,DeepFuckingValue Testifed today in Massachetus...,[u/DeepFuckingValue](https://www.reddit.com/u/...,1,"New Post Links, Other Links",4,0.985934,0.001563,"[0.001562706878115346, 0.9859344763240148, 0.0..."
93271,rc1fl5,when is earnings release today?,where can i see it?\n\n\nfill text fill text f...,1,"New Post Links, Other Links",5,0.985245,0.00164,"[0.0016393978460747128, 0.9852452595051406, 0...."
50199,maoqwg,Maybe the HFs should have listened to Warren B,"""Never bet against America"" - It appears to me...",1,"New Post Links, Other Links",0,0.985244,0.00164,"[0.001639848933341041, 0.9852436705289831, 0.0..."
65191,mfixed,Credit Suisse takes hit from U.S. hedge fund; ...,"First GS, then Nomura and now Credit Suisse.\n...",1,"New Post Links, Other Links",0,0.984998,0.001667,"[0.0016671975973551933, 0.9849983604345044, 0...."


In [156]:
topics[1] = "New Post Links, Other Links"

In [157]:
topic_num = 2
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df[df['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['price', 'share', 'buy', 'stock', 'short', 'gme', 'sell', 'day', 'market', 'volume'] 

TITLES:
0 .      GME discuss the possibility of a short squeeze in Q4 report 🚀🚀
1 .      “SHORT SQUEEZE possible” mentioned in GME 10K! They know what’s to come!
2 .      GameStop 10-K Annual report acknowledges the possibility of another short squeeze on their Class A shares bc of extreme shorting on their stock. Slick hint couched as a risk precaution. Very nice.🦍🚀🚀🚀
3 .      buy, hold, shrug, yawn, sigh, buy, hold, shrug, yawn, sigh
4 .      How can we know how much of GME is still shorted?
5 .      How would buying back more than 100% be possible?
6 .      $XRT March 19th rebalance
7 .      Premarket
8 .      I have a question
9 .      Low Volume Theory

TEXTS:
TEXT  0 :
 Please see quoted extract below 🚀🚀🚀🚀

"Risks Related to Our Common Stock

The market price of our Class A Common Stock has been extremely volatile and may continue to be volatile due to numerous circumstances beyon

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
53352,mbphhw,GME discuss the possibility of a short squeeze...,"Please see quoted extract below 🚀🚀🚀🚀\n\n""Risks...",2,Trading/Shorting GME,8,0.997115,0.000321,"[0.000320576187058441, 0.0003205271660432143, ..."
53719,mbqnov,“SHORT SQUEEZE possible” mentioned in GME 10K!...,Wow did anyone see this? A prediction of what’...,2,Trading/Shorting GME,8,0.996808,0.000355,"[0.00035467528783351054, 0.0003546345006502414..."
54322,mbvi0d,GameStop 10-K Annual report acknowledges the p...,On Page 15 they say:\n\n***A “short squeeze” d...,2,Trading/Shorting GME,8,0.996666,0.000371,"[0.00037043612073027834, 0.0003703952170478702..."
39516,m6dmsm,"buy, hold, shrug, yawn, sigh, buy, hold, shrug...","buy, hold, shrug, yawn, sigh, buy, hold, shrug...",2,Trading/Shorting GME,7,0.995544,0.000495,"[0.0004950655325326855, 0.0004950509861853628,..."
49891,majnws,How can we know how much of GME is still shorted?,I'm an ape with no knowledge about stocks and ...,2,Trading/Shorting GME,7,0.989886,0.001124,"[0.001123722261842524, 0.001123629570911756, 0..."
44386,m89ae0,How would buying back more than 100% be possible?,So I am working under the assumption that Hedg...,2,Trading/Shorting GME,7,0.989284,0.001191,"[0.0011906939908408552, 0.0011904956011423728,..."
8403,lnltpj,$XRT March 19th rebalance,I have a question about the March 19th Rebalan...,2,Trading/Shorting GME,8,0.988606,0.001266,"[0.0012660247334172138, 0.001265851360990433, ..."
30649,m2ntx0,Premarket,Looks like GME is down almost 10% premarket. I...,2,Trading/Shorting GME,3,0.986762,0.001471,"[0.0014707504364347525, 0.0014706409568676282,..."
67647,mgrse1,I have a question,I’m still new to this so bull with me . \nI un...,2,Trading/Shorting GME,8,0.986762,0.001471,"[0.0014709124038357504, 0.0014706294545498914,..."
43091,m7u46c,Low Volume Theory,So with XRT scheduled to rebalance tomorrow al...,2,Trading/Shorting GME,7,0.986762,0.001471,"[0.001471051863694702, 0.0014706713552466567, ..."


In [158]:
topics[2] = "Trading/Shorting GME"

In [159]:
topic_num = 3
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df[df['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['option', 'price', 'order', 'share', 'gme', 'sell', '2021', 'x200b', '10', 'join'] 

TITLES:
0 .      Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n k Join Su p er s to n kJoin Su p er s to n k
1 .      What if we filled the sub with stupidity for bots to feed their algos with shit
2 .      7/14 After hours after math
3 .      Not on the moon, past the moon?
4 .      SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR
5 .      Option Open Interest Shifts for Yesterday, 3/15
6 .      Why so many put options on April 16 for GME?
7 .      It was nice knowing you , but the diamonds are now coal again
8 .      Diamantenhände 💎👐 German market is open 🇩🇪
9 .      Diamantenhände 💎👐 German market is open! 🇩🇪

TEXTS:
TEXT  0 :
 

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
74820,mktn6a,Join Su p er s to n k Join Su p er s to n k Jo...,Join Su p er s to n k Join Su p er s to n k Jo...,3,Data Posts/Nonsense,6,0.999742,2.9e-05,"[2.8719151702376126e-05, 2.8719163147132506e-0..."
43929,m83h6y,What if we filled the sub with stupidity for b...,Imagine posting your positions like\n\n90@350 ...,3,Data Posts/Nonsense,8,0.999654,3.8e-05,"[3.849145119632844e-05, 3.849321381139157e-05,..."
88178,olufb9,7/14 After hours after math,"Hi All, trying to gain a wrinkle here. feel so...",3,Data Posts/Nonsense,7,0.999183,9.1e-05,"[9.075712963016078e-05, 9.074888525439773e-05,..."
52455,mbjeh2,"Not on the moon, past the moon?",Yeah? Yeah? Yeah? Yeah? Yeah? Yeah? Yeah? Yeah...,3,Data Posts/Nonsense,7,0.998831,0.00013,"[0.00012987174811886294, 0.0001298701919155671..."
30750,m2p7ya,SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SS...,SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SSR SS...,3,Data Posts/Nonsense,2,0.998048,0.000217,"[0.00021691973969729415, 0.0002169197396982486..."
39065,m6a9vb,"Option Open Interest Shifts for Yesterday, 3/15","*This ain't financial advice, I have long posi...",3,Data Posts/Nonsense,6,0.997428,0.000286,"[0.0002857465139041023, 0.00028572997504625644..."
20028,lwzrop,Why so many put options on April 16 for GME?,There are unusual number of put options(volume...,3,Data Posts/Nonsense,1,0.994857,0.000572,"[0.0005714549327731552, 0.000571537072827673, ..."
86652,nww6pn,"It was nice knowing you , but the diamonds are...",💎 🙌 💎 🙌 💎 🙌 💎 🙌 💎 🙌 💎 🙌 💎 🙌 💎 🙌 💎 🙌 ...,3,Data Posts/Nonsense,2,0.993617,0.000709,"[0.0007092483396506261, 0.0007092264752941137,..."
23698,m0a3yw,Diamantenhände 💎👐 German market is open 🇩🇪,"It is time for monday, I hope you all had a gr...",3,Data Posts/Nonsense,7,0.993282,0.000747,"[0.0007463951112412646, 0.0007463328788660941,..."
18639,lvw4i0,Diamantenhände 💎👐 German market is open! 🇩🇪,I hope you all will have a fantastic day!🦍\n\n...,3,Data Posts/Nonsense,5,0.993181,0.000758,"[0.0007577095365899231, 0.0007576403454109871,..."


In [160]:
topics[3] = "Data Posts/Nonsense"

In [161]:
topic_num = 4
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df[df['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['http', 'png', 'com', 'www', 'http www', 'x200b', 'redd', 'width', 'auto', 'format'] 

TITLES:
0 .      drives me crazy
1 .      r/GME Megathread for Tuesday - May 25, 2021
2 .      r/GME Megathread for Monday - September 20, 2021
3 .      r/GME Megathread for Thursday - July 22, 2021
4 .      r/GME Megathread for Tuesday - July 20, 2021
5 .      r/GME Megathread for Monday - July 19, 2021
6 .      AMA Recap
7 .      r/GME Megathread for April 17, 2021
8 .      Daily Discussion Thread & FAQ & Important Announcements
9 .      Daily Discussion Thread & FAQ & Important Announcements

TEXTS:
TEXT  0 :
 I have been trying to draw attention to the fact that the deposit requirement was waved since Wednesday (one day before the hearing). I posted 10 different posts on it. 

[https://www.reddit.com/r/GME/comments/lm5loy/vlad\_testimony\_question/?utm\_source=share&utm\_medium=web2x&context=3](https://www.reddit.com/r/GME/comments/lm5loy/vlad_testimony_question/?utm_source=share&utm

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
9018,lob98y,drives me crazy,I have been trying to draw attention to the fa...,4,"MOD Posts, Many Links",5,0.998723,0.000142,"[0.00014186930792930522, 0.0001418636626904969..."
84419,nkikkc,"r/GME Megathread for Tuesday - May 25, 2021","This is a place to discuss technical analysis,...",4,"MOD Posts, Many Links",9,0.9985,0.000167,"[0.0001667059370776548, 0.0001666940168556269,..."
90497,prosm2,"r/GME Megathread for Monday - September 20, 2021","This is a place to discuss technical analysis,...",4,"MOD Posts, Many Links",5,0.998453,0.000172,"[0.0001718580049202809, 0.00017184978303232568..."
88435,op7wz2,"r/GME Megathread for Thursday - July 22, 2021","This is a place to discuss technical analysis,...",4,"MOD Posts, Many Links",6,0.998081,0.000213,"[0.00021325684132566343, 0.0002132523955620053..."
88350,onw51g,"r/GME Megathread for Tuesday - July 20, 2021","This is a place to discuss technical analysis,...",4,"MOD Posts, Many Links",6,0.998052,0.000217,"[0.00021648436071325622, 0.0002164790174877160..."
88287,on82hf,"r/GME Megathread for Monday - July 19, 2021","This is a place to discuss technical analysis,...",4,"MOD Posts, Many Links",1,0.997788,0.000246,"[0.0002457205029371294, 0.0002457412065658774,..."
63813,memz5j,AMA Recap,*Automod wouldn't let me post a summary or cop...,4,"MOD Posts, Many Links",0,0.996979,0.000336,"[0.0003357242050138274, 0.00033567162820564105..."
77351,msjqh0,"r/GME Megathread for April 17, 2021","This is a place to discuss technical analysis,...",4,"MOD Posts, Many Links",6,0.996265,0.000415,"[0.0004149985491088856, 0.000414999733046001, ..."
76956,mra4xn,Daily Discussion Thread & FAQ & Important Anno...,"This is a place to discuss technical analysis,...",4,"MOD Posts, Many Links",6,0.996265,0.000415,"[0.0004149985491088856, 0.000414999733046001, ..."
76717,mqly1s,Daily Discussion Thread & FAQ & Important Anno...,"This is a place to discuss technical analysis,...",4,"MOD Posts, Many Links",6,0.996103,0.000433,"[0.0004329674435939109, 0.0004329679352918022,..."


In [162]:
topics[4] = "MOD Posts, Many Links"

In [163]:
topic_num = 5
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df[df['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['share', '000', 'account', 'transfer', 'broke', 'sell', 'fidelity', 'gme', 'robinhood', 'trade'] 

TITLES:
0 .      Canadian Apes: How to transfer your shares from your RBC Direct Investing (RBC DI) account to Computershare. (I just did it, it was pretty painless once I understood the process and had a knowledgeable agent.)
1 .      Newby here....
2 .      Just rec'd this slightly sus. email from APEX
3 .      Did any other WeBull users get this letter from Apex?
4 .      Collateral change coming 4/22 at J.P Morgan
5 .      For all you RH users procrastinating from switching to Fidelity due to FOMO of that MOASS
6 .      The DRS list. For those who wish to direct register, apes salute you!
7 .      Initiated (3) transfers from RH to Fidelity this week. Each were completed within 30 hours of creation.
8 .      Canadian Apes ~ IBKR - WealthSimple, BMO Investorline

TEXTS:
TEXT  0 :
 ***Disclaimer***: I transferred shares from an Investment (Cash) account, not a registered (T

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
90428,pqb703,Canadian Apes: How to transfer your shares fro...,***Disclaimer***: I transferred shares from an...,5,Trading Accounts,9,0.997406,0.000288,"[0.00028824771337231266, 0.000288197015122261,..."
70120,mi6ns7,Newby here....,Still a newby to the whole gme thing jumped in...,5,Trading Accounts,0,0.996399,0.0004,"[0.00040010030558814346, 0.0004000713138368298..."
36953,m5pcv6,Just rec'd this slightly sus. email from APEX,￼\n I have Sofi and have my GME there. Any tho...,5,Trading Accounts,0,0.996249,0.000417,"[0.00041679863373621066, 0.0004167423386088103..."
37204,m5rxy8,Did any other WeBull users get this letter fro...,"Apex Clearing Corporation (""Apex"") is notifyin...",5,Trading Accounts,0,0.99612,0.000431,"[0.00043114555052344456, 0.0004311088078523137..."
65293,mfl2pp,Collateral change coming 4/22 at J.P Morgan,"Apex Clearing Corporation (""Apex"") is notifyin...",5,Trading Accounts,0,0.99612,0.000431,"[0.00043114555052344456, 0.0004311088078523137..."
78202,mvx6d7,For all you RH users procrastinating from swit...,I have put together a little guide on my exper...,5,Trading Accounts,7,0.995651,0.000483,"[0.00048319202604259556, 0.0004831253979871731..."
90165,pn0n52,The DRS list. For those who wish to direct reg...,Inspired by Apes I'm trying to get a list toge...,5,Trading Accounts,7,0.995186,0.000535,"[0.0005348767625284863, 0.0005348286859080134,..."
63022,me805e,Initiated (3) transfers from RH to Fidelity th...,Transfers were partial account transfers each ...,5,Trading Accounts,3,0.994577,0.000603,"[0.0006025531530647454, 0.0006024791447222335,..."
88504,oq5k4q,"Canadian Apes ~ IBKR - WealthSimple, BMO Inves...",Hello my Fellow Apes.\n\nI am a xxxx GME holde...,5,Trading Accounts,0,0.993524,0.00072,"[0.0007197617486502546, 0.0007194461353172055,..."
76177,mov891,WARNING: Last week I made a partial account tr...,**Background:**\n\nI have securities spread ov...,5,Trading Accounts,8,0.992682,0.000813,"[0.0008131742082328424, 0.0008130569226226944,..."


In [164]:
topics[5] = "Trading Accounts"

In [165]:
topic_num = 6
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df[df['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['post', 'people', 'just', 'like', 'know', 'make', 'think', 'dd', 'gme', 'say'] 

TITLES:
0 .      Can we please limit use of the word SHILL to actual shills?
1 .      Shills have been adapting right in front of our faces and it seems many haven't yet caught on.
2 .      Regarding Drama, Echo-Chambers, Pseudo-celebrities and Power Tripping
3 .      Thank You
4 .      🦍 no fight 🦍 but 🦍 no sell promote. Let your DD talk for yourself.
5 .      Problem with this community calling everyone a shill.
6 .      Hello again, sorry for my previous post
7 .      Clearly the moderators of this sub are confused with definition of "transparency", and it's upsetting to see.
8 .      Give room for different coping mechanisms please
9 .      Potential method for fighting Shills/Bots

TEXTS:
TEXT  0 :
 TL;DR not everyone you disagree with is a shill. Some are trolls, some noobs, some idiots, some just people who disagree. Some are all of the above. 

I probably fit 3/4 of the above: noob idi

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
66732,mgcsbx,Can we please limit use of the word SHILL to a...,TL;DR not everyone you disagree with is a shil...,6,"Complaints/Comments of other users (shills, tr...",8,0.998189,0.000201,"[0.00020125265530098792, 0.0002012268603852578..."
19423,lwh29u,Shills have been adapting right in front of ou...,"Hello again everyone,\n\nI honestly thought th...",6,"Complaints/Comments of other users (shills, tr...",8,0.997846,0.000239,"[0.00023929034778640628, 0.0002392463516243512..."
73298,mkg8a5,"Regarding Drama, Echo-Chambers, Pseudo-celebri...",Hey. Jumping straight to the point.\n\nWith th...,6,"Complaints/Comments of other users (shills, tr...",7,0.996202,0.000422,"[0.0004220379318428678, 0.00042197122872526354..."
74329,mkmcsa,Thank You,Thank you for all the hard work that you did. ...,6,"Complaints/Comments of other users (shills, tr...",9,0.995963,0.000449,"[0.000448580139263299, 0.00044844911553125656,..."
66954,mggfa8,🦍 no fight 🦍 but 🦍 no sell promote. Let your D...,I’d like to throw my 2 cents in about the shil...,6,"Complaints/Comments of other users (shills, tr...",7,0.995871,0.000459,"[0.00045880417240241737, 0.0004587316099483733..."
66067,mfz92y,Problem with this community calling everyone a...,I've seen this on countless posts. Maybe 2 out...,6,"Complaints/Comments of other users (shills, tr...",7,0.995186,0.000535,"[0.000534854996309823, 0.0005348088235439685, ..."
15004,lsxix9,"Hello again, sorry for my previous post",First I just want to say thank you to everyone...,6,"Complaints/Comments of other users (shills, tr...",9,0.994856,0.000572,"[0.0005715275251344349, 0.0005714786928952369,..."
73280,mkg3an,Clearly the moderators of this sub are confuse...,I'd like to remind the mods of this sub that i...,6,"Complaints/Comments of other users (shills, tr...",0,0.994856,0.000572,"[0.0005716179494393705, 0.0005714866292226545,..."
4781,lhif9b,Give room for different coping mechanisms please,Hi All..\n\nJust wanted to share a thought I h...,6,"Complaints/Comments of other users (shills, tr...",7,0.994673,0.000592,"[0.0005918407303752101, 0.0005917408733770764,..."
9551,lpc784,Potential method for fighting Shills/Bots,Shills and misinformation are of particular co...,6,"Complaints/Comments of other users (shills, tr...",3,0.994338,0.000629,"[0.0006291351174950593, 0.0006289824661378241,..."


In [166]:
topics[6] = "Complaints/Comments of other users (shills, trolls, etc.)"

In [167]:
topic_num = 7
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df[df['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['ape', 'just', 'hold', 'buy', 'fuck', 'know', 'like', 'gme', 'hand', 'want'] 

TITLES:
0 .      when hedgies & cnbc say sell!!!!!!!!
1 .      Just got off the phone with E-Trade on hold for 2 hours. GameStop shares on the way to CS!!! Will post an update when they arrive
2 .      I say Diamond Hands You say Hold!
3 .      I don’t know what to say, really. We are months into the biggest battle of our professional lives, and it all comes down to us apes.
4 .      John Parr - St. Elmo's Fire (Man in Motion)
5 .      Positive Post for 💎🙌
6 .      GME: The game of Tug O' War
7 .      I'm becoming greedy as fuck and I'm loving every second of it 💎
8 .      My car just broke down and i'll be buying GME and start walking instead of fixing it.
9 .      Hammer to Fall (before the rocket launches)

TEXTS:
TEXT  0 :
 🙈🙉🙊
buy hold not financial advice
🙈🙉🙊
buy hold not financial advice
🙈🙉🙊
buy hold not financial advice
🙈🙉🙊
buy hold not financial advice
🙈🙉🙊
buy hold not financial advice


Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
24334,m0sa2x,when hedgies & cnbc say sell!!!!!!!!,🙈🙉🙊\nbuy hold not financial advice\n🙈🙉🙊\nbuy h...,7,"Diamond Hands, Apes Vs. Hedge Funds",2,0.99795,0.000228,"[0.0002277966648729307, 0.00022779265270967422..."
93222,rbbhoy,Just got off the phone with E-Trade on hold fo...,Diamond hands Diamond hands Diamond hands Diam...,7,"Diamond Hands, Apes Vs. Hedge Funds",8,0.996853,0.00035,"[0.00034965207340737815, 0.0003496504243779907..."
11561,lrn7x7,I say Diamond Hands You say Hold!,DIAMOND HANDS! HOLD!!!! DIAMOND HANDS! HOLD!!!...,7,"Diamond Hands, Apes Vs. Hedge Funds",8,0.996471,0.000392,"[0.0003921595875943275, 0.0003921572200236522,..."
36925,m5p4v6,"I don’t know what to say, really. We are month...",Now either we hold as a team or we’re gonna cr...,7,"Diamond Hands, Apes Vs. Hedge Funds",5,0.995237,0.000529,"[0.0005292837424248255, 0.0005291207385606375,..."
65120,mfhi1s,John Parr - St. Elmo's Fire (Man in Motion),Anyone else got any GME appropriate music to k...,7,"Diamond Hands, Apes Vs. Hedge Funds",6,0.994444,0.000617,"[0.0006174344799808258, 0.0006173185033506519,..."
16046,ltbyci,Positive Post for 💎🙌,"If you're reading this, it means you're a fell...",7,"Diamond Hands, Apes Vs. Hedge Funds",5,0.993998,0.000667,"[0.0006668559530568705, 0.0006667130266961896,..."
34870,m4rca8,GME: The game of Tug O' War,You vs the hedgies\n\nIf you pull hard enough ...,7,"Diamond Hands, Apes Vs. Hedge Funds",0,0.993022,0.000775,"[0.0007754570383760195, 0.0007752233084711008,..."
26678,m1swry,I'm becoming greedy as fuck and I'm loving eve...,When I first started talking to my wife's boyf...,7,"Diamond Hands, Apes Vs. Hedge Funds",3,0.992912,0.000788,"[0.0007875063920239888, 0.0007874501764691362,..."
81599,n5azus,My car just broke down and i'll be buying GME ...,"Hello fellow apes, i'm a retard from Sweden an...",7,"Diamond Hands, Apes Vs. Hedge Funds",2,0.992798,0.0008,"[0.0008002103382465508, 0.0008001289021230618,..."
74539,mkoy0b,Hammer to Fall (before the rocket launches),Queens knows what's up. Nothing has changed si...,7,"Diamond Hands, Apes Vs. Hedge Funds",8,0.992741,0.000807,"[0.0008066853691157585, 0.000806519575836554, ..."


In [168]:
topics[7] = "Diamond Hands, Apes Vs. Hedge Funds "

In [169]:
topic_num = 8
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df[df['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['short', 'squeeze', 'gme', 'cover', 'position', 'happen', 'share', 'long', 'hold', 'hedge'] 

TITLES:
0 .      HOLD
1 .      OK Hear me out!
2 .      Hold on, am I missing something?
3 .      Why the congressional hearing on Thursday may not be the catalyst that is needed for a squeeze
4 .      The two ways this could go
5 .      Who are the players in this MOASS?
6 .      Wouldn’t it make more sense to watch for smaller shorting HFs failing margin calls than Citadel and watch for a domino effect?
7 .      $483/share with RH and IB lockout. Selling below $483 is stupid.
8 .      AMC isn’t the naked short problem!
9 .      If all apes hold 1 share from their original position through the entire and buy-in post-MOASS, we would still probably control the float...

TEXTS:
TEXT  0 :
 HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HO

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
11717,lrnq0n,HOLD,HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD HOLD H...,8,Short Squeeze by Holding Stock,7,0.993525,0.00072,"[0.0007194344642761793, 0.0007194267654136258,..."
67257,mgljwc,OK Hear me out!,Hold hold hold hold hold hold hold hold hold h...,8,Short Squeeze by Holding Stock,7,0.990217,0.001087,"[0.0010869882722971183, 0.001086965180389725, ..."
22150,lyhkue,"Hold on, am I missing something?","During the congressional hearing, Gabe Plotkin...",8,Short Squeeze by Holding Stock,7,0.989532,0.001164,"[0.0011632184682831307, 0.0011629065880342077,..."
6249,lketcx,Why the congressional hearing on Thursday may ...,Okay so by this point we all know that a squee...,8,Short Squeeze by Holding Stock,7,0.98941,0.001177,"[0.001176741015710618, 0.0011765213382549367, ..."
35347,m50s9h,The two ways this could go,The way I see this happening is really simple ...,8,Short Squeeze by Holding Stock,0,0.988155,0.001316,"[0.0013162941968633266, 0.001315863598082148, ..."
16810,lu2yam,Who are the players in this MOASS?,So what do you folks think is happening here? ...,8,Short Squeeze by Holding Stock,6,0.98448,0.001725,"[0.0017246410522025986, 0.0017242554618925176,..."
86249,numg1a,Wouldn’t it make more sense to watch for small...,So I gather bringing Shitadel down to a margin...,8,Short Squeeze by Holding Stock,2,0.983016,0.001887,"[0.0018873930502876991, 0.0018868860551226356,..."
12622,lrwets,$483/share with RH and IB lockout. Selling bel...,Just my thesis. We know from Vlad and Thomas P...,8,Short Squeeze by Holding Stock,2,0.981247,0.002084,"[0.002083783643490344, 0.0020834090364702405, ..."
86057,nt36du,AMC isn’t the naked short problem!,"Ok what if Citadel have power over CNBC, Melis...",8,Short Squeeze by Holding Stock,0,0.980847,0.002129,"[0.0021285018228740537, 0.0021279139946592373,..."
82597,n8hw2d,If all apes hold 1 share from their original p...,"Theoretically, if SHFs have shorted the entire...",8,Short Squeeze by Holding Stock,7,0.978566,0.002382,"[0.002381486189471185, 0.0023810224552835236, ..."


In [170]:
topics[8] = "Short Squeeze by Holding Stock"

In [171]:
topic_num = 9
print("Top Terms: ", top_terms[topic_num], "\n")
df_temp = df[df['prime_topic'] == topic_num].sort_values('prime_score', ascending = False).head(10)
print('TITLES:')
for i, title in enumerate(df_temp['title']):
    print(i, ".     ", title)
print('\nTEXTS:')
for i, text in enumerate(df_temp['selftext']):
    print("TEXT ", i, ":\n", text, "\n\n\n")
df_temp

Top Terms:  ['gamestop', 'company', 'game', 'stock', 'like', 'vote', 'cohen', 'like stock', 'amp', 'news'] 

TITLES:
0 .      My Honest Thoughts on GME
1 .      JACKED TO THE TITS? Relieve some stress with bubble wrap
2 .      4 Additional Board Members Expected to leave in June. Possible additional changes to senior executives.
3 .      Game Stop hearing part II, witnesses and their written testimony.
4 .      Links to all witness testimonies
5 .      GAMESTOP NEWS RELEASE
6 .      GameStop Appoints Chief Growth Officer Announces Two Additional Executive Hires to Support Transformation
7 .      GameStop appoints Chief Growth Officer Elliot Wilke 30th Mar 2021 plus 2 VPs 🚀🚀
8 .      RYAN COHEN FOR BOARD DIRECTOR! 9TH JUNE ANNUAL MEETING!
9 .      Some bubble wrap to get you through the weekend

TEXTS:
TEXT  0 :
 I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the stock.
I like the sto

Unnamed: 0,id,title,selftext,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
63759,melyvr,My Honest Thoughts on GME,I like the stock.\nI like the stock.\nI like t...,9,,2,0.999896,1.2e-05,"[1.1577191975706627e-05, 1.1576881230692815e-0..."
69420,mhuvt0,JACKED TO THE TITS? Relieve some stress with b...,HOW JACKED ARE YOU?\n\n>!pop!< >!pop!< >!pop!<...,9,,7,0.998594,0.000156,"[0.00015625039204550153, 0.0001562500115275742..."
53849,mbrjug,4 Additional Board Members Expected to leave i...,[Buried in the 10-K that Gamestop released tod...,9,,3,0.997297,0.0003,"[0.0003003441569304925, 0.0003003571922321434,..."
40269,m6ojsn,"Game Stop hearing part II, witnesses and their...",EDIT 1: I believe you should all read Michael ...,9,,8,0.997087,0.000324,"[0.00032372887283135804, 0.0003237255850606712..."
6759,lm8vif,Links to all witness testimonies,DFV: \n[https://docs.house.gov/meetings/BA/BA...,9,,1,0.996939,0.00034,"[0.00034016033759674846, 0.0003402546871239961..."
10625,lqtr5c,GAMESTOP NEWS RELEASE,[https://news.gamestop.com/news-releases/news-...,9,,5,0.995336,0.000518,"[0.0005181912156153124, 0.0005182328648812509,..."
66749,mgd6n4,GameStop Appoints Chief Growth Officer Announc...,Thought I'd share...\n\nGLOBENEWSWIRE 3:45 AM ...,9,,3,0.995134,0.000541,"[0.0005406630276684973, 0.000540594233356136, ..."
66821,mgeggn,GameStop appoints Chief Growth Officer Elliot ...,https://news.gamestop.com/news-releases/news-r...,9,,0,0.994856,0.000572,"[0.0005715652215522834, 0.000571494770754272, ..."
75554,mmprm9,RYAN COHEN FOR BOARD DIRECTOR! 9TH JUNE ANNUAL...,Press release from GameStop Website:\n\n>GRAPE...,9,,1,0.994767,0.000582,"[0.000581501772165176, 0.0005815303495417381, ..."
63714,melfcp,Some bubble wrap to get you through the weekend,I've hidden a gme share for you since you can'...,9,,2,0.994767,0.000582,"[0.000581449077988987, 0.00058141964669626, 0...."


In [172]:
topics[9] = "Press Releases/News"

In [173]:
cohe_scores = cohe_score_func(lda, vectorizer['matrix'], vectorizer['features'])
cohe_scores

(-1.3833405708724245,
 [-1.2971767982697149,
  -0.9447542801849123,
  -1.3256093515871266,
  -1.897065351454387,
  -0.9541736602119025,
  -1.6873388640628844,
  -0.970468425109287,
  -1.1887898428118846,
  -1.2570197654198243,
  -2.311009369612322])

In [176]:
for i, key in enumerate(topics.keys()):
    top = topics[key]
    cohe = cohe_scores[1][i]
    avg = df[df['prime_topic'] == key]['prime_score'].mean()
    count = df[df['prime_topic'] == key]['prime_score'].count()
    print("Topic ", key, " is assigned to ", count, " documents - ", top)
    print("Avg. probability: ", avg, ", Coherence: ", cohe, "\n")

Topic  0  is assigned to  5595  documents -  Crime/Nefarious Use of Stock Market
Avg. probability:  0.5710298852312424 , Coherence:  -1.2971767982697149 

Topic  1  is assigned to  4264  documents -  New Post Links, Other Links
Avg. probability:  0.6945787325951245 , Coherence:  -0.9447542801849123 

Topic  2  is assigned to  9141  documents -  Trading/Shorting GME
Avg. probability:  0.6198865508214828 , Coherence:  -1.3256093515871266 

Topic  3  is assigned to  2897  documents -  Data Posts/Nonsense
Avg. probability:  0.5849351586266535 , Coherence:  -1.897065351454387 

Topic  4  is assigned to  6730  documents -  MOD Posts, Many Links
Avg. probability:  0.7243048044864101 , Coherence:  -0.9541736602119025 

Topic  5  is assigned to  6974  documents -  Trading Accounts
Avg. probability:  0.6385478373457785 , Coherence:  -1.6873388640628844 

Topic  6  is assigned to  16885  documents -  Complaints/Comments of other users (shills, trolls, etc.)
Avg. probability:  0.656112363477439 , 

In [177]:
df['topic'] = df['prime_topic'].map(topics)

In [181]:
df = df[['id', 'prime_topic', 'topic',
       'sec_topic', 'prime_score', 'sec_score', 'topic_scores']]
df

Unnamed: 0,id,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
0,kqfajb,5,Trading Accounts,4,0.65942,0.251661,"[0.011117176845150983, 0.011114990660403213, 0..."
1,kqvp7l,4,"MOD Posts, Many Links",8,0.897184,0.070807,"[0.00400092457317314, 0.004000541141261369, 0...."
2,krnthg,9,Press Releases/News,6,0.762863,0.216079,"[0.0026321551501030114, 0.002631694763449787, ..."
3,kuo3w1,6,"Complaints/Comments of other users (shills, tr...",1,0.539814,0.276462,"[0.004167448796068806, 0.2764616383772655, 0.0..."
4,kv1w9e,9,Press Releases/News,7,0.858623,0.133042,"[0.0010417494059179412, 0.0010418943881130608,..."
...,...,...,...,...,...,...,...
94034,rt0gj1,0,Crime/Nefarious Use of Stock Market,6,0.647157,0.199466,"[0.6471571631425317, 0.0006623205170025277, 0...."
94035,rt21tk,3,Data Posts/Nonsense,6,0.687784,0.288683,"[0.0029418353213479316, 0.0029413130444606744,..."
94036,rt3e78,6,"Complaints/Comments of other users (shills, tr...",0,0.698473,0.266736,"[0.26673640358182354, 0.0008197966524948154, 0..."
94037,rt4thl,7,"Diamond Hands, Apes Vs. Hedge Funds",9,0.443612,0.415039,"[0.0037051141132133664, 0.0037045295082803703,..."


In [183]:
#df.to_csv('df_test_train_split.csv', index = False)
df = pd.read_csv('df_test_train_split.csv')
df

Unnamed: 0,id,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
0,kqfajb,5,Trading Accounts,4,0.659420,0.251661,"[0.011117176845150983, 0.011114990660403213, 0..."
1,kqvp7l,4,"MOD Posts, Many Links",8,0.897184,0.070807,"[0.00400092457317314, 0.004000541141261369, 0...."
2,krnthg,9,Press Releases/News,6,0.762863,0.216079,"[0.0026321551501030114, 0.002631694763449787, ..."
3,kuo3w1,6,"Complaints/Comments of other users (shills, tr...",1,0.539814,0.276462,"[0.004167448796068806, 0.2764616383772655, 0.0..."
4,kv1w9e,9,Press Releases/News,7,0.858623,0.133042,"[0.0010417494059179412, 0.0010418943881130608,..."
...,...,...,...,...,...,...,...
94034,rt0gj1,0,Crime/Nefarious Use of Stock Market,6,0.647157,0.199466,"[0.6471571631425317, 0.0006623205170025277, 0...."
94035,rt21tk,3,Data Posts/Nonsense,6,0.687784,0.288683,"[0.0029418353213479316, 0.0029413130444606744,..."
94036,rt3e78,6,"Complaints/Comments of other users (shills, tr...",0,0.698473,0.266736,"[0.26673640358182354, 0.0008197966524948154, 0..."
94037,rt4thl,7,"Diamond Hands, Apes Vs. Hedge Funds",9,0.443612,0.415039,"[0.0037051141132133664, 0.0037045295082803703,..."


In [184]:
df_train = df.iloc[:91041].dropna(how = 'all')
df_test = df.iloc[91041:].dropna(how = 'all')
df_train.to_csv('df_train_split.csv', index = False)
df_test.to_csv('df_test_split.csv', index = False)

In [186]:
df_test

Unnamed: 0,id,prime_topic,topic,sec_topic,prime_score,sec_score,topic_scores
91041,pywt8e,2,Trading/Shorting GME,5,0.721305,0.123256,"[0.10822381089913219, 0.0009804200531817325, 0..."
91042,pyxdyt,5,Trading Accounts,7,0.639807,0.326857,"[0.004167091076040238, 0.004166711725163904, 0..."
91043,pyxl8w,5,Trading Accounts,7,0.692198,0.275678,"[0.0016668891591123578, 0.0016666951705405835,..."
91044,pyxssq,8,Short Squeeze by Holding Stock,4,0.263467,0.253595,"[0.05847337880283936, 0.0008622849512094111, 0..."
91045,pyxw79,0,Crime/Nefarious Use of Stock Market,6,0.530334,0.431566,"[0.5303336975567461, 0.00476223966496509, 0.00..."
...,...,...,...,...,...,...,...
94034,rt0gj1,0,Crime/Nefarious Use of Stock Market,6,0.647157,0.199466,"[0.6471571631425317, 0.0006623205170025277, 0...."
94035,rt21tk,3,Data Posts/Nonsense,6,0.687784,0.288683,"[0.0029418353213479316, 0.0029413130444606744,..."
94036,rt3e78,6,"Complaints/Comments of other users (shills, tr...",0,0.698473,0.266736,"[0.26673640358182354, 0.0008197966524948154, 0..."
94037,rt4thl,7,"Diamond Hands, Apes Vs. Hedge Funds",9,0.443612,0.415039,"[0.0037051141132133664, 0.0037045295082803703,..."


In [148]:
topics

{0: 'Crime/Nefarious Use of Stock Market',
 2: 'Trading/Shorting GME',
 7: 'Diamond Hands, Apes Vs. Hedge Funds ',
 6: 'Complaints/Comments of other users (shills, trolls, etc.)',
 5: 'Trading Accounts',
 4: 'MOD Posts, Many Links',
 3: 'Data Posts/Nonsense',
 1: 'New Post Links, Other Links',
 8: 'Short Squeeze by Holding Stock'}

In [125]:
topic_dict

{0: 'Trading Accounts',
 2: 'Reddit MOD Announcement',
 3: "Opinions about Government, 'shills', troll posts, etc.",
 4: 'Shorting and Sharing Financial Data',
 5: 'General Advice/Questions about Stocks',
 6: 'Social Media Links (Many Reposts)',
 7: 'Posts of Memes/Images',
 8: 'News and Earnings Reports',
 9: 'Trading Rules and Regulations',
 1: "'Diamond Hands', Buy and Hold"}