## Gathering the Data
The first step is to gather a large amount of data and to store it in a pandas dataframe.

In [130]:
import pandas as pd
import praw
import secrets
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


In [5]:
user_agent = "Subreddit-Predictor 0.1 by /u/IsThisATrollBot"

reddit = praw.Reddit(
    client_id=secrets.client_ID,
    client_secret=secrets.client_secret,
    password=secrets.password,
    user_agent=user_agent,
    username=secrets.username,
)

Because pushshift is down, we are limited to the amount of data we can gather at a time. So we will choose posts from the 10 most popular subreddits.

In [6]:
# Start with a list of subreddits
top_subreddits = ['announcements', 'funny', 'AskReddit', 'gaming', 'Awww', 'Music', 'pics', 'science', 'worldnews', 'videos', 'AmItheAsshole']

In [7]:
# Create an empty list to store the posts
posts = []

# Iterate through the subreddits and get the last 1000 posts from each
for sub in top_subreddits:
    subreddit_posts = reddit.subreddit(sub).new(limit=1000)
    for post in subreddit_posts:
        posts.append(post)

In [95]:
# Create a list of dictionaries containing the data for each post
data = [{'id': post.id, 'title': post.title, 'subreddit': post.subreddit.display_name} for post in posts]

# Create a Pandas dataframe from the list of dictionaries
df = pd.DataFrame(data)


In [273]:
test_titles = ['Redditors of Reddit. What is your favorite piece of Reddit history?', 'WIBTA if I stole my younger brothers lunch money?', 'check out this cool video I found', 'asdf', 'cats are dangerous', 'new study shows cats are dangerous', 'reddit cool aita']
test_titles = pd.DataFrame({'title':test_titles})


In [349]:
class Subreddit_Predictor:
    def __init__(self):
        self.raw_data = pd.DataFrame({'id':[], 'title':[], 'subreddit':[]})
        self.subreddits = []
        self.data = pd.DataFrame({'id':[], 'title':[], 'subreddit':[]})
        self.Feature_Vectors = {}
        self.Embedding = {}
        self.Title_Vectorizers = {}

    def add_data(self, df):
        """df is a pandas DataFrame with columns={'title':[], 'subreddit':[]}. It will be merged with the existing raw_data"""
        self.raw_data = pd.concat([self.raw_data, df]).drop_duplicates(subset='id')

    def clean_data(self):
        """Cleans the data in raw_data and updates self.data"""

        df = self.raw_data

        # Remove all non-alpha-numeric characters
        df['title'] = df['title'].str.replace(r'[^a-zA-Z0-9 ]', '', regex = True)

        # Make all the text lowercase
        df['title'] = df['title'].str.lower()

        # Remove empty rows
        df['title'] = df['title'].str.strip()
        filter = df['title'] == ''
        df = df.drop(df[filter].index)

        # Store it as
        self.data = df

        #update the subreddits attribute
        self.subreddits = self.data['subreddit'].unique().tolist()

    def ready_data(self, test_size = .2, seed = 42):
        """Splits and encodes the data. Saves is in X_train, Y_train, X_test, Y_test."""

        # Change the index
        self.data = self.data.set_index('id')

        # Encode the subreddits
        self._le = LabelEncoder()
        self.data['subreddit_num'] = self._le.fit_transform(self.data['subreddit'])

        # Split the data
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(self.data['title'], self._le.fit_transform(self.data['subreddit']), test_size=test_size, random_state = seed)

    def add_title_vectorizer(self, title_vectorizer):
        """This is how we add a title_vectorizer to our collection"""
        title_vectorizer.train(self.X_train)
        self.Title_Vectorizers[title_vectorizer.featureName] = title_vectorizer
        self.Feature_Vectors[title_vectorizer.featureName] = title_vectorizer.vectorize(self.X_train)


    def generate_features(self, featureName):
        """Generates the features using the different methods we have created"""

        if featureName == 'BoW':
            self.Embedding['BoW'] = CountVectorizer()
            self.Features['BoW'] = self.Embedding['BoW'].fit_transform(self.X_train)

        if featureName == 'D2V':

            # Create a list of TaggedDocument objects from the titles
            X_train_tagged = self.X_train.tolist()
            X_train_tagged = [TaggedDocument(words=title.split(), tags=[str(i)]) for i, title in enumerate(X_train_tagged)]
            X_test_tagged = self.X_test.tolist()
            X_test_tagged = [TaggedDocument(words=title.split(), tags=[str(i)]) for i, title in enumerate(X_test_tagged)]

            model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
            model_dbow.build_vocab(X_train_tagged)

            # Train the model
            model_dbow.train(X_train_tagged, total_examples=model_dbow.corpus_count, epochs=100)

            # Get the vectorized titles from the doc2vec model
            vectors = [model_dbow.infer_vector(title.split()) for title in X_train.tolist()]

            # Add the vectors to the dataframe as a new column
            df_new = pd.DataFrame({'title':X_train, 'vector': vectors})
            df_new

    def vectorize(self, featureName, x):
        """Turns a sentence or list of sentences into a feature vectors"""

        if type(x) == str: return self.vectorize(featureName, [x])

        else:
            if featureName == 'BoW':
                return self.Embedding['BoW'].transform(x).toarray()



In [350]:
class Title_Vectorizer:
    """This class is to hold all of the Title Vectorizers, like Bag-of-Words and Doc2Vec. Each vectorizer is a specific object. The class methods all have the same input/output."""
    def __init__(self, featureName):
        self.featureName = featureName
        self.description = "Description goes here"

    def train(self, X_train):
        """Inputs the training data. Creates the self.model"""

        self.model = self._train(X_train)

    def _train(self, X_train):
        """Just a place holder for the actual function"""
        #pass

    def vectorize(self, df_titles):
        """Given a data frame or series with only titles, will return a df of all of the features, indexed by id. The actual function will be added to each object."""

        return self._vectorize(df_titles, self.model)

    def _vectorize(self, df_titles, model):
        """Just a place holder for the actual function."""
        #pass



In [351]:
BoW_model = Title_Vectorizer('BoW')

def _BoW_vectorize(df_titles, model):
    """I think I need to drop every word that's not in the vocabulary."""

    if type(df_titles) == pd.core.frame.DataFrame:
        titles = df_titles['title']
    else:
        titles = df_titles

    vocab = model.vocabulary_

    titles = titles.apply(lambda s: ' '.join(set(s.split()).intersection(vocab)))
    temp = model.transform(titles)
    temp = temp.toarray()
    temp = pd.DataFrame(temp)
    temp['id'] =df_titles.index
    temp = temp.set_index('id')
    return temp

def _BoW_train(X_train):
    model = CountVectorizer()
    model.fit_transform(X_train)
    return model

BoW_model._vectorize = _BoW_vectorize
BoW_model._train = _BoW_train

BoW_model.train(obj.X_train)

In [353]:
BoW_model = Title_Vectorizer('BoW')
BoW_model._vectorize = _BoW_vectorize
BoW_model._train = _BoW_train
#BoW_model.train(obj.X_train)
#BoW_model.vectorize(obj.X_train)


In [330]:
#_BoW_train(obj.X_train)
BoW_model._train = _BoW_train
BoW_model.train(obj.X_train)
type(BoW_model.model)

sklearn.feature_extraction.text.CountVectorizer

In [307]:
type(BoW_model.model)

NoneType

In [283]:
BoW_model.model.transform(list(test_titles['title'])).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [203]:
import pandas as pd

# Create a sample pandas series
s = pd.Series(['I love dogs', 'I hate cats', 'I like turtles'])

# Create a vocabulary
vocab = ['I', 'love', 'hate', 'like']

# Remove words from the sentences that are not in the vocabulary
filtered_s = s.apply(lambda x: ' '.join([word for word in x.split() if word in vocab]))

# Print the filtered series
print(filtered_s)
# Remove words from the sentences that are not in the vocabulary
filtered_s = s.apply(lambda x: ' '.join(set(x.split()).intersection(vocab)))

# Print the filtered series
print(filtered_s)


0    I love
1    I hate
2    I like
dtype: object
0    I love
1    I hate
2    I like
dtype: object


In [190]:
x = CountVectorizer()
x.fit_transform(obj.X_train)
vocab = x.vocabulary_
'im' in vocab

True

In [277]:
test_titles

Unnamed: 0,title
0,Redditors of Reddit. What is your favorite pie...
1,WIBTA if I stole my younger brothers lunch money?
2,check out this cool video I found
3,asdf
4,cats are dangerous
5,new study shows cats are dangerous
6,reddit cool aita


In [189]:
BoW_model.vectorize(pd.DataFrame({'title':test_titles}))

AttributeError: 'Series' object has no attribute 'split'

In [157]:
pd.DataFrame(x, obj.X_train.index).info()

<class 'pandas.core.frame.DataFrame'>
Index: 6477 entries, zmng91 to zodvmd
Columns: 13136 entries, 0 to 13135
dtypes: int64(13136)
memory usage: 649.4+ MB


In [149]:
pd.DataFrame({'title':obj.X_train, 'vector': x})

ValueError: Per-column arrays must each be 1-dimensional

In [354]:
obj = Subreddit_Predictor()
obj.add_data(df)
obj.clean_data()
obj.ready_data(test_size=.3, seed=29)
obj.add_title_vectorizer(BoW_model)


In [357]:
obj.Feature_Vectors['BoW']

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,13126,13127,13128,13129,13130,13131,13132,13133,13134,13135
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
zmng91,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zrve0c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zppddb,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
z4m4c6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zeb9r7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zo36za,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zog02f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zophno,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
z7sghp,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [159]:
def foo(x):
    print ('hello',x)

obj.fun = foo

obj.fun(2)

hello 2


In [9]:




# Convert the labels to numerical values
le = LabelEncoder()
df['subreddit_num'] = le.fit_transform(df['subreddit'])

df = df.drop(columns=['subreddit'])

#df['subreddit'] = le.inverse_transform(df['subreddit_num'])

df


Unnamed: 0,id,title,subreddit
0,t93ec3,This subreddit is closed for new posts and com...,announcements
1,pg006s,COVID denialism and policy clarifications,announcements
2,pbmy5y,"Debate, dissent, and protest on Reddit",announcements
3,nw2hs6,Sunsetting Secret Santa and Reddit Gifts,announcements
4,mi01fg,Second,announcements
...,...,...,...
9261,zq0n2b,WIBTA For Exposing My Dad to My Mom?,AmItheAsshole
9262,zq0kzb,AITA for trying to rescue/take home/whatever a...,AmItheAsshole
9263,zq0kv9,AITA for not wanting to gift hotel soaps for C...,AmItheAsshole
9264,zq0k55,AITA for walking my dog on my own street?,AmItheAsshole


In [65]:
df_new = pd.DataFrame({'id':['pg006s'], 'title':[a], 'subreddit':['announcements']}).set_index('id')

In [66]:
pd.concat([df_new, df]).drop_duplicates(keep = False)

Unnamed: 0_level_0,title,subreddit
id,Unnamed: 1_level_1,Unnamed: 2_level_1
pg006s,COVID denialism and policy clarifications,announcements
pbmy5y,"Debate, dissent, and protest on Reddit",announcements
nw2hs6,Sunsetting Secret Santa and Reddit Gifts,announcements
mi01fg,Second,announcements
mcisdf,An update on the recent issues surrounding a R...,announcements
...,...,...
zq0n2b,WIBTA For Exposing My Dad to My Mom?,AmItheAsshole
zq0kzb,AITA for trying to rescue/take home/whatever a...,AmItheAsshole
zq0kv9,AITA for not wanting to gift hotel soaps for C...,AmItheAsshole
zq0k55,AITA for walking my dog on my own street?,AmItheAsshole


In [71]:
df.drop_duplicates(keep = 'first')

Unnamed: 0_level_0,title,subreddit
id,Unnamed: 1_level_1,Unnamed: 2_level_1
t93ec3,This subreddit is closed for new posts and com...,announcements
pg006s,COVID denialism and policy clarifications,announcements
pbmy5y,"Debate, dissent, and protest on Reddit",announcements
nw2hs6,Sunsetting Secret Santa and Reddit Gifts,announcements
mi01fg,Second,announcements
...,...,...
zq0n2b,WIBTA For Exposing My Dad to My Mom?,AmItheAsshole
zq0kzb,AITA for trying to rescue/take home/whatever a...,AmItheAsshole
zq0kv9,AITA for not wanting to gift hotel soaps for C...,AmItheAsshole
zq0k55,AITA for walking my dog on my own street?,AmItheAsshole


In [73]:
import pandas as pd

# Create a sample DataFrame
df = pd.DataFrame({'A': [1, 2, 2, 3, 3], 'B': [4, 5, 5, 6, 6], 'C': [7, 8, 8, 9, 9]})

# Find duplicate rows
duplicate_rows = df[df.duplicated()]

# Print the duplicate rows
print(duplicate_rows)


   A  B  C
2  2  5  8
4  3  6  9


In [78]:
df[df.duplicated()]

Unnamed: 0_level_0,title,subreddit
id,Unnamed: 1_level_1,Unnamed: 2_level_1
c0gl6,"We are aware that reddit appears hung over, an...",announcements
zsrwxs,What made you want to have kids?,AskReddit
zspw41,What do you want for Christmas?,AskReddit
zsppnr,What made you want to have kids?,AskReddit
zspbbv,What do you want for Christmas?,AskReddit
zsp834,What made you want to have kids?,AskReddit
zso4w7,What is on your Christmas wishlist?,AskReddit
zso42r,People who have their desserts before their ma...,AskReddit
zshreq,does crashing and desabling gpu driver means t...,gaming
zs6y9p,Kingdoms of Amalur: Re-Reckoning worth $12?,gaming
