## Gathering the Data
The first step is to gather a large amount of data and to store it in a pandas dataframe.

In [398]:
import pandas as pd
import praw
import secrets
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.svm import SVC


In [5]:
user_agent = "Subreddit-Predictor 0.1 by /u/IsThisATrollBot"

reddit = praw.Reddit(
    client_id=secrets.client_ID,
    client_secret=secrets.client_secret,
    password=secrets.password,
    user_agent=user_agent,
    username=secrets.username,
)

Because pushshift is down, we are limited to the amount of data we can gather at a time. So we will choose posts from the 10 most popular subreddits.

In [6]:
# Start with a list of subreddits
top_subreddits = ['announcements', 'funny', 'AskReddit', 'gaming', 'Awww', 'Music', 'pics', 'science', 'worldnews', 'videos', 'AmItheAsshole']

In [7]:
# Create an empty list to store the posts
posts = []

# Iterate through the subreddits and get the last 1000 posts from each
for sub in top_subreddits:
    subreddit_posts = reddit.subreddit(sub).new(limit=1000)
    for post in subreddit_posts:
        posts.append(post)

In [95]:
# Create a list of dictionaries containing the data for each post
data = [{'id': post.id, 'title': post.title, 'subreddit': post.subreddit.display_name} for post in posts]

# Create a Pandas dataframe from the list of dictionaries
df = pd.DataFrame(data)


In [273]:
test_titles = ['Redditors of Reddit. What is your favorite piece of Reddit history?', 'WIBTA if I stole my younger brothers lunch money?', 'check out this cool video I found', 'asdf', 'cats are dangerous', 'new study shows cats are dangerous', 'reddit cool aita']
test_titles = pd.DataFrame({'title':test_titles})


# Main Subreddit Predictor Class

This will have as attributes the Feature Vectorizers and the Classifiers, which themselves are objects of other classes.

In [439]:
class Subreddit_Predictor:
    def __init__(self):
        self.raw_data = pd.DataFrame({'id':[], 'title':[], 'subreddit':[]})
        self.subreddits = []
        self.data = pd.DataFrame({'id':[], 'title':[], 'subreddit':[]})
        self.Feature_Vectors = {}
        self.Embedding = {}
        self.Title_Vectorizers = {}
        self.Classifiers = {}
        self.Models = {}
        self.Models_info = {}

    def add_data(self, df):
        """df is a pandas DataFrame with columns={'title':[], 'subreddit':[]}. It will be merged with the existing raw_data"""
        self.raw_data = pd.concat([self.raw_data, df]).drop_duplicates(subset='id')

    def clean_data(self):
        """Cleans the data in raw_data and updates self.data"""

        df = self.raw_data

        # Remove all non-alpha-numeric characters
        df['title'] = df['title'].str.replace(r'[^a-zA-Z0-9 ]', '', regex = True)

        # Make all the text lowercase
        df['title'] = df['title'].str.lower()

        # Remove empty rows
        df['title'] = df['title'].str.strip()
        filter = df['title'] == ''
        df = df.drop(df[filter].index)

        # Store it as
        self.data = df

        #update the subreddits attribute
        self.subreddits = self.data['subreddit'].unique().tolist()

    def ready_data(self, test_size = .2, seed = 42):
        """Splits and encodes the data. Saves is in X_train, Y_train, X_test, Y_test."""

        # Change the index
        self.data = self.data.set_index('id')

        # Encode the subreddits
        self._le = LabelEncoder()
        self.data['subreddit_num'] = self._le.fit_transform(self.data['subreddit'])

        # Split the data
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(self.data['title'], self._le.fit_transform(self.data['subreddit']), test_size=test_size, random_state = seed)

    def add_title_vectorizer(self, title_vectorizer):
        """This is how we add a title_vectorizer to our collection"""
        title_vectorizer.train(self.X_train)
        self.Title_Vectorizers[title_vectorizer.featureName] = title_vectorizer
        self.Feature_Vectors[title_vectorizer.featureName] = title_vectorizer.vectorize(self.X_train)

    def add_classifier(self, classifier):
        """We add the classifier to our collection, self.Classifiers"""
        self.Classifiers[classifier.classifierName] = classifier

    def train_model(self, modelName, featureName, classifierName, description = ''):
        """
        :param modelName: The name of this model
        :param featureName: Which feature vectors are we using?
        :param classifierName: Which classifier are we using?
        :param description: Write a short discription of the model (optional).
        :return: Adds a trained object of the classifier class to self.Models
        """

        self.Models_info[modelName] = {'featureName':featureName, 'classifierName':classifierName, 'description':description}

        X_train = self.Feature_Vectors[featureName]
        Y_train = self.Y_train
        classifier = self.Classifiers[classifierName]
        classifier.train(X_train, Y_train)

        self.Models[modelName] = classifier


    def predictions(self, modelName, titles):
        """
        :param modelName: Which model are we using?
        :param titles: A list or series of titles
        :return: A data frame of 'title' and 'prediction'
        """

        model = self.Models[modelName]

        featureName = self.Models_info[modelName]['featureName']
        vectorizer = self.Title_Vectorizers[featureName]

        title_vectors = vectorizer.vectorize(titles)

        df = model.predict(title_vectors)
        #df['prediction'] = self._le.inverse_transform(df['prediction'])

        return df




    def generate_features(self, featureName):
        """Generates the features using the different methods we have created"""

        if featureName == 'BoW':
            self.Embedding['BoW'] = CountVectorizer()
            self.Features['BoW'] = self.Embedding['BoW'].fit_transform(self.X_train)

        if featureName == 'D2V':

            # Create a list of TaggedDocument objects from the titles
            X_train_tagged = self.X_train.tolist()
            X_train_tagged = [TaggedDocument(words=title.split(), tags=[str(i)]) for i, title in enumerate(X_train_tagged)]
            X_test_tagged = self.X_test.tolist()
            X_test_tagged = [TaggedDocument(words=title.split(), tags=[str(i)]) for i, title in enumerate(X_test_tagged)]

            model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
            model_dbow.build_vocab(X_train_tagged)

            # Train the model
            model_dbow.train(X_train_tagged, total_examples=model_dbow.corpus_count, epochs=100)

            # Get the vectorized titles from the doc2vec model
            vectors = [model_dbow.infer_vector(title.split()) for title in X_train.tolist()]

            # Add the vectors to the dataframe as a new column
            df_new = pd.DataFrame({'title':X_train, 'vector': vectors})
            df_new

    def vectorize(self, featureName, x):
        """Turns a sentence or list of sentences into a feature vectors"""

        if type(x) == str: return self.vectorize(featureName, [x])

        else:
            if featureName == 'BoW':
                return self.Embedding['BoW'].transform(x).toarray()



Example

In [440]:
obj = Subreddit_Predictor()
obj.add_data(df)
obj.clean_data()
obj.ready_data(test_size=.3, seed=29)

In [441]:
obj.Models_info

{}

In [442]:
obj.predictions('BoW+SVM', test_titles)

KeyError: 'BoW+SVM'

# Title Vectorizer Class

This will have all of the different vectorizers. All of the different ways to embed titles.
A key feature of this class is that there are functions which need to be added later.

In [443]:
class Title_Vectorizer:
    """This class is to hold all of the Title Vectorizers, like Bag-of-Words and Doc2Vec. Each vectorizer is a specific object. The class methods all have the same input/output."""
    def __init__(self, featureName):
        self.featureName = featureName
        self.description = "Description goes here"

    def train(self, X_train):
        """Inputs the training data. Creates the self.model"""

        self.model = self._train(X_train)

    def _train(self, X_train):
        """Just a place holder for the actual function"""
        #pass

    def vectorize(self, df_titles):
        """Given a data frame or series with only titles, will return a df of all of the features, indexed by id. The actual function will be added to each object."""

        return self._vectorize(df_titles, self.model)

    def _vectorize(self, df_titles, model):
        """Just a place holder for the actual function."""
        #pass



### Example: Bag-of-Words

In [444]:
BoW_model = Title_Vectorizer('BoW')

def _BoW_vectorize(df_titles, model):
    """I think I need to drop every word that's not in the vocabulary."""

    if type(df_titles) == pd.core.frame.DataFrame:
        titles = df_titles['title']
    else:
        titles = df_titles

    vocab = model.vocabulary_

    titles = titles.apply(lambda s: ' '.join(set(s.split()).intersection(vocab)))
    temp = model.transform(titles)
    temp = temp.toarray()
    temp = pd.DataFrame(temp)
    temp['id'] =df_titles.index
    temp = temp.set_index('id')
    return temp

def _BoW_train(X_train):
    model = CountVectorizer()
    model.fit_transform(X_train)
    return model

BoW_model._vectorize = _BoW_vectorize
BoW_model._train = _BoW_train

obj.add_title_vectorizer(BoW_model)

In [445]:
BoW_model.vectorize(test_titles)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,13126,13127,13128,13129,13130,13131,13132,13133,13134,13135
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Example: Doc2Vec

In [430]:
D2V_model = Title_Vectorizer('D2V')
#D2V_model.params = {'dm':0, 'vector_size':300, 'negative':5, 'hs':0, 'min_count':2, 'sample':0, 'epochs':100}

def _D2V_train(X_train):

    X_train_tagged = X_train.tolist()
    X_train_tagged = [TaggedDocument(words=title.split(), tags=[str(i)]) for i, title in enumerate(X_train_tagged)]

    model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0)
    model_dbow.build_vocab(X_train_tagged)

    # Train the model
    model_dbow.train(X_train_tagged, total_examples=model_dbow.corpus_count, epochs=100)

    return model_dbow

def _D2V_vectorize(df_titles, model):

    vectors = [model.infer_vector(titl.split()) for titl in df_titles.tolist()]
    df_new = pd.DataFrame({'title':df_titles, 'vector': vectors})
    df_new =df_new['vector'].apply(lambda x: pd.Series(x))

    return df_new

D2V_model._train = _D2V_train
D2V_model._vectorize = _D2V_vectorize

obj.add_title_vectorizer(D2V_model)

# Classifiers

This is the class that holds the classifiers, like XGBoost and Support Vector Machines

In [446]:
class classifier:
    """This is the class the holds the classifiers"""

    def __init__(self, classifierName):
        self.classifierName = classifierName

    def train(self, X_train, Y_train):
        """Input the X and Y training data. Then update the model"""

        self.model = self._train(X_train, Y_train)

    def _train(self, X_train, Y_train):
        """Where the real function is stored"""
        pass

    def predict(self, title_vectors):
        """
        :param title_vectors: A pandas dataframe of the vectorized titles
        :return: A pandas series with the predictions
        """

        return self._predict(title_vectors, self.model)

    def _predict(self, titles, model):
        """where the actual function is stored"""
        pass


### Example: Support Vector Machine

In [447]:
SVM_model = classifier('SVM')

def _SVM_train(X_train, Y_train):
    model = SVC()
    model.fit(X_train, Y_train)
    return model

def _SVM_predict(title_vectors, model):
    """enter a list or series or data frame of titles. Outputs prediction in a dataframe"""

    df = model.predict(title_vectors)
    print(df)
    return df

SVM_model._train = _SVM_train
SVM_model._predict = _SVM_predict

In [448]:
obj.add_classifier(SVM_model)

In [449]:
obj.train_model('BoW+SVM', 'BoW', 'SVM', description= 'Just a quick test')

In [452]:
obj.predictions('BoW+SVM', obj.X_test)

[10  2  1 ...  0  2  5]


array([10,  2,  1, ...,  0,  2,  5])

In [436]:
test_titles

Unnamed: 0,title
0,Redditors of Reddit. What is your favorite pie...
1,WIBTA if I stole my younger brothers lunch money?
2,check out this cool video I found
3,asdf
4,cats are dangerous
5,new study shows cats are dangerous
6,reddit cool aita


In [None]:

# Test the model on some new data
new_titles = ['Redditors of Reddit. What is your favorite piece of Reddit history?', 'WIBTA if I stole my younger brothers lunch money?', 'check out this cool video I found', 'asdf', 'cats are dangerous', 'new study shows cats are dangerous']
new_vectors = Embedding[featureName].transform(new_titles)

new_predictions = Models[(featureName, classifierName)].predict(new_vectors)

output = pd.DataFrame({'title': new_titles, 'Prediction':new_predictions})
output['Prediction'] = le.inverse_transform(output['Prediction'])
output



Models[(featureName, classifierName)] = SVC()
Models[(featureName, classifierName)].fit(Features[featureName], Y_train)

In [395]:
D2V_model.vectorize(obj.X_train)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
zmng91,0.124582,0.243396,-0.285233,-0.292657,-0.014337,0.445207,-0.584179,0.101635,0.353450,-0.048465,...,-0.243792,-0.224315,-0.410018,-0.119891,0.138223,0.247556,-0.099369,0.422833,-0.377174,-0.098506
zrve0c,-0.004741,-0.048354,-0.019651,0.165643,0.224330,-0.055640,0.132605,0.051793,0.226556,0.173038,...,-0.269658,0.125007,0.144099,-0.173289,0.147958,0.399452,-0.271192,0.050419,-0.114187,-0.187126
zppddb,-0.179380,-0.124213,-0.161055,-0.325073,0.329091,0.191743,-0.130025,-0.047308,0.119915,-0.075613,...,0.136019,-0.035714,0.208921,0.114411,0.384664,0.022161,-0.138315,-0.147456,-0.063236,-0.258442
z4m4c6,0.250896,0.154340,-0.385988,-0.629014,-0.081034,0.143091,-0.314824,0.227359,0.227642,0.256662,...,-0.277127,-0.193340,0.011218,-0.225516,-0.167319,0.141716,-0.445301,0.008907,0.006306,0.060325
zeb9r7,-0.146872,-0.105383,-0.286352,-0.282161,-0.200312,0.248168,0.129567,0.261246,0.109793,-0.123065,...,-0.065415,-0.211845,-0.109918,-0.269352,0.092641,-0.232515,-0.216349,-0.031634,-0.097747,0.265720
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zo36za,-0.155573,-0.220176,-0.221153,-0.198772,0.298348,0.069892,-0.002548,0.319032,0.114219,-0.163150,...,-0.032943,-0.141069,0.084034,-0.089177,0.038817,0.289343,-0.071765,-0.141827,-0.135069,-0.143151
zog02f,-0.059888,0.015476,-0.141179,-0.296559,0.308300,0.254743,-0.255748,-0.029594,-0.051015,0.067890,...,-0.001212,-0.005342,-0.233871,-0.124089,0.124338,0.284089,-0.244712,0.168567,-0.076076,-0.086543
zophno,0.183705,-0.059724,-0.451903,-0.299384,0.058367,-0.039836,0.067061,0.238934,0.267701,-0.052739,...,0.139718,0.125756,0.168113,0.080182,0.112720,0.434177,-0.110954,-0.192290,0.096343,0.133488
z7sghp,-0.144103,-0.134290,-0.347879,-0.160396,-0.153021,0.074020,-0.274817,-0.007368,0.148038,-0.135822,...,-0.145126,0.029284,-0.306107,0.214905,-0.032107,0.199594,-0.493793,-0.173641,-0.092477,0.060075


In [396]:
D2V_model.vectorize(obj.X_test)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
zneu4t,0.024756,0.017508,-0.245544,-0.079161,0.141099,0.425426,-0.383731,0.320392,0.369929,-0.004070,...,-0.278416,-0.357967,-0.324445,-0.079764,0.312804,0.166880,-0.082328,0.328065,-0.143884,-0.183912
zqdj2z,-0.000625,0.000360,0.001490,-0.001408,0.000112,-0.000787,0.000359,-0.000465,0.000478,-0.000510,...,-0.001632,-0.001583,0.000785,0.001610,-0.001610,0.000617,-0.000850,-0.000598,-0.000673,0.000851
zsoaht,-0.273645,-0.228289,-0.279748,-0.255452,0.641437,0.180611,-0.225046,-0.225095,-0.011575,0.265786,...,0.044403,0.123269,-0.026987,-0.031384,-0.008669,0.659682,0.065079,0.043444,-0.239513,-0.131342
zpb6mm,-0.316552,-0.127628,-0.134513,0.232185,0.249365,0.035668,-0.225528,-0.025776,0.085983,-0.002724,...,-0.088556,0.144621,-0.151302,0.352720,0.141440,0.626190,0.123271,0.179211,-0.457471,-0.166082
zspweu,-0.124831,0.069495,-0.500268,-0.328436,0.222820,-0.153912,-0.281175,0.128660,-0.124410,-0.141062,...,-0.376867,-0.108806,-0.102207,-0.082202,0.053004,0.440408,-0.040838,0.061226,-0.162835,-0.175296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zqh63h,0.090006,0.078147,-0.342505,-0.209351,0.077070,0.176495,-0.168447,0.053810,0.359074,-0.024932,...,0.021485,-0.013352,-0.016688,-0.125820,0.134182,-0.036878,-0.171178,0.098551,0.019160,-0.018199
zozmfd,0.446083,0.108086,-0.116243,-0.301169,0.115406,0.013130,-0.124489,0.038892,0.269257,0.020708,...,-0.063685,-0.211680,0.072351,-0.124213,0.098303,0.031621,-0.323328,0.284245,0.051450,-0.088354
zsmn5w,-0.018422,-0.173283,0.053598,0.025485,-0.053987,0.163447,-0.030172,-0.011530,0.143444,0.160024,...,-0.056186,-0.160709,0.140689,-0.118233,0.107463,0.405808,-0.052695,0.201463,-0.417349,-0.383937
yta4a2,0.045882,0.158547,-0.323433,0.114190,-0.055485,0.081891,-0.430319,-0.038523,0.080145,-0.146721,...,-0.022787,0.102703,-0.125879,-0.235870,0.015697,0.134702,0.041802,-0.008966,-0.146478,-0.150347


In [375]:

D2V_model.vectorize(test_titles['title'])

TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"

In [None]:
           # Create a list of TaggedDocument objects from the titles
X_train_tagged = self.X_train.tolist()
X_train_tagged = [TaggedDocument(words=title.split(), tags=[str(i)]) for i, title in enumerate(X_train_tagged)]
X_test_tagged = self.X_test.tolist()
X_test_tagged = [TaggedDocument(words=title.split(), tags=[str(i)]) for i, title in enumerate(X_test_tagged)]

model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0)
model_dbow.build_vocab(X_train_tagged)

# Train the model
model_dbow.train(X_train_tagged, total_examples=model_dbow.corpus_count, epochs=100)

# Get the vectorized titles from the doc2vec model
vectors = [model_dbow.infer_vector(title.split()) for title in X_train.tolist()]

# Add the vectors to the dataframe as a new column
df_new = pd.DataFrame({'title':X_train, 'vector': vectors})
df_new

In [353]:
BoW_model = Title_Vectorizer('BoW')
BoW_model._vectorize = _BoW_vectorize
BoW_model._train = _BoW_train
#BoW_model.train(obj.X_train)
#BoW_model.vectorize(obj.X_train)


In [330]:
#_BoW_train(obj.X_train)
BoW_model._train = _BoW_train
BoW_model.train(obj.X_train)
type(BoW_model.model)

sklearn.feature_extraction.text.CountVectorizer

In [307]:
type(BoW_model.model)

NoneType

In [283]:
BoW_model.model.transform(list(test_titles['title'])).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [203]:
import pandas as pd

# Create a sample pandas series
s = pd.Series(['I love dogs', 'I hate cats', 'I like turtles'])

# Create a vocabulary
vocab = ['I', 'love', 'hate', 'like']

# Remove words from the sentences that are not in the vocabulary
filtered_s = s.apply(lambda x: ' '.join([word for word in x.split() if word in vocab]))

# Print the filtered series
print(filtered_s)
# Remove words from the sentences that are not in the vocabulary
filtered_s = s.apply(lambda x: ' '.join(set(x.split()).intersection(vocab)))

# Print the filtered series
print(filtered_s)


0    I love
1    I hate
2    I like
dtype: object
0    I love
1    I hate
2    I like
dtype: object


In [190]:
x = CountVectorizer()
x.fit_transform(obj.X_train)
vocab = x.vocabulary_
'im' in vocab

True

In [277]:
test_titles

Unnamed: 0,title
0,Redditors of Reddit. What is your favorite pie...
1,WIBTA if I stole my younger brothers lunch money?
2,check out this cool video I found
3,asdf
4,cats are dangerous
5,new study shows cats are dangerous
6,reddit cool aita


In [189]:
BoW_model.vectorize(pd.DataFrame({'title':test_titles}))

AttributeError: 'Series' object has no attribute 'split'

In [157]:
pd.DataFrame(x, obj.X_train.index).info()

<class 'pandas.core.frame.DataFrame'>
Index: 6477 entries, zmng91 to zodvmd
Columns: 13136 entries, 0 to 13135
dtypes: int64(13136)
memory usage: 649.4+ MB


In [149]:
pd.DataFrame({'title':obj.X_train, 'vector': x})

ValueError: Per-column arrays must each be 1-dimensional

In [354]:
obj = Subreddit_Predictor()
obj.add_data(df)
obj.clean_data()
obj.ready_data(test_size=.3, seed=29)
obj.add_title_vectorizer(BoW_model)


In [357]:
obj.Feature_Vectors['BoW']

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,13126,13127,13128,13129,13130,13131,13132,13133,13134,13135
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
zmng91,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zrve0c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zppddb,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
z4m4c6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zeb9r7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zo36za,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zog02f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zophno,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
z7sghp,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [159]:
def foo(x):
    print ('hello',x)

obj.fun = foo

obj.fun(2)

hello 2


In [9]:




# Convert the labels to numerical values
le = LabelEncoder()
df['subreddit_num'] = le.fit_transform(df['subreddit'])

df = df.drop(columns=['subreddit'])

#df['subreddit'] = le.inverse_transform(df['subreddit_num'])

df


Unnamed: 0,id,title,subreddit
0,t93ec3,This subreddit is closed for new posts and com...,announcements
1,pg006s,COVID denialism and policy clarifications,announcements
2,pbmy5y,"Debate, dissent, and protest on Reddit",announcements
3,nw2hs6,Sunsetting Secret Santa and Reddit Gifts,announcements
4,mi01fg,Second,announcements
...,...,...,...
9261,zq0n2b,WIBTA For Exposing My Dad to My Mom?,AmItheAsshole
9262,zq0kzb,AITA for trying to rescue/take home/whatever a...,AmItheAsshole
9263,zq0kv9,AITA for not wanting to gift hotel soaps for C...,AmItheAsshole
9264,zq0k55,AITA for walking my dog on my own street?,AmItheAsshole


In [65]:
df_new = pd.DataFrame({'id':['pg006s'], 'title':[a], 'subreddit':['announcements']}).set_index('id')

In [66]:
pd.concat([df_new, df]).drop_duplicates(keep = False)

Unnamed: 0_level_0,title,subreddit
id,Unnamed: 1_level_1,Unnamed: 2_level_1
pg006s,COVID denialism and policy clarifications,announcements
pbmy5y,"Debate, dissent, and protest on Reddit",announcements
nw2hs6,Sunsetting Secret Santa and Reddit Gifts,announcements
mi01fg,Second,announcements
mcisdf,An update on the recent issues surrounding a R...,announcements
...,...,...
zq0n2b,WIBTA For Exposing My Dad to My Mom?,AmItheAsshole
zq0kzb,AITA for trying to rescue/take home/whatever a...,AmItheAsshole
zq0kv9,AITA for not wanting to gift hotel soaps for C...,AmItheAsshole
zq0k55,AITA for walking my dog on my own street?,AmItheAsshole


In [71]:
df.drop_duplicates(keep = 'first')

Unnamed: 0_level_0,title,subreddit
id,Unnamed: 1_level_1,Unnamed: 2_level_1
t93ec3,This subreddit is closed for new posts and com...,announcements
pg006s,COVID denialism and policy clarifications,announcements
pbmy5y,"Debate, dissent, and protest on Reddit",announcements
nw2hs6,Sunsetting Secret Santa and Reddit Gifts,announcements
mi01fg,Second,announcements
...,...,...
zq0n2b,WIBTA For Exposing My Dad to My Mom?,AmItheAsshole
zq0kzb,AITA for trying to rescue/take home/whatever a...,AmItheAsshole
zq0kv9,AITA for not wanting to gift hotel soaps for C...,AmItheAsshole
zq0k55,AITA for walking my dog on my own street?,AmItheAsshole


In [73]:
import pandas as pd

# Create a sample DataFrame
df = pd.DataFrame({'A': [1, 2, 2, 3, 3], 'B': [4, 5, 5, 6, 6], 'C': [7, 8, 8, 9, 9]})

# Find duplicate rows
duplicate_rows = df[df.duplicated()]

# Print the duplicate rows
print(duplicate_rows)


   A  B  C
2  2  5  8
4  3  6  9


In [78]:
df[df.duplicated()]

Unnamed: 0_level_0,title,subreddit
id,Unnamed: 1_level_1,Unnamed: 2_level_1
c0gl6,"We are aware that reddit appears hung over, an...",announcements
zsrwxs,What made you want to have kids?,AskReddit
zspw41,What do you want for Christmas?,AskReddit
zsppnr,What made you want to have kids?,AskReddit
zspbbv,What do you want for Christmas?,AskReddit
zsp834,What made you want to have kids?,AskReddit
zso4w7,What is on your Christmas wishlist?,AskReddit
zso42r,People who have their desserts before their ma...,AskReddit
zshreq,does crashing and desabling gpu driver means t...,gaming
zs6y9p,Kingdoms of Amalur: Re-Reckoning worth $12?,gaming
