In [1]:
import pandas as pd
import praw
import secrets
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import copy

# Class: Subreddit_Predictor

Objects of this class contain attributes and methods that can be broken up into three categories: **Data**, **Collections**, and **Processing**.

## Data
Pandas DataFrames and methods to update and clean the data.

**Attributes:**

| Name      |   Type    |             Description             |
|:----------|:---------:|:-----------------------------------|
| raw_data  | DataFrame |      The raw unprocessed data       |
| full_data | DataFrame |         The processed data          |
| subreddits |   list    | A list of subreddits. Extracted from full_data. |
| X_train   | pd.Series | the X portion of the training data  |
| Y_train   | np.array  | the Y portion of the training data  |
| X_test    | pd.Series |   the X portion of the test data    |
| Y_test    | np.array  |   the Y portion of the test data    |

**Methods:**

| Name (with input/output typing) | Description                                                                                                                               |
|---------------------------------| ------------------------------------------------------------------------------------------------------------------------------------------|
| add_data(df: DataFrame)         | Updates the raw_data attribute                                                                                                            |
| ready_data()                    | Cleans the data and does a test train split. <br/> Overwrites the full_data attribute. <br/> Creates the X_train, Y_train, X_test, and Y_test attributes. |

## Collections
Contains dictionaries of vectorizers, classifiers, and models

**Attributes:**

| Name                         | Type      | Description                                                                                                                                                                                                                                                                                                                                                                                                                        |
|:-----------------------------|:----------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Vectorizers                  | dict      | Dictionary of Vectorizer objects                                                                                                                                                                                                                                                                                                                                                                                                   |
| Feature_Vectors              | dict      | Dictionary of the vectorized full_data                                                                                                                                                                                                                                                                                                                                                                                             |
| Classifiers                  | dict      | Dictionary of Classifier objects                                                                                                                                                                                                                                                                                                                                                                                                   |
| Models                       | dict      | Dictionary of trained Classifier objects                                                                                                                                                                                                                                                                                                                                                                                           |
| Models_info                  | dict      | Dictionary containing a description of each model in Models.                                                                                                                                                                                                                                                                                                                                                                       |
| Predictions                  | DataFrame | A DataFrame with all the titles and actual subreddits in X_test and Y_test <br/> There is a column for each model that has the predictions                                                                                                                                                                                                                                                                                         |
| Results                      | DataFrame | A DataFrame. Each row represents a trained and tested model. There is 1 column for each subreddit and an additional column called 'Total'. Values in teh first columns are the Individual Recall Scores. The value in the last column is the Total Accuracy Score.                                                                                                                                                                 |
| Cross_Validation_Results     | DataFrame | A DataFrame. Each row represents a single run of Randomly Resampled and Stratified Cross-Validation. Each row is indexed by both the 'Model' and the 'Test No.', telling us whether this was the 1st resamplying or the 5th. There is 1 column for each subreddit and an additional column called 'Total'. Values in teh first columns are the Individual Recall Scores. The value in the last column is the Total Accuracy Score. |
| Cross_Validation_Results_avg | DataFrame | A DataFrame. Each row represents a Randomly Resampled and Stratified Cross-Validated model. Each row is indexed by both only 'Model'. There is 1 column for each subreddit and an additional column called 'Total'. Values in all columns are the Mean values from Cross-Validation_Results.                                                                                                                                       |
| Cross_Validation_Results_std | DataFrame | A DataFrame. Each row represents a Randomly Resampled and Stratified Cross-Validated model. Each row is indexed by both only 'Model'. There is 1 column for each subreddit and an additional column called 'Total'. Values in all columns are the Standard Deviation values from Cross-Validation_Results.                                                                                                                         |

**Methods:**

| Name (with input/output typing)                                                                                  | Description                                                                                                                                                                                |
|------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| add_vectorizer(model: Vectorizer)                                                                                | Trains the vectorizer. <br/>Adds (key = model.name, value = model) to Vectorizers                                                                                                               |
| add_feature_vectors(vectorizerName: str)| Embeds every title in full_data as vectors and creates a DataFrame for the new feature vectors. <br/> Adds this DataFrame to Feature_Vectors |
| add_classifier(model: Classifier)                                                                                | Adds (key = model.name, value = model) to Classifiers                                                                                                                                      |
| train_model(<br/>modelName: str, <br/>vectorizerName: str, <br/>classifierName: str, <br/>description = '' :str) | Takes the vectorizer and classifier from Vectorizers and Classifiers. <br/>Trains the classifier.<br/>Names and adds the trained model to Models.<br/>Adds the description text to Models_info. |
| test_model(modelName: str) | Runs the model against X_test and Y_test. <br/> Updates Predictions and Results                                                                                                            |

## Processing

**Methods:**

| Name (with input/output typing)                       | Description                                                                                   |
|-------------------------------------------------------|-----------------------------------------------------------------------------------------------|
| predict(modelName: str, titles: iter[str])            | Given a model and a list/Series of titles, returns a DataFrame with the model's prediction for each title. |
| compare(list_of_models: list[str], error_bars = False) | Creates a bar chart comparing each of the models on each of the subreddits.                   |

In [2]:
class Subreddit_Predictor:

    def __init__(self):

        #Data
        self.raw_data = pd.DataFrame({'id': [], 'title': [], 'subreddit': []})
        self.full_data = pd.DataFrame({'id': [], 'title': [], 'subreddit': []}).set_index('id')
        self.ALL_FEATURES = self.full_data.copy()
        self.subreddits = []

        #Collections
        self.Vectorizers = {}
        self.Feature_Vectors = {}
        self.Classifiers = {}
        self.Models = {}
        self.Models_info = {}
        #self.Predictions = {}
        #self.Results = {}

    #### Data ####

    def add_data(self, df):
        """df is a pandas DataFrame with columns={'title':[], 'subreddit':[]}. It will be merged with the existing raw_data"""
        self.raw_data = pd.concat([self.raw_data, df]).drop_duplicates(subset='id')

    def ready_data(self, test_size=.2, seed=42):
        """Splits and encodes the data. Saves is in X_train, Y_train, X_test, Y_test."""

        # Clean the data
        self._preclean_data()

        # Update the subreddits attribute
        self.subreddits = self.full_data['subreddit'].unique().tolist()

        # Encode the subreddits
        self._le = LabelEncoder()
        self._le.fit(self.subreddits)

        self.full_data['subreddit_num'] = self._le.transform(self.full_data['subreddit'])

        # Split the data
        input = self.full_data['title']
        output = self.full_data['subreddit_num']
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(input, output, test_size=test_size, random_state=seed, stratify=output)

        # Record the indices
        self._train_index = self.X_train.index
        self._test_index = self.X_test.index

        # Update Predictions
        df = self.full_data
        df = df.drop(columns=['subreddit_num'])
        df = df.loc[obj._test_index]
        df = df.rename(columns={'subreddit': 'actual'})
        self.Predictions = df

        # Update Results, Cross-Validation_Results, Cross-Validation_Results_avg, and Cross-Validation_Results_std
        self.Results = pd.DataFrame(columns=['Model'] + self.subreddits + ['Total']).set_index('Model')
        self.Cross_Validation_Results = pd.DataFrame(columns=['Model', 'Test No.'] + self.subreddits + ['Total']).set_index(['Model', 'Test No.'])
        self.Cross_Validation_Results_avg = pd.DataFrame(columns=['Model'] + self.subreddits + ['Total']).set_index('Model')
        self.Cross_Validation_Results_std = pd.DataFrame(columns=['Model'] + self.subreddits + ['Total']).set_index('Model')

        # Update ErrorBars
        cols = ['Model']
        for sub in self.subreddits:
            cols.append(str(sub) + ': avg')
            cols.append(str(sub) + ': std')
        cols.append('Total: avg')
        cols.append('Total: std')
        self.ErrorBars = pd.DataFrame(columns=cols).set_index('Model')

        # Get the basic ALL_FEATURES
        self.ALL_FEATURES = self.full_data.copy()


    def _preclean_data(self):
        """Cleans the data in raw_data and updates self.data"""

        df = self.raw_data

        # Remove all non-alpha-numeric characters
        #df['title'] = df['title'].str.replace(r'[^a-zA-Z0-9 ]', '', regex=True)

        # Make all the text lowercase
        #df['title'] = df['title'].str.lower()

        # Remove empty rows
        df['title'] = df['title'].str.strip()
        filter = df['title'] == ''
        df = df.drop(df[filter].index)

        # Store it as
        self.full_data = df

        # Change the index
        self.full_data = self.full_data.set_index('id')

    #### Collections ####

    def add_vectorizer(self, vectorizer):
        """This is how we add a vectorizers to our collection"""
        vectorizer.train(self.X_train)
        self.Vectorizers[vectorizer.vectorizerName] = vectorizer

    def add_feature_vectors(self, vectorizerName):
        """Updates Feature_Vectors"""
        vectorizer = self.Vectorizers[vectorizerName]
        self.Feature_Vectors[vectorizerName] = vectorizer.embed(self.full_data['title'])

    def add_classifier(self, classifier):
        """We add the classifier to our collection, self.Classifiers"""
        self.Classifiers[classifier.classifierName] = classifier

    def new_train_model(self, modelName, feature_list, classifierName, description=''):
        """
        :param modelName: The name of this model
        :param vectorizerName: Which feature vectors are we using?
        :param classifierName: Which classifier are we using?
        :param description: Write a short discription of the model (optional).
        :return: Adds a trained object of the classifier class to self.Models
        """

        self.Models_info[modelName] = {'feature_list': feature_list, 'classifierName': classifierName,
                                       'description': description}

        X_train = self.ALL_FEATURES.loc[self._train_index, feature_list]
        #print(X_train.info())
        #.loc[self._train_index]
        Y_train = self.Y_train
        classifier = copy.copy(self.Classifiers[classifierName])
        classifier.train(X_train, Y_train)

        self.Models[modelName] = classifier


    def new_test_model(self, modelName):
        """Tests the model using X_test and Y_test. Updates Predictions and Results"""

        model = self.Models[modelName]
        feature_list = self.Models_info[modelName]['feature_list']

        X_test = self.ALL_FEATURES.loc[self._test_index, feature_list]
        #print(X_test)

        yhat = model.predict(X_test)
        yhat = self._le.inverse_transform(yhat)
        self.Predictions[modelName] = yhat

        df = self.Predictions

        # Get Individual Recall Score
        row = {}
        for actual_sub in self.subreddits:
            filter = df['actual'] == actual_sub
            temp_df = df[filter]
            total = len(temp_df)
            correct = len(temp_df[temp_df[modelName] == actual_sub])
            row[actual_sub] = correct/total

        row['Total'] = np.mean(list(row.values()))

        self.Results.loc[modelName] = row

    def train_model(self, modelName, vectorizerName, classifierName, description=''):
        """
        :param modelName: The name of this model
        :param vectorizerName: Which feature vectors are we using?
        :param classifierName: Which classifier are we using?
        :param description: Write a short discription of the model (optional).
        :return: Adds a trained object of the classifier class to self.Models
        """

        self.Models_info[modelName] = {'vectorizerName': vectorizerName, 'classifierName': classifierName,
                                       'description': description}

        X_train = self.Feature_Vectors[vectorizerName].loc[self._train_index]
        Y_train = self.Y_train
        classifier = self.Classifiers[classifierName]
        classifier.train(X_train, Y_train)

        self.Models[modelName] = classifier

    def test_model(self, modelName):
        """Tests the model using X_test and Y_test. Updates Predictions and Results"""

        model = self.Models[modelName]

        vectorizerName = self.Models_info[modelName]['vectorizerName']
        X_test = self.Feature_Vectors[vectorizerName].loc[self._test_index]

        yhat = model.predict(X_test)
        yhat = self._le.inverse_transform(yhat)
        self.Predictions[modelName] = yhat

        df = self.Predictions

        # Get Individual Recall Score
        row = {}
        for actual_sub in self.subreddits:
            filter = df['actual'] == actual_sub
            temp_df = df[filter]
            total = len(temp_df)
            correct = len(temp_df[temp_df[modelName] == actual_sub])
            row[actual_sub] = correct/total

        row['Total'] = np.mean(list(row.values()))

        self.Results.loc[modelName] = row

        #The following is old code I don't use anymore, but may bring back. It saves the missclassification matrix to self.Results
        """
        results_matrix = pd.DataFrame(columns=self.subreddits)
        for actual_sub in self.subreddits:
            row = []
            filter = df['actual'] == actual_sub
            temp_df = df[filter]
            total = len(temp_df)
            for predicted_sub in self.subreddits:
                filter = temp_df[modelName] == predicted_sub
                count = len(temp_df[filter])
                row.append(count / total)

                #print('actual_sub', actual_sub, 'predicted_sub', predicted_sub)

            results_matrix.loc[actual_sub] = row

        self.Results[modelName] = results_matrix
        """

    def validate(self, modelName, num_epocs):

        classifierName = self.Models_info[modelName]['classifierName']
        vectorizerName = self.Models_info[modelName]['vectorizerName']

        X_train = self.Feature_Vectors[vectorizerName].loc[self._train_index]
        Y_train = self.Y_train
        classifier = self.Classifiers[classifierName]

        for i in tqdm(range(num_epocs)):

            try:
                test_num = 1 + max(list(self.Cross_Validation_Results.loc[modelName].index))
            except KeyError:
                test_num = 1

            X_train_inner, X_test_inner, Y_train_inner, Y_test_inner = train_test_split(X_train, Y_train, test_size=.15, random_state=test_num, stratify=Y_train)

            classifier.train(X_train_inner, Y_train_inner)
            model = classifier


            yhat = model.predict(X_test_inner)
            yhat = self._le.inverse_transform(yhat)
            df = pd.DataFrame({'actual': Y_test_inner, 'prediction':yhat})
            print(df)

            # Get Individual Recall Score
            row = {}
            for actual_sub in self.subreddits:
                filter = df['actual'] == actual_sub
                temp_df = df[filter]
                total = len(temp_df)
                correct = len(temp_df[temp_df['prediction'] == actual_sub])
                row[actual_sub] = correct/total

            row['Total'] = np.mean(list(row.values()))

            self.Cross_Validation_Results.loc[(modelName, test_num)] = row
            """
            df = self.Predictions

            # Get Individual Recall Score
            row = {}
            for actual_sub in self.subreddits:
                filter = df['actual'] == actual_sub
            temp_df = df[filter]
            total = len(temp_df)
            correct = len(temp_df[temp_df[modelName] == actual_sub])
            row[actual_sub] = correct/total

            row['Total'] = np.mean(list(row.values()))

            self.Results.loc[modelName] = row

            X_train = self.Feature_Vectors[vectorizerName].loc[self._train_index]

            Y_train = self.Y_train
            classifier = self.Classifiers[classifierName]
            classifier.train(X_train, Y_train)

            """

    #### Processing ####

    def predict(self, modelName, titles):
        """
        :param modelName: Which model are we using?
        :param titles: A list or series of titles
        :return: A data frame of 'title' and 'prediction'
        """

        model = self.Models[modelName]

        vectorizerName = self.Models_info[modelName]['vectorizerName']
        vectorizer = self.Vectorizers[vectorizerName]

        title_vectors = vectorizer.embed(titles)

        array = model.predict(title_vectors)
        df = pd.DataFrame(array)
        df['title'] = titles
        df['prediction'] = self._le.inverse_transform(array)
        df = df.drop(columns=[0])
        return df



    def compare(self, list_of_models, error_bars = False):
        """Creates a bar chart comparing the predictions from a list of models"""

        num_models = len(list_of_models)

        # Set the width of the bars
        bar_width = 1 / (num_models + 1)

        #Get the ticks position
        xticks = list(range(len(self.subreddits)+1))

        # Add a gap for the 'Total' category
        xticks[-1] += .5
        #print('xticks: ', xticks)

        #Get the name of all the categories
        categories = list(self.Results.columns)
        for i, cat in list(enumerate(categories))[:-1]:
            categories[i] = 'r/'+cat
        #print('categories:', categories)

        #Get the relative bar positions right
        left_start = bar_width*(len(list_of_models)-1)/2

        fig, ax1 = plt.subplots()
        ax2 = ax1.twinx()


        for j, model in enumerate(list_of_models):
            values = list(self.Results.loc[model])
            #print('values: ', values)

            x_pos = [i - left_start + j*bar_width for i in xticks]
            #print('x_pos: ', x_pos)


            ax1.bar(x_pos, values, bar_width, label=model)

            if error_bars:
                if model in self.Cross_Validation_Results_std.index:
                    mid_point = list(self.Cross_Validation_Results_avg.loc[model])
                    radius = list(self.Cross_Validation_Results_std.loc[model].apply(lambda x: 2*x))
                    ax1.errorbar(x_pos, mid_point, yerr = radius, fmt = 'none', ecolor='gray',  elinewidth=2, capsize=5)



        # Set the x-axis tick labels
        ax1.set_xticks(xticks)
        ax1.set_xticklabels(categories, rotation=45, ha='right', rotation_mode = 'anchor')

        # Add a title and axis labels
        plt.title('Recall and Accuracy Comparison')
        plt.xlabel('Subreddit')
        ax1.set_ylabel('Individual Recall Scores')
        ax2.set_ylabel("Total Accuracy Score", rotation = -90)
        ax2.get_yaxis().set_label_coords(1.1,0.5)

        ax1.set_ylim(0,1.1)
        ax2.set_ylim(0,1.1)

        # Add a legend
        ax1.legend()

        # Show the plot
        plt.show()

# Class: Vectorizer

Objects of this class are vectorizers, like Bag-of-Words or Doc2Vec. They have very few attributes and methods.
All of the attributes and methods will be overwritten by each object of this class.

**Attributes:**

| Name           | Type | Description                                                                                         |
|----------------|------|-----------------------------------------------------------------------------------------------------|
| vectorizerName | str | The name of this vectorizer. <br/> This will be the key for any dictionaries containing it.         |
| description    | str | A brief description of what this vectorizer is/does. <br/>Put the parameters here if there are any. |
| model | Other | The actual model. Typically an object of a class like Gensim or SCM |

**Methods:**

| Name                               | Description                                                                                          |
|------------------------------------|------------------------------------------------------------------------------------------------------|
| train(X_train: Series)             | Uses the training data to train the model.                                                           |
| embed(titles: Series) -> DataFrame | Takes a list/dataFrame of titles and returns a DataFrame of the embeddings for each of them. |



In [3]:
class Vectorizer:
    """Common wrapper for every Title Vectorizer (Bag-of-Words, Doc2Vec, ...).

    Each vectorizer is one object of this class. The real work is done by the `_train` and
    `_embed` functions, which are meant to be replaced on each object; `train` and `embed`
    give all of them the same input/output.
    """

    def __init__(self, vectorizerName):
        self.description = "Description goes here"
        self.vectorizerName = vectorizerName

    def train(self, X_train):
        """Trains on the training data and keeps whatever `_train` returns as self.model."""
        fitted = self._train(X_train)
        self.model = fitted

    def _train(self, X_train):
        """Placeholder; replace it with the function that does the actual training."""
        return None

    def embed(self, titles):
        """Returns whatever `_embed` produces for these titles, using the model stored by `train`.

        The function supplied as `_embed` is expected to return a data frame of features, indexed by id.
        """
        return self._embed(titles, self.model)

    def _embed(self, titles, model):
        """Placeholder; replace it with the function that does the actual embedding."""
        return None



# Class: Classifier

This class holds the classifiers, like XGBoost and Support Vector Machines.
It also has very few attributes and methods.

**Attributes:**

|Name | Type | Description |
|-----|-------|--------|
| classifierName | str | The name of this classifier |
| description | str | A brief description of this classifier |
| model | Other | Where the actual model is stored. Typically a member of a totally different class. |

**Methods:**

|Name | Description|
|-----| --------|
|train(X_train: pd.Series, Y_train: np.array) | Trains the model |
|predict(titles: pd.Series) | predicts where each title should go |

In [4]:
class Classifier:
    """This is the class that holds the classifiers.

    Each classifier is one object of this class. The real work is done by the `_train` and
    `_predict` functions, which are meant to be replaced on each object.
    """

    def __init__(self, classifierName):
        self.classifierName = classifierName
        # Same placeholder text as Vectorizer, so the `description` attribute always exists
        self.description = "Description goes here"

    def train(self, X_train, Y_train):
        """Input the X and Y training data. Stores whatever `_train` returns as self.model."""

        self.model = self._train(X_train, Y_train)

    def _train(self, X_train, Y_train):
        """Placeholder; replace it with the function that does the actual training."""
        pass

    def predict(self, title_vectors):
        """
        :param title_vectors: A pandas dataframe of the vectorized titles
        :return: Whatever `_predict` returns for these vectors and the stored model (the predictions)
        """

        return self._predict(title_vectors, self.model)

    def _predict(self, titles, model):
        """Placeholder; replace it with the function that does the actual prediction."""
        pass
