# Step 1: import subreddit data

In [1]:
def import_data_subreddit(latest_date = '1 July, 2020'):
    """
    Pull posts from the "GlobalWarming" and "ConspiracyTheory" subreddits and
    save each pull to ../datasets/<subreddit>_raw.csv.

    param: latest_date -- the latest date to pull data from, written like
        '1 July, 2020' (parsed with the format '%d %B, %Y' and converted to a
        timestamp with time.mktime, i.e. interpreted in local time).
    returns: None; the pulled data is only written to disk.
    raises: ValueError if latest_date does not match the expected format.
    """
    #imports
    import sys
    import time

    # get_reddit_posts lives in ../assets; only extend sys.path once so repeated
    # calls of this function do not keep growing it.
    if '../assets' not in sys.path:
        sys.path.insert(1, '../assets')
    from get_reddit_posts import get_reddit_posts

    # (subreddit title, number of posts to pull). ConspiracyTheory is a smaller
    # subreddit, hence the lower post count.
    pulls = [
        ("GlobalWarming", 2000),
        ("ConspiracyTheory", 600),
    ]
    api_limit = 100  # API pull number limit for reddit per request
    api_wait = 1     # API wait time (seconds) before the next request
    # Pause between the two subreddit pulls. The original code slept for
    # par["API_limit"] (a post count, 100) although the intent was one minute.
    pause_between_subreddits = 60

    # Parse the date once, before any network call, so a bad date fails fast.
    latest_timestamp = int(time.mktime(time.strptime(latest_date, '%d %B, %Y')))

    for position, (subreddit, post_num) in enumerate(pulls):
        if position > 0:
            time.sleep(pause_between_subreddits)

        posts = get_reddit_posts(subreddit, post_num, latest_timestamp, api_limit, api_wait)

        file_path = "../datasets/" + subreddit + "_raw" + ".csv"
        posts.to_csv(file_path)

100 posts downloaded, oldest post:2020-04-07 14:38:14 - status code: 200, now waiting 1 seconds before next pull. Patience...
100 posts downloaded, oldest post:2020-01-28 18:03:38 - status code: 200, now waiting 1 seconds before next pull. Patience...
100 posts downloaded, oldest post:2019-12-27 14:02:47 - status code: 200, now waiting 1 seconds before next pull. Patience...
100 posts downloaded, oldest post:2019-11-04 02:20:22 - status code: 200, now waiting 1 seconds before next pull. Patience...
100 posts downloaded, oldest post:2019-09-29 11:52:17 - status code: 200, now waiting 1 seconds before next pull. Patience...
100 posts downloaded, oldest post:2019-08-26 02:58:05 - status code: 200, now waiting 1 seconds before next pull. Patience...
100 posts downloaded, oldest post:2019-07-25 10:04:50 - status code: 200, now waiting 1 seconds before next pull. Patience...
100 posts downloaded, oldest post:2019-06-19 18:21:40 - status code: 200, now waiting 1 seconds before next pull. Pati

# Step 2: clean the dataset (the major cleaning is on the NLP end)

In [2]:
def clean_data_subreddit():
    """
    Clean the imported subreddit data and pickle the cleaned dataframe.

    Reads ../datasets/GlobalWarming_raw.csv and ../datasets/ConspiracyTheory_raw.csv,
    keeps a fixed set of columns, merges title and selftext into a single
    'text_merged' column, tags every row with its subreddit, normalises the
    text (letters only, lower case, stop words removed, Porter-stemmed) and
    writes the combined frame to ../datasets/df_reddit.pkl.

    returns: None; the result is only written to disk.
    """
    #imports
    import pandas as pd
    import regex as re
    import warnings
    # Kept from the original implementation: silences all warnings process-wide.
    warnings.filterwarnings('ignore')
    from nltk.corpus import stopwords # Import the stopword list
    from nltk.stem.porter import PorterStemmer

    import pickle

    # Keeping only a few columns which will be helpful during analysis
    to_keep_clmns = ['author', 'created_utc', 'domain', 'id', 'num_comments', 'over_18',
           'post_hint', 'score', 'selftext',
           'title']

    def load_and_tidy(subreddit):
        """Load one raw CSV and return the tidied frame for that subreddit."""
        file_path = "../datasets/" + subreddit + "_raw" + ".csv"
        raw = pd.read_csv(file_path)

        # Explicit copy: the original filled NaNs in place on a slice of the raw
        # frame (chained assignment), which does not reliably update the slice.
        frame = raw[to_keep_clmns].copy()

        # Missing title/selftext become " " so the two can be concatenated;
        # the extra whitespace is collapsed later in text_cleaning.
        title = frame["title"].fillna(" ")
        selftext = frame["selftext"].fillna(" ")
        frame["text_merged"] = title + " " + selftext
        frame = frame.drop(columns=["title", "selftext"])

        # For post_hint, missing values are imputed with "Empty"
        frame["post_hint"] = frame["post_hint"].fillna("Empty")

        frame["subreddit"] = subreddit
        return frame

    # Merging the two subreddits into one frame with a fresh 0..n-1 index
    df_reddit = pd.concat(
        [load_and_tidy("GlobalWarming"), load_and_tidy("ConspiracyTheory")],
        axis=0,
        ignore_index=True,
    )

    # Built once instead of once per row; a set makes the membership test O(1).
    stops = set(stopwords.words("english"))
    p_stemmer = PorterStemmer()

    def text_cleaning(item):
        """Return item reduced to lower-case, stop-word-free, stemmed words."""
        #Removing "\n" characters
        item = re.sub("\n", " ", item)
        #Removing the [removed] marker (must happen before brackets are stripped)
        item = item.replace("[removed]", " ")
        #Keeping letters only
        item = re.sub("[^a-zA-Z]", " ", item)
        #Making all characters lower case
        item = item.lower()
        #Splitting on whitespace also collapses multiple spaces
        words = [w for w in item.split() if w not in stops]
        #Stemming and stitching the words back together
        return " ".join(p_stemmer.stem(w) for w in words)

    df_reddit["text_merged"] = df_reddit["text_merged"].apply(text_cleaning)

    # Pickling the dataframe as they are large!
    with open('../datasets/df_reddit.pkl', 'wb') as pkl_file:
        pickle.dump(df_reddit, pkl_file)

# Step 3: Transform features and Train/Test split

In [8]:
def train_test_nlp_process():
    """
    Load the cleaned reddit frame, balance the classes, split into train/test
    and turn the text into bag-of-words count features.

    Reads ../datasets/df_reddit.pkl, encodes the target (GlobalWarming -> 1,
    ConspiracyTheory -> 0), bootstraps the smaller class up to the size of the
    larger one, performs a stratified 75/25 split (random_state=101) and fits a
    CountVectorizer (max 4000 features, English + custom stop words) on the
    training text only.

    returns: (X_train, X_test, y_train, y_test) where X_* are the vectorized
        document-term count matrices and y_* are the 0/1 target Series.
    raises: ValueError if the 'subreddit' column holds a value other than
        "GlobalWarming" or "ConspiracyTheory" (it cannot be encoded as int).
    """
    #imports -- only what this function uses. The original also imported
    # sklearn.metrics.plot_confusion_matrix (removed in scikit-learn 1.2), eli5,
    # seaborn, etc., none of which were used and some of which fail to import.
    import pickle
    import warnings

    import pandas as pd
    from nltk.corpus import stopwords
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.model_selection import train_test_split

    # Kept from the original implementation: silences all warnings process-wide.
    warnings.filterwarnings('ignore')

    ################# Pulling cleaned data
    # NOTE: pickle.load can execute arbitrary code; only load files written by
    # clean_data_subreddit(), never a pickle from an untrusted source.
    with open('../datasets/df_reddit.pkl', 'rb') as pkl_file:
        df_reddit = pickle.load(pkl_file)

    ####Defining the target variable
    # map + astype(int) yields an integer column directly instead of relying on
    # Series.replace silently downcasting an object column.
    label_by_subreddit = {"GlobalWarming": 1, "ConspiracyTheory": 0}
    df_reddit['target'] = df_reddit['subreddit'].map(label_by_subreddit).astype(int)

    ############# Bootstrapping the under-represented class to balance the classes:
    global_warming = df_reddit[df_reddit["subreddit"] == "GlobalWarming"]
    conspiracy = df_reddit[df_reddit["subreddit"] == "ConspiracyTheory"]

    # Oversample whichever class is smaller; the original assumed it was always
    # ConspiracyTheory and passed a negative n to sample() otherwise.
    minority = conspiracy if len(conspiracy) <= len(global_warming) else global_warming
    n_bts_sample = abs(len(global_warming) - len(conspiracy))

    df_btsp = minority.sample(n = n_bts_sample, replace=True, random_state=101)

    # NOTE(review): oversampling happens before the train/test split, so copies
    # of the same post can land in both train and test and inflate test scores.
    # Left unchanged to keep existing results comparable -- consider splitting
    # first and bootstrapping the training part only.
    df_reddit_btsp = pd.concat([df_reddit, df_btsp]).reset_index(drop=True)

    ################### Defining X and Y variables:
    X = df_reddit_btsp['text_merged']
    y = df_reddit_btsp['target']

    #################### Train/test split:
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size = 0.25,
                                                        stratify=y,
                                                        random_state = 101)

    ############################ Vectorizing the text using CountVectorizer()
    # Custom stop words are stems of terms that name the subreddits or are URL
    # fragments; they are appended to the English stop-word list.
    my_stop_words = ["dec", "global", "http", "www", "com", "conspiraci", "warm", "climat", "remov", "theori", "theactualshadow", "co"]
    list_stop_words = stopwords.words('english')
    list_stop_words.extend(my_stop_words)

    vectorizer = CountVectorizer(analyzer = "word",
                                 tokenizer = None,
                                 preprocessor = None,
                                 stop_words = list_stop_words,
                                 max_features = 4000)

    # Fit the vocabulary on the training text only, then reuse it for the test text.
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    return X_train, X_test, y_train, y_test


def rf_score_on_test_dataset(rf, X_test, y_test):
    """
    Score a fitted binary classifier on the test set.

    param: rf -- fitted model exposing predict(X)
    param: X_test -- test features passed to rf.predict
    param: y_test -- true binary labels for X_test
    returns: (Accuracy, Precision, Recall), each rounded to 3 decimals and
        computed from the confusion matrix of y_test against the predictions.
    raises: ValueError if the confusion matrix is not 2x2 (the four-way
        unpacking below needs exactly four cells).
    """
    # Only confusion_matrix is needed. The original also imported
    # plot_confusion_matrix, which was removed in scikit-learn 1.2 and made
    # this function fail with ImportError on current versions.
    from sklearn.metrics import confusion_matrix

    preds = rf.predict(X_test)

    # Save confusion matrix values (row = true label, column = predicted label)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

    Accuracy = round((tp + tn) / (tp + tn + fp + fn), 3)
    Precision = round(tp / (tp + fp), 3)
    Recall = round(tp / (tp + fn), 3)

    return Accuracy, Precision, Recall
    


# ml_retrain_registry_random_forest_subreddit()

In [14]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn

import logging

# Configure the root logger so only WARNING-level records and above are emitted.
logging.basicConfig(level=logging.WARN)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


def ml_retrain_registry_random_forest_subreddit(n_estimators, max_depth, min_samples_split,
                                                latest_date='1 July, 2020', random_state=None):
    """
    Run the full retraining pipeline and record the run in MLflow.

    Steps: pull the subreddit data, clean it, build train/test features, fit a
    gini RandomForestClassifier, score it on the test set, then log the
    hyperparameters, the metrics and the fitted model to MLflow.

    param: n_estimators -- number of trees in the forest
    param: max_depth -- maximum tree depth (None for unlimited)
    param: min_samples_split -- minimum samples required to split a node
    param: latest_date -- latest post date forwarded to import_data_subreddit;
        defaults to the previously hard-coded '1 July, 2020'
    param: random_state -- optional seed for the forest; None (default) keeps
        the previous unseeded behaviour
    returns: None; results are printed and logged to MLflow.
    """
    #Step 1: import subreddit data
    import_data_subreddit(latest_date = latest_date)

    #Step 2: clean the dataset (major cleaning is on the NLP end)
    clean_data_subreddit()

    #Step 3: Transform features and Train/Test split
    X_train, X_test, y_train, y_test = train_test_nlp_process()

    #Step 4: ML build and score
    criterion = 'gini'
    # Keyword arguments: scikit-learn estimators take their hyperparameters as
    # keyword-only, so the original positional call fails on current versions.
    rf = RandomForestClassifier(n_estimators=n_estimators,
                                criterion=criterion,
                                max_depth=max_depth,
                                min_samples_split=min_samples_split,
                                random_state=random_state)

    rf.fit(X_train, y_train)

    Accuracy, Precision, Recall = rf_score_on_test_dataset(rf, X_test, y_test)

    #Step 5: Register the parameters and metrics on MLflow
    with mlflow.start_run():

        print("randomForest model (n_estimators=%s, max_depth=%s, min_samples_split=%s):" % (n_estimators, max_depth, min_samples_split))
        print("  Accuracy: %s" % Accuracy)
        print("  Precision: %s" % Precision)
        print("  Recall: %s" % Recall)

        # Hyperparameters are params; min_samples_split was previously logged
        # as a metric by mistake.
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("min_samples_split", min_samples_split)
        mlflow.log_param("criterion", criterion)
        mlflow.log_param("random_state", random_state)

        mlflow.log_metric("Accuracy", Accuracy)
        mlflow.log_metric("Precision", Precision)
        mlflow.log_metric("Recall", Recall)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        # Model registry does not work with file store, so only register the
        # model under a name when a non-file tracking backend is configured.
        if tracking_url_type_store != "file":
            mlflow.sklearn.log_model(rf, "model", registered_model_name="randomForestSubReddit")
        else:
            mlflow.sklearn.log_model(rf, "model")
            
    
    
    
# Hyperparameters for this retraining run.
n_estimators, max_depth, min_samples_split = 200, None, 2

# Kick off the full pull -> clean -> train -> log pipeline.
ml_retrain_registry_random_forest_subreddit(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
)


randomForest model (n_estimators=200, max_depth=None, min_samples_split=2):
  Accuracy: 0.968
  Precision: 0.987
  Recall: 0.947


# Run the MLflow UI

In [None]:
!mlflow ui