# Modeling



- https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
- https://stackoverflow.com/questions/19377969/combine-two-columns-of-text-in-dataframe-in-pandas-python
- https://www.geeksforgeeks.org/removing-stop-words-nltk-python/


## Imports

In [143]:
import pandas as pd
import numpy as np
import os
import regex as re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sqlalchemy import create_engine
import psycopg2
from pandas.io import sql

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import MultinomialNB

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

## Establish functions

In [117]:
# Create a function to scrub a text string
def scrub_text(in_text):
    # Instantiate lemmatizer
    lemma = WordNetLemmatizer()
    # Expanded stop words
    ext_stop = stopwords.words('english')
    ext_stop.extend(['im','ive','dont','hes','got'])
    # Remove punctuation and lower case words
    word_list = re.sub(r'[^a-zA-Z ]','',in_text.lower()).split()
#    word_list = [lemma.lemmatize(i) for i in word_list]
    word_list = [i for i in word_list if i not in ext_stop]
    return ' '.join(word_list)


# Read in data from SQL database
def read_from_database(SQL, engine = None):
    # write posts to the posts table
    if engine == None:
        return
    return pd.read_sql(SQL, con = engine)


# Clean text columns
def clean_columns(df, features = [], subset = None):
    # For each feature in column scrub the text
    # and create a new column with the cleaned data
    for feature in features:
        try:
            df[feature] = df[feature].apply(scrub_text)
        except:
            pass
    
    # drop the duplicates using the subset column as the key
    df.drop_duplicates(subset = subset, inplace=True)
    

# Read data from database and output model-ready dataframe
def create_model_df(engine = None, y = 'subreddit', X_columns = [], subset = None,
                    subreddit_1 = None, subreddit_2 = None):
    
    # Build the SQL
    SQL =  "SELECT " + y + ", " + ", ".join(X_columns) + " FROM posts WHERE "
    SQL = SQL + y + " = \'" + str(subreddit_1) + "\'" + " OR " + y + " = \'" + str(subreddit_2) + "\'"
    
    # Read this data from the database
    df = read_from_database(SQL, engine = engine)
    
    # Clean the columns and drop duplicates
    clean_columns(df, features = X_columns, subset = subset)
    
    # Merge the X_columns into a single text column
    df["_".join(X_columns)] = df[X_columns].apply(lambda x: ' '.join(x), axis = 1)
        
    df.drop(columns = X_columns, inplace = True)
    
    # Binarize the surreddit variables
    df[y] = df[y].map({subreddit_1: 1, subreddit_2: 0})
    
    return df


# Classification models
def class_model(df, x_col = None, y_col = None, vectorizer = 'cvec', model = 'bayem',
                random_state = 42):
    # Assign X and y variables
    X = df[x_col]
    y = df[y_col]
    
    # Train test split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = random_state)
    
    # Set up the Pipeline
    if vectorizer == 'cvec' and model == 'logistic':
        pipe = Pipeline([('vec', CountVectorizer()),
                         ('mod', LogisticRegression())
                        ])
    elif vectorizer == 'tfidf' and model == 'logistic':
        pipe = Pipeline([('vec', TfidfVectorizer()),
                         ('mod', LogisticRegression())
                        ])
    elif vectorizer == 'cvec' and model == 'bayem': 
        pipe = Pipeline([('vec', CountVectorizer()),
                         ('mod', MultinomialNB())
                        ])
    elif vectorizer == 'tfidf' and model == 'bayem':
        pipe = Pipeline([('vec', TfidfVectorizer()),
                         ('mod', MultinomialNB())
                        ])

#     pipe_params = {'vec__stop_words': ['english'],
#                     'vec__max_features': [None, 500, 1000],
#                     'vec__min_df': [0.0],
#                     'vec__max_df': [1.0],
#                     'vec__ngram_range': [(1,1), (1,2)]
#                     }
    pipe_params = {'vec__max_features': [None, 500, 1000],
                    'vec__min_df': [0.0],
                    'vec__max_df': [1.0],
                    'vec__ngram_range': [(1,1), (1,2)]
                    }
      
    
    # Set up the grid search
    mod_out = {}
    gs = GridSearchCV(estimator = pipe, param_grid = pipe_params, cv = 3)
    gs.fit(X_train, y_train)
    mod_out['train_score'] = gs.score(X_train, y_train)
    mod_out['test_score'] = gs.score(X_test, y_test)
    mod_out['y_test'] = y_test
    mod_out['y_train'] = y_train
    mod_out['y'] = y
    mod_out['pred'] = gs.predict(X_test)
    mod_out['proba'] = gs.predict_proba(X_test)
    mod_out['best_param'] = gs.best_params_
    mod_out['vocabulary'] = gs.best_estimator_.named_steps['vec'].vocabulary_
    mod_out['features'] = gs.best_estimator_.named_steps['vec'].get_feature_names()
    mod_out['vector_matrix'] = gs.best_estimator_.named_steps['vec'].transform(X)
    
    return mod_out


# Read data from database and output model-ready dataframe
def data_overview(engine = None, y = 'subreddit', X_columns = [], subset = None,
                    subreddit_1 = None, subreddit_2 = None):
    
    # Build the SQL
    SQL =  "SELECT * FROM posts " 
    
    # Read this data from the database
    df = read_from_database(SQL, engine = engine)
    
    for col in X_columns
    
    


## Establish parameters

In [118]:
# Database engine
engine = create_engine('postgres://postgres:pass@34.222.13.94:5432')


In [136]:
# Build the SQL
SQL =  "SELECT * FROM posts " 
    
# Read this data from the database
df_a = read_from_database(SQL, engine = engine)
df_a.head()

Unnamed: 0,author_fullname,comments,created_utc,id,name,num_comments,num_comments_cap,permalink,selftext,subreddit,title
0,t2_3l4ha,My fiancé and I just found out we’re having a ...,1553870000.0,b6y1o8,t3_b6y1o8,29,22,/r/relationships/comments/b6y1o8/its_love_fest...,Time to share your happy stories with us. We r...,relationships,It's Love Fest Friday!
1,t2_3i9a63ve,"honestly, i thought this was going somewhere c...",1553923000.0,b77few,t3_b77few,60,39,/r/relationships/comments/b77few/me_20m_with_m...,So this is gonna be weird all around because t...,relationships,Me [20M] with my father [40M] who I met months...
2,t2_37ml46ds,&gt;He also went overdrawn because of this and...,1553891000.0,b7273a,t3_b7273a,101,59,/r/relationships/comments/b7273a/update_bf_31m...,"UPDATE from previous post, original link here:...",relationships,UPDATE: bf (31m) still using webcam girls desp...
3,t2_tdy0zf1,Break up. This does not get better. His behavi...,1553914000.0,b769id,t3_b769id,69,49,/r/relationships/comments/b769id/boyfriend_21m...,"throwaway because he knows my reddit account, ...",relationships,Boyfriend [21M] is constantly stalking me [20F...
4,t2_1m5pvmux,"Honestly, you won’t be enough for her until yo...",1553924000.0,b77mho,t3_b77mho,18,15,/r/relationships/comments/b77mho/i_25m_have_an...,"I get the ""then that means she isnt right for ...",relationships,I (25M) have an inferiority complex and cannot...


In [173]:

sub_groups = ['relationships', 'diy','politics', 'woodworking']

df_a['word_count'] = df_a['selftext'].apply(lambda x: len(x.split()))
df_a['char_count'] = df_a['selftext'].apply(lambda x: len(x))

avgs = df_a.groupby('subreddit').mean().unstack()
avgs['word_count']['DIY']

# fig, ax = plt.subplots(figsize = (20,10))
# df_a[df_a['subreddit']=='relationships']
# for sub in sub_groups:
#     sns.distplot(df_a[df_a['subreddit']==sub]['word_count'], ax = ax, kde=False)
    



97.76923076923077

## Read in data from the database, clean data and binarize the reddit column

In [125]:
# Create the modeling data
# X_columns = ['title', 'selftext', 'comments']
# subreddit_1 = 'relationships', subreddit_2 = 'DIY'
# sub_groups = ['relationships', 'diy','politics', 'woodworking']

df = create_model_df(engine = engine, y = 'subreddit', X_columns = ['title', 'selftext'], 
                subset = ['selftext'], subreddit_1 = 'woodworking', subreddit_2 = 'DIY')

In [126]:
# Examine the dependent variable
df.subreddit.value_counts(normalize = True)


1    0.56621
0    0.43379
Name: subreddit, dtype: float64

In [127]:
df.head()

Unnamed: 0,subreddit,title_selftext
0,0,temporarily disable demolished shed electrics ...
1,0,refinished free table
2,0,sharkbite fittings restrict water pressure ins...
3,0,general feedbackgetting started questions answ...
8,0,polyurethane scratching refinished table seale...


## Compare models


In [132]:
# Create a classification model with these parameters
m1 = class_model(df, x_col = 'title_selftext', y_col = 'subreddit', vectorizer = 'cvec', 
            model = 'bayem', random_state = 42)


In [134]:

subreddit_1 = 'woodworking'
subreddit_2 = 'DIY'
m1_df = pd.DataFrame(m1['vector_matrix'].toarray(), columns = m1['features'])
m1_df.insert(0, 'subreddit_class', m1['y'])
m1_df['subreddit_class'] = m1_df['subreddit_class'].map({1:subreddit_1, 0:subreddit_2})
print("train score ", m1['train_score'])
print("test score ", m1['test_score'])



m1_df.groupby('subreddit_class').mean().T.sort_values(by = subreddit_1, ascending = False)




train score  1.0
test score  0.8363636363636363


subreddit_class,DIY,woodworking
like,0.449664,0.666667
would,0.657718,0.595238
wood,0.456376,0.523810
table,0.322148,0.500000
get,0.328859,0.357143
film,0.006711,0.357143
need,0.315436,0.333333
advice,0.187919,0.285714
good,0.208054,0.285714
find,0.067114,0.285714


In [None]:
X = df['title_selftext']
y = df['subreddit']



## Build a grid search pipeline