In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, plot_roc_curve, roc_auc_score, \
                            accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, \
                             GradientBoostingClassifier, AdaBoostClassifier, \
                             VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [16]:
df = pd.read_csv('./datasets/final_df')

In [17]:
df.drop(columns = 'Unnamed: 0', inplace = True)

In [18]:
y = df['above_median_comments']
y.value_counts(normalize=True)

0    0.510368
1    0.489632
Name: above_median_comments, dtype: float64

In [19]:
df.head()

Unnamed: 0,subreddit,selftext,gilded,title,link_flair_css_class,upvote_ratio,total_awards_received,domain,allow_live_comments,author_flair_text,...,created_utc,above_median_comments,date,clean_title,clean_selftext,clean_title_flair,clean_domain,clean_author_flair,clean_author,combi_text
0,nba,"90s Bulls\n\n- Kevin Johnson, MJ, Pippen, Rodm...",0,90s Bulls with Prime Mutombo and Rodman vs 90s...,,0.36,0,self.nba,True,,...,1594683349,1,2020-07-13 23:35:49,s bull prime mutombo rodman v s rocket prime l...,s bull kevin johnson mj pippen rodman mutomb...,,selfnba,,theunknownsoldier,s bull prime mutombo rodman v s rocket prime l...
1,nba,[deleted],0,Lebron vs MJ Great Article,,1.0,0,,False,,...,1594683301,0,2020-07-13 23:35:01,lebron v mj great article,deleted,,,,,lebron v mj great article deleted
2,nba,It’s common to just assume the Bucks will take...,0,Are we underrating the Eastern Conference play...,,0.59,0,self.nba,False,:okc-1: Thunder,...,1594683242,1,2020-07-13 23:34:02,underrating eastern conference playoff race,common just assume buck east im argue shouldnt...,,selfnba,okc thunder,vincemcmahonsburner,underrating eastern conference playoff race co...
3,nba,[deleted],0,Who is da GOAT,,0.24,0,,False,,...,1594683142,1,2020-07-13 23:32:22,da goat,deleted,,,,,da goat deleted
4,nba,[deleted],0,"Kawhi at practice in Orlando, Today!",,1.0,0,,False,,...,1594683115,0,2020-07-13 23:31:55,kawhi practice orlando today,deleted,,,,,kawhi practice orlando today deleted


# Train Test Split

In [112]:
vectorizers = {'cvec': CountVectorizer(),
               'tvec': TfidfVectorizer()}

In [113]:
models = {'lr': LogisticRegression(max_iter=1_000, random_state=42),
          'rf': RandomForestClassifier(random_state=42)}

In [114]:
def run_model(vec, mod, vec_params={}, mod_params={}, grid_search=False):
    
    results = {}
    
    pipe = Pipeline([
            (vec, vectorizers[vec]),
            (mod, models[mod])
            ])
    pipe.fit(X_train, y_train)
    
    # Retrieve metrics
    results['model'] = mod
    results['vectorizer'] = vec
    results['train'] = pipe.score(X_train, y_train)
    results['test'] = pipe.score(X_test, y_test)
    predictions = pipe.predict(X_test)
    
    print('METRICS')
    display(results)
    
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"True Positives: {tp}")
    
    return pipe

### Combi_text Logistic Regression

In [115]:
X = df['combi_text']
y = df['above_median_comments']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [116]:
cvec_lr = run_model('cvec', 'lr')

METRICS


{'model': 'lr',
 'vectorizer': 'cvec',
 'train': 0.8983464353483329,
 'test': 0.7235772357723578}

True Negatives: 995
False Positives: 261
False Negatives: 419
True Positives: 785


In [117]:
tvec_lr = run_model('tvec', 'lr')

METRICS


{'model': 'lr',
 'vectorizer': 'tvec',
 'train': 0.8311195445920304,
 'test': 0.7317073170731707}

True Negatives: 938
False Positives: 318
False Negatives: 342
True Positives: 862


### Combi_text Random Forest Classifier

In [118]:
cvec_rf = run_model('cvec', 'rf')

METRICS


{'model': 'rf',
 'vectorizer': 'cvec',
 'train': 0.9829222011385199,
 'test': 0.7483739837398374}

True Negatives: 1076
False Positives: 180
False Negatives: 439
True Positives: 765


In [119]:
tvec_rf = run_model('tvec', 'lr')

METRICS


{'model': 'lr',
 'vectorizer': 'tvec',
 'train': 0.8311195445920304,
 'test': 0.7317073170731707}

True Negatives: 938
False Positives: 318
False Negatives: 342
True Positives: 862


### Domain Logistic Regression

In [120]:
df['clean_domain'] = df['clean_domain'].fillna('None')

In [121]:
X = df['clean_domain']
y = df['above_median_comments']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [122]:
run_model('cvec', 'lr')

METRICS


{'model': 'lr',
 'vectorizer': 'cvec',
 'train': 0.6989699105448631,
 'test': 0.6959349593495935}

True Negatives: 883
False Positives: 373
False Negatives: 375
True Positives: 829


Pipeline(steps=[('cvec', CountVectorizer()),
                ('lr', LogisticRegression(max_iter=1000, random_state=42))])

In [123]:
run_model('tvec', 'lr')

METRICS


{'model': 'lr',
 'vectorizer': 'tvec',
 'train': 0.6989699105448631,
 'test': 0.6959349593495935}

True Negatives: 883
False Positives: 373
False Negatives: 375
True Positives: 829


Pipeline(steps=[('tvec', TfidfVectorizer()),
                ('lr', LogisticRegression(max_iter=1000, random_state=42))])

### Domain Random Forest Classifier

In [124]:
run_model('cvec', 'rf')

METRICS


{'model': 'rf',
 'vectorizer': 'cvec',
 'train': 0.7069666576307942,
 'test': 0.6967479674796748}

True Negatives: 879
False Positives: 377
False Negatives: 369
True Positives: 835


Pipeline(steps=[('cvec', CountVectorizer()),
                ('rf', RandomForestClassifier(random_state=42))])

In [125]:
run_model('tvec', 'rf')

METRICS


{'model': 'rf',
 'vectorizer': 'tvec',
 'train': 0.7069666576307942,
 'test': 0.6967479674796748}

True Negatives: 879
False Positives: 377
False Negatives: 369
True Positives: 835


Pipeline(steps=[('tvec', TfidfVectorizer()),
                ('rf', RandomForestClassifier(random_state=42))])

## Conclusion & Recommendations

Comparing the two classification models, we can see that the best model is the Random Forest Classifier with an accuracy of 0.7483739837398374.

Some of the limitations to our model would be that the number of comments is severely impacted on whether or not the NBA is in season, especially come playoff time. There is a more dedicated fanbase during playoffs that posts discussions of each game that is more indepth and may contain a different type of language compared to the off-season, where there are more meme-type posts. There are also some misclassification on whether or not a post has above the mean number of comments, which in this case was 9. Lastly, due to some of the subreddit rules, there may be heavy bias in title structure, which may have affected our overall outcome.

My Recommendations to Nate Silver and co. at FiveThirtyEight to create a Reddit post that will get the most engagement from Reddit users are to use words that are highly associated with r/NBA including names. Some names included Kevin Durant, Michael Jordan, and Lebron James. By utilizing their popularity in the post, it would have a better effect in attracting engagement from users. Highlights also seemed to have high usage in titles, partly due to the subreddit posting rules. With video clips of  highlights of various games throughout the week, users may be able to catch up to the games they were unable to see live, increasing the engagment in the post.