In [11]:
# Import some common packages
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

In [12]:
# Load data
data = pd.read_json('../data/relationship_advice.json')
data['distinguished'] = data['distinguished'].fillna(value='none')

In [19]:
# Functions for categorization criteria (see `playground.ipynb` for detailed explanation)

# compute boundaries
def compute_bounds(data):
    res  = dict()
    P_90 = np.percentile(data,90)
    # excluding forall d <= 0
    log_data = np.log([d for d in data if d > P_90])
    res['INSIGNIFICANT'] = np.exp(np.percentile(log_data, 0)) # equivalent to np.percentile(data, 90)
    res['NOTABLE']       = np.exp(np.percentile(log_data, 60))
    res['SIGNIFICANT']   = np.exp(np.percentile(log_data, 90))
    res['POPULAR']       = np.exp(np.percentile(log_data, 99.0))
    # implicit 'VERY POPULAR'
    return res
    
# autocategorization
def categorize(d, bounds):
    # try all categories
    for cat, val in bounds.items():
        if d < val:
            return cat
    # if at this point, VERY POPULAR
    return 'VERY POPULAR'

In [20]:
# Clean data and generate features
import afinn

afinn_nlp = afinn.Afinn(language='en', emoticons=True)
data['sentiment'] = data['body'].apply(lambda text: afinn_nlp.score(text))

data['word_count'] = data['body'].apply(lambda text: len(text.split(' ')))
data['char_count'] = data['body'].apply(lambda text: len(text))

# categorize using defined boundaries (lambda now accepts bounds param, which is a dict)
bounds = compute_bounds(data['score'])
data['score_cat'] = data['score'].apply(lambda score: categorize(score, bounds))


### DEPRECATED
# def categorize(score):
#     if score < 0:
#         return 'UNPOPULAR'
#     elif score >= 0 and score < 100:
#         return 'UNREMARKABLE'
#     elif score >= 100 and score < 500:
#         return 'SOMEWHAT POPULAR'
#     elif score >= 500 and score < 1000:
#         return 'POPULAR'
#     else:
#         return 'VERY POPULAR'
# data['score_cat'] = data['score'].apply(lambda score: categorize(score))

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words='english', sublinear_tf=True)
train, test = train_test_split(data, test_size = 0.2)

tr_scores, tr_comments, tr_meta = train['score_cat'], train['body'], train.drop(columns=['id', 'parent_id', 'top_level', 'created_utc', 'body', 'score', 'score_cat'])
tst_scores, tst_comments, tst_meta = test['score_cat'], test['body'], test.drop(columns=['id', 'parent_id', 'top_level', 'created_utc', 'body', 'score', 'score_cat'])

In [22]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

features = data.drop(columns=['id', 'parent_id', 'top_level', 'created_utc', 'body', 'score', 'score_cat'])
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), list(features.select_dtypes(include=[np.number]))),
    ('cat', OneHotEncoder(), list(features.select_dtypes(include=[np.object]).columns))
])

In [23]:
x_tr_comments = tf.fit_transform(tr_comments)
x_tst_comments = tf.transform(tst_comments)

x_tr_meta = full_pipeline.fit_transform(tr_meta)
x_tst_meta = full_pipeline.transform(tst_meta)

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

dt_comm = DecisionTreeClassifier()
dt_comm.fit(x_tr_comments, tr_scores)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [25]:
comm_pred = dt_comm.predict(x_tst_comments)
print("Accuracy of the DT  based on TFIDF: ", accuracy_score(tst_scores, comm_pred))
print("Precision of the DT based on TFIDF: ", precision_score(tst_scores, comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the DT based on TFIDF: ", recall_score(tst_scores, comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the DT based on TFIDF: ", f1_score(tst_scores, comm_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the DT  based on TFIDF:  0.849769238014505
Precision of the DT based on TFIDF:  0.8268327829906992
Recall of the DT based on TFIDF:  0.849769238014505
F-Score of the DT based on TFIDF:  0.8379446618785816


In [26]:
dt_meta = DecisionTreeClassifier()
dt_meta.fit(x_tr_meta, tr_scores)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [27]:
meta_pred = dt_meta.predict(x_tst_meta)
print("Accuracy of the DT based on metadata: ", accuracy_score(tst_scores, meta_pred))
print("Precision of the DT based on metadata: ", precision_score(tst_scores, meta_pred, p)
print("Recall of the DT based on metadata: ", recall_score(tst_scores, meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the DT based on metadata: ", f1_score(tst_scores, meta_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the DT based on metadata:  0.8304605820853348
Precision of the DT based on metadata:  0.8424793732899176
Recall of the DT based on metadata:  0.8304605820853348
F-Score of the DT based on metadata:  0.8363510942323107


#### Try using grid search on decision tree for optimizing hyperparams

In [28]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# choose params to try + optimize
param_grid = {
    'max_depth': [2, 4, 6, 8],
    'criterion': ['gini', 'entropy']
}
# classifier and grid search definition
dt_comm_opt = DecisionTreeClassifier()
dt_meta_opt = DecisionTreeClassifier()
grid_comm = GridSearchCV(dt_comm_opt, param_grid)
grid_meta = GridSearchCV(dt_meta_opt, param_grid)

In [29]:
# fit and search
grid_comm.fit(x_tr_comments, tr_scores)

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                

In [31]:
# fit and search
grid_meta.fit(x_tr_meta, tr_scores)

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                

In [30]:
# basic info
print('Results for comm')
print('Best max_depth:', grid_comm.best_estimator_.get_params()['max_depth'])
print('Best criterion:', grid_comm.best_estimator_.get_params()['criterion'])

# cross validation
results = cross_val_score(grid_comm, x_tr_comments, tr_scores)
print(results)

Results for comm
Best max_depth: 2
Best criterion: entropy
[0.90169531 0.90169531 0.90193077 0.90168374 0.90180148]


In [32]:
# basic info
print('Results for meta')
print('Best max_depth:', grid_meta.best_estimator_.get_params()['max_depth'])
print('Best criterion:', grid_meta.best_estimator_.get_params()['criterion'])

# cross validation
results = cross_val_score(grid_meta, x_tr_meta, tr_scores)
print(results)

Results for meta
Best max_depth: 4
Best criterion: gini
[0.9020485  0.90134212 0.90216623 0.90203697 0.90180148]


### Adaboost

In [33]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

ada_comm = AdaBoostClassifier()
ada_comm.fit(x_tr_comments, tr_scores)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [34]:
ada_comm_pred = ada_comm.predict(x_tst_comments)
print("Accuracy of the Adaboost based on TFIDF: ", accuracy_score(tst_scores, ada_comm_pred))
print("Precision of the Adaboost based on TFIDF: ", precision_score(tst_scores, ada_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the Adaboost based on TFIDF: ", recall_score(tst_scores, ada_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the Adaboost based on TFIDF: ", f1_score(tst_scores, ada_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the Adaboost based on TFIDF:  0.9023264575680512
Precision of the Adaboost based on TFIDF:  0.8152404733503618
Recall of the Adaboost based on TFIDF:  0.9023264575680512
F-Score of the Adaboost based on TFIDF:  0.8565757003611943


  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
ada_meta = AdaBoostClassifier()
ada_meta.fit(x_tr_meta, tr_scores)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [36]:
ada_meta_pred = ada_meta.predict(x_tst_meta)
print("Accuracy of the Adaboost based on metadata: ", accuracy_score(tst_scores, ada_meta_pred))
print("Precision of the Adaboost based on metadata: ", precision_score(tst_scores, ada_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the Adaboost based on metadata: ", recall_score(tst_scores, ada_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the Adaboost based on metadata: ", f1_score(tst_scores, ada_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the Adaboost based on metadata:  0.9024206461335594
Precision of the Adaboost based on metadata:  0.8280282360702597
Recall of the Adaboost based on metadata:  0.9024206461335594
F-Score of the Adaboost based on metadata:  0.8587000878693295


### Gradient Boosted

In [37]:
gb_comm = GradientBoostingClassifier()
gb_comm.fit(x_tr_comments, tr_scores)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [38]:
gb_comm_pred = gb_comm.predict(x_tst_comments)
print("Accuracy of the GB based on TFIDF: ", accuracy_score(tst_scores, gb_comm_pred))
print("Precision of the GB based on TFIDF: ", precision_score(tst_scores, gb_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the GB based on TFIDF: ", recall_score(tst_scores, gb_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the GB based on TFIDF: ", f1_score(tst_scores, gb_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the GB based on TFIDF:  0.897711217858152
Precision of the GB based on TFIDF:  0.8155344000160742
Recall of the GB based on TFIDF:  0.897711217858152
F-Score of the GB based on TFIDF:  0.8546376137659321


In [39]:
gb_meta = GradientBoostingClassifier()
gb_meta.fit(x_tr_meta, tr_scores)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [40]:
gb_meta_pred = gb_meta.predict(x_tst_meta)
print("Accuracy of the GB based on metadata: ", accuracy_score(tst_scores, gb_meta_pred))
print("Precision of the GB based on metadata: ", precision_score(tst_scores, gb_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the GB based on metadata: ", recall_score(tst_scores, gb_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the GB based on metadata: ", f1_score(tst_scores, gb_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the GB based on metadata:  0.9033625317886409
Precision of the GB based on metadata:  0.8575059756424817
Recall of the GB based on metadata:  0.9033625317886409
F-Score of the GB based on metadata:  0.8585857924605989


### Random Forest

In [41]:
rf_comm = RandomForestClassifier()
rf_comm.fit(x_tr_comments, tr_scores)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [42]:
rf_comm_pred = rf_comm.predict(x_tst_comments)
print("Accuracy of the RF based on TFIDF: ", accuracy_score(tst_scores, rf_comm_pred))
print("Precision of the RF based on TFIDF: ", precision_score(tst_scores, rf_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the RF based on TFIDF: ", recall_score(tst_scores, rf_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the RF based on TFIDF: ", f1_score(tst_scores, rf_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the RF based on TFIDF:  0.8956390694169728
Precision of the RF based on TFIDF:  0.8205424858590274
Recall of the RF based on TFIDF:  0.8956390694169728
F-Score of the RF based on TFIDF:  0.854316831211526


In [43]:
rf_meta = RandomForestClassifier()
rf_meta.fit(x_tr_meta, tr_scores)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [44]:
rf_meta_pred = rf_meta.predict(x_tst_meta)
print("Accuracy of the RF based on metadata: ", accuracy_score(tst_scores, rf_meta_pred))
print("Precision of the RF based on metadata: ", precision_score(tst_scores, rf_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the RF based on metadata: ", recall_score(tst_scores, rf_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the RF based on metadata: ", f1_score(tst_scores, rf_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the RF based on metadata:  0.8986531035132335
Precision of the RF based on metadata:  0.8303348237541501
Recall of the RF based on metadata:  0.8986531035132335
F-Score of the RF based on metadata:  0.8587995663731227
