In [1]:
# Import some common packages
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load data
data = pd.read_json('../data/relationship_advice.json')
data['distinguished'] = data['distinguished'].fillna(value='none')

In [3]:
# Clean data and generate features
import afinn

afinn_nlp = afinn.Afinn(language='en', emoticons=True)
data['sentiment'] = data['body'].apply(lambda text: afinn_nlp.score(text))

data['word_count'] = data['body'].apply(lambda text: len(text.split(' ')))
data['char_count'] = data['body'].apply(lambda text: len(text))

def categorize(score):
    if score < 0:
        return 'UNPOPULAR'
    elif score >= 0 and score < 100:
        return 'UNREMARKABLE'
    elif score >= 100 and score < 500:
        return 'SOMEWHAT POPULAR'
    elif score >= 500 and score < 1000:
        return 'POPULAR'
    else:
        return 'VERY POPULAR'
data['score_cat'] = data['score'].apply(lambda score: categorize(score))

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words='english', sublinear_tf=True)
train, test = train_test_split(data, test_size = 0.2)

tr_scores, tr_comments, tr_meta = train['score_cat'], train['body'], train.drop(columns=['id', 'parent_id', 'top_level', 'created_utc', 'body', 'score', 'score_cat'])
tst_scores, tst_comments, tst_meta = test['score_cat'], test['body'], test.drop(columns=['id', 'parent_id', 'top_level', 'created_utc', 'body', 'score', 'score_cat'])

In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

features = data.drop(columns=['id', 'parent_id', 'top_level', 'created_utc', 'body', 'score', 'score_cat'])
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), list(features.select_dtypes(include=[np.number]))),
    ('cat', OneHotEncoder(), list(features.select_dtypes(include=[np.object]).columns))
])

In [6]:
x_tr_comments = tf.fit_transform(tr_comments)
x_tst_comments = tf.transform(tst_comments)

x_tr_meta = full_pipeline.fit_transform(tr_meta)
x_tst_meta = full_pipeline.transform(tst_meta)

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

dt_comm = DecisionTreeClassifier()
dt_comm.fit(x_tr_comments, tr_scores)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [8]:
comm_pred = dt_comm.predict(x_tst_comments)
print("Accuracy of the DT  based on TFIDF: ", accuracy_score(tst_scores, comm_pred))
print("Precision of the DT based on TFIDF: ", precision_score(tst_scores, comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the DT based on TFIDF: ", recall_score(tst_scores, comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the DT based on TFIDF: ", f1_score(tst_scores, comm_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the DT  based on TFIDF:  0.8374508702975856
Precision of the DT based on TFIDF:  0.8170226489246823
Recall of the DT based on TFIDF:  0.8374508702975856
F-Score of the DT based on TFIDF:  0.8268974241536987


In [9]:
dt_meta = DecisionTreeClassifier()
dt_meta.fit(x_tr_meta, tr_scores)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [10]:
meta_pred = dt_meta.predict(x_tst_meta)
print("Accuracy of the DT based on metadata: ", accuracy_score(tst_scores, meta_pred))
print("Precision of the DT based on metadata: ", precision_score(tst_scores, meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the DT based on metadata: ", recall_score(tst_scores, meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the DT based on metadata: ", f1_score(tst_scores, meta_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the DT based on metadata:  0.8090960134755755
Precision of the DT based on metadata:  0.8231030528743398
Recall of the DT based on metadata:  0.8090960134755755
F-Score of the DT based on metadata:  0.8159239255261288


#### Try using grid search on decision tree for optimizing hyperparams

In [11]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# choose params to try + optimize
param_grid = {
    'max_depth': [2, 4, 6, 8],
    'criterion': ['gini', 'entropy']
}
# classifier and grid search definition
dt_comm_opt = DecisionTreeClassifier()
dt_meta_opt = DecisionTreeClassifier()
grid_comm = GridSearchCV(dt_comm_opt, param_grid)
grid_meta = GridSearchCV(dt_meta_opt, param_grid)

In [12]:
# fit and search
grid_comm.fit(x_tr_comments, tr_scores)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 4, 6, 8]},
             pre_di

In [13]:
# fit and search
grid_meta.fit(x_tr_meta, tr_scores)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 4, 6, 8]},
             pre_di

In [14]:
# basic info
print('Results for comm')
print('Best max_depth:', grid_comm.best_estimator_.get_params()['max_depth'])
print('Best criterion:', grid_comm.best_estimator_.get_params()['criterion'])

# cross validation
results = cross_val_score(grid_comm, x_tr_comments, tr_scores)
print(results)

Results for comm
Best max_depth: 2
Best criterion: entropy




[0.89108008 0.89065132 0.89133029]


In [15]:
# basic info
print('Results for meta')
print('Best max_depth:', grid_meta.best_estimator_.get_params()['max_depth'])
print('Best criterion:', grid_meta.best_estimator_.get_params()['criterion'])

# cross validation
results = cross_val_score(grid_meta, x_tr_meta, tr_scores)
print(results)

Results for meta
Best max_depth: 6
Best criterion: entropy




[0.89115026 0.89142336 0.89111969]


### Adaboost

In [16]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

ada_comm = AdaBoostClassifier()
ada_comm.fit(x_tr_comments, tr_scores)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [22]:
ada_comm_pred = ada_comm.predict(x_tst_comments)
print("Accuracy of the Adaboost based on TFIDF: ", accuracy_score(tst_scores, ada_comm_pred))
print("Precision of the Adaboost based on TFIDF: ", precision_score(tst_scores, ada_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the Adaboost based on TFIDF: ", recall_score(tst_scores, ada_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the Adaboost based on TFIDF: ", f1_score(tst_scores, ada_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the Adaboost based on TFIDF:  0.8949092270260154
Precision of the Adaboost based on TFIDF:  0.8450851309941906
Recall of the Adaboost based on TFIDF:  0.8949092270260154
F-Score of the Adaboost based on TFIDF:  0.8503194314168256


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [18]:
ada_meta = AdaBoostClassifier()
ada_meta.fit(x_tr_meta, tr_scores)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [23]:
ada_meta_pred = ada_meta.predict(x_tst_meta)
print("Accuracy of the Adaboost based on metadata: ", accuracy_score(tst_scores, ada_meta_pred))
print("Precision of the Adaboost based on metadata: ", precision_score(tst_scores, ada_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the Adaboost based on metadata: ", recall_score(tst_scores, ada_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the Adaboost based on metadata: ", f1_score(tst_scores, ada_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the Adaboost based on metadata:  0.8938798427849522
Precision of the Adaboost based on metadata:  0.8331848986224639
Recall of the Adaboost based on metadata:  0.8938798427849522
F-Score of the Adaboost based on metadata:  0.8455779729173174


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### Gradient Boosted

In [21]:
gb_comm = GradientBoostingClassifier()
gb_comm.fit(x_tr_comments, tr_scores)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [24]:
gb_comm_pred = gb_comm.predict(x_tst_comments)
print("Accuracy of the GB based on TFIDF: ", accuracy_score(tst_scores, gb_comm_pred))
print("Precision of the GB based on TFIDF: ", precision_score(tst_scores, gb_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the GB based on TFIDF: ", recall_score(tst_scores, gb_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the GB based on TFIDF: ", f1_score(tst_scores, gb_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the GB based on TFIDF:  0.8886393411940857
Precision of the GB based on TFIDF:  0.8466334376343477
Recall of the GB based on TFIDF:  0.8886393411940857
F-Score of the GB based on TFIDF:  0.848487828733548


In [25]:
gb_meta = GradientBoostingClassifier()
gb_meta.fit(x_tr_meta, tr_scores)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [27]:
gb_meta_pred = gb_meta.predict(x_tst_meta)
print("Accuracy of the GB based on metadata: ", accuracy_score(tst_scores, gb_meta_pred))
print("Precision of the GB based on metadata: ", precision_score(tst_scores, gb_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the GB based on metadata: ", recall_score(tst_scores, gb_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the GB based on metadata: ", f1_score(tst_scores, gb_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the GB based on metadata:  0.8941605839416058
Precision of the GB based on metadata:  0.8459565842460338
Recall of the GB based on metadata:  0.8941605839416058
F-Score of the GB based on metadata:  0.8530055380449761


### Random Forest

In [28]:
rf_comm = RandomForestClassifier()
rf_comm.fit(x_tr_comments, tr_scores)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [29]:
rf_comm_pred = rf_comm.predict(x_tst_comments)
print("Accuracy of the RF based on TFIDF: ", accuracy_score(tst_scores, rf_comm_pred))
print("Precision of the RF based on TFIDF: ", precision_score(tst_scores, rf_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the RF based on TFIDF: ", recall_score(tst_scores, rf_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the RF based on TFIDF: ", f1_score(tst_scores, rf_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the RF based on TFIDF:  0.884334643458731
Precision of the RF based on TFIDF:  0.8266330055898239
Recall of the RF based on TFIDF:  0.884334643458731
F-Score of the RF based on TFIDF:  0.8477974730891263


In [30]:
rf_meta = RandomForestClassifier()
rf_meta.fit(x_tr_meta, tr_scores)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [31]:
rf_meta_pred = rf_meta.predict(x_tst_meta)
print("Accuracy of the RF based on metadata: ", accuracy_score(tst_scores, rf_meta_pred))
print("Precision of the RF based on metadata: ", precision_score(tst_scores, rf_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the RF based on metadata: ", recall_score(tst_scores, rf_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the RF based on metadata: ", f1_score(tst_scores, rf_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the RF based on metadata:  0.8803106868800299
Precision of the RF based on metadata:  0.8295873716391459
Recall of the RF based on metadata:  0.8803106868800299
F-Score of the RF based on metadata:  0.8488999087316298
