In [1]:
# Import some common packages
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load data
data = pd.read_json('../data/relationship_advice.json')
data['distinguished'] = data['distinguished'].fillna(value='none')

In [3]:
# Functions for categorization criteria (see `playground.ipynb` for detailed explanation)

# compute boundaries
def compute_bounds(data):
    res  = dict()
    P_90 = np.percentile(data,90)
    # excluding forall d <= 0
    log_data = np.log([d for d in data if d > P_90])
    res['INSIGNIFICANT'] = np.exp(np.percentile(log_data, 0)) # equivalent to np.percentile(data, 90)
    res['NOTABLE']       = np.exp(np.percentile(log_data, 60))
    res['SIGNIFICANT']   = np.exp(np.percentile(log_data, 90))
    res['POPULAR']       = np.exp(np.percentile(log_data, 99.0))
    # implicit 'VERY POPULAR'
    return res
    
# autocategorization
def categorize(d, bounds):
    # try all categories
    for cat, val in bounds.items():
        if d < val:
            return cat
    # if at this point, VERY POPULAR
    return 'VERY POPULAR'

In [4]:
# Clean data and generate features
import afinn

afinn_nlp = afinn.Afinn(language='en', emoticons=True)
data['sentiment'] = data['body'].apply(lambda text: afinn_nlp.score(text))

data['word_count'] = data['body'].apply(lambda text: len(text.split(' ')))
data['char_count'] = data['body'].apply(lambda text: len(text))

# categorize using defined boundaries (lambda now accepts bounds param, which is a dict)
bounds = compute_bounds(data['score'])
data['score_cat'] = data['score'].apply(lambda score: categorize(score, bounds))


### DEPRECATED
# def categorize(score):
#     if score < 0:
#         return 'UNPOPULAR'
#     elif score >= 0 and score < 100:
#         return 'UNREMARKABLE'
#     elif score >= 100 and score < 500:
#         return 'SOMEWHAT POPULAR'
#     elif score >= 500 and score < 1000:
#         return 'POPULAR'
#     else:
#         return 'VERY POPULAR'
# data['score_cat'] = data['score'].apply(lambda score: categorize(score))

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words='english', sublinear_tf=True)
train, test = train_test_split(data, test_size = 0.2)

tr_scores, tr_comments, tr_meta = train['score_cat'], train['body'], train.drop(columns=['id', 'parent_id', 'top_level', 'created_utc', 'body', 'score', 'score_cat'])
tst_scores, tst_comments, tst_meta = test['score_cat'], test['body'], test.drop(columns=['id', 'parent_id', 'top_level', 'created_utc', 'body', 'score', 'score_cat'])

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

features = data.drop(columns=['id', 'parent_id', 'top_level', 'created_utc', 'body', 'score', 'score_cat'])
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), list(features.select_dtypes(include=[np.number]))),
    ('cat', OneHotEncoder(), list(features.select_dtypes(include=[np.object]).columns))
])

In [7]:
x_tr_comments = tf.fit_transform(tr_comments)
x_tst_comments = tf.transform(tst_comments)

x_tr_meta = full_pipeline.fit_transform(tr_meta)
x_tst_meta = full_pipeline.transform(tst_meta)

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

dt_comm = DecisionTreeClassifier(criterion='entropy')
dt_comm.fit(x_tr_comments, tr_scores)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [9]:
comm_pred = dt_comm.predict(x_tst_comments)
print("Accuracy of the DT  based on TFIDF: ", accuracy_score(tst_scores, comm_pred))
print("Precision of the DT based on TFIDF: ", precision_score(tst_scores, comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the DT based on TFIDF: ", recall_score(tst_scores, comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the DT based on TFIDF: ", f1_score(tst_scores, comm_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the DT  based on TFIDF:  0.8471832303949093
Precision of the DT based on TFIDF:  0.8236240923016687
Recall of the DT based on TFIDF:  0.8471832303949093
F-Score of the DT based on TFIDF:  0.8350640828659


In [10]:
dt_meta = DecisionTreeClassifier()
dt_meta.fit(x_tr_meta, tr_scores)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [11]:
meta_pred = dt_meta.predict(x_tst_meta)
print("Accuracy of the DT based on metadata: ", accuracy_score(tst_scores, meta_pred))
print("Precision of the DT based on TFIDF: ", precision_score(tst_scores, meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the DT based on TFIDF: ", recall_score(tst_scores, meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the DT based on TFIDF: ", f1_score(tst_scores, meta_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the DT based on metadata:  0.8323039490922702
Precision of the DT based on TFIDF:  0.8399249090218797
Recall of the DT based on TFIDF:  0.8323039490922702
F-Score of the DT based on TFIDF:  0.8360634007792708


#### Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

rf_comm = RandomForestClassifier(criterion='entropy')
rf_comm.fit(x_tr_comments, tr_scores)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [15]:
rf_comm_pred = rf_comm.predict(x_tst_comments)
print("Accuracy of the RF based on TFIDF: ", accuracy_score(tst_scores, rf_comm_pred))
print("Precision of the RF based on TFIDF: ", precision_score(tst_scores, rf_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the RF based on TFIDF: ", recall_score(tst_scores, rf_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the RF based on TFIDF: ", f1_score(tst_scores, rf_comm_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the RF based on TFIDF:  0.9011791128579449
Precision of the RF based on TFIDF:  0.8248221972674527
Recall of the RF based on TFIDF:  0.9011791128579449
F-Score of the RF based on TFIDF:  0.8554432099835828


In [16]:
rf_meta = RandomForestClassifier()
rf_meta.fit(x_tr_meta, tr_scores)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [17]:
rf_meta_pred = rf_meta.predict(x_tst_meta)
print("Accuracy of the RF based on metadata: ", accuracy_score(tst_scores, rf_meta_pred))
print("Precision of the RF based on metadata: ", precision_score(tst_scores, rf_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the RF based on metadata: ", recall_score(tst_scores, rf_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the RF based on metadata: ", f1_score(tst_scores, rf_meta_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the RF based on metadata:  0.9000561482313307
Precision of the RF based on metadata:  0.846429791137966
Recall of the RF based on metadata:  0.9000561482313307
F-Score of the RF based on metadata:  0.8608338364312814
