In [1]:
# Import some common packages
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load data
data = pd.read_json('../data/relationship_advice.json')
data['distinguished'] = data['distinguished'].fillna(value='none')

In [3]:
# Clean data and generate features
import afinn

afinn_nlp = afinn.Afinn(language='en', emoticons=True)
data['sentiment'] = data['body'].apply(lambda text: afinn_nlp.score(text))

data['word_count'] = data['body'].apply(lambda text: len(text.split(' ')))
data['char_count'] = data['body'].apply(lambda text: len(text))

def categorize(score):
    if score < 0:
        return 'UNPOPULAR'
    elif score >= 0 and score < 100:
        return 'UNREMARKABLE'
    elif score >= 100 and score < 500:
        return 'SOMEWHAT POPULAR'
    elif score >= 500 and score < 1000:
        return 'POPULAR'
    else:
        return 'VERY POPULAR'
data['score_cat'] = data['score'].apply(lambda score: categorize(score))

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words='english', sublinear_tf=True)
train, test = train_test_split(data, test_size = 0.2)

tr_scores, tr_comments, tr_meta = train['score_cat'], train['body'], train.drop(columns=['id', 'parent_id', 'top_level', 'created_utc', 'body', 'score', 'score_cat'])
tst_scores, tst_comments, tst_meta = test['score_cat'], test['body'], test.drop(columns=['id', 'parent_id', 'top_level', 'created_utc', 'body', 'score', 'score_cat'])

In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

features = data.drop(columns=['id', 'parent_id', 'top_level', 'created_utc', 'body', 'score', 'score_cat'])
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), list(features.select_dtypes(include=[np.number]))),
    ('cat', OneHotEncoder(), list(features.select_dtypes(include=[np.object]).columns))
])

In [6]:
x_tr_comments = tf.fit_transform(tr_comments)
x_tst_comments = tf.transform(tst_comments)

x_tr_meta = full_pipeline.fit_transform(tr_meta)
x_tst_meta = full_pipeline.transform(tst_meta)

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

dt_comm = DecisionTreeClassifier()
dt_comm.fit(x_tr_comments, tr_scores)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [8]:
comm_pred = dt_comm.predict(x_tst_comments)
print("Accuracy of the DT  based on TFIDF: ", accuracy_score(tst_scores, comm_pred))
print("Precision of the DT based on TFIDF: ", precision_score(tst_scores, comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the DT based on TFIDF: ", recall_score(tst_scores, comm_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the DT based on TFIDF: ", f1_score(tst_scores, comm_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the DT  based on TFIDF:  0.8303387609956953
Precision of the DT based on TFIDF:  0.8068945561605242
Recall of the DT based on TFIDF:  0.8303387609956953
F-Score of the DT based on TFIDF:  0.8182049231200411


In [9]:
dt_meta = DecisionTreeClassifier()
dt_meta.fit(x_tr_meta, tr_scores)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [10]:
meta_pred = dt_meta.predict(x_tst_meta)
print("Accuracy of the DT based on metadata: ", accuracy_score(tst_scores, meta_pred))
print("Precision of the DT based on metadata: ", precision_score(tst_scores, meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the DT based on metadata: ", recall_score(tst_scores, meta_pred, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the DT based on metadata: ", f1_score(tst_scores, meta_pred, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the DT based on metadata:  0.8087216919333707
Precision of the DT based on metadata:  0.8122483659080996
Recall of the DT based on metadata:  0.8087216919333707
F-Score of the DT based on metadata:  0.8104699077816404
