In [1]:
# Import some common packages
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load data
data = pd.read_json('../data/relationship_advice.json')
data['distinguished'] = data['distinguished'].fillna(value='none')

In [3]:
# Clean data and generate features
import afinn

afinn_nlp = afinn.Afinn(language='en', emoticons=True)
data['sentiment'] = data['body'].apply(lambda text: afinn_nlp.score(text))

data['word_count'] = data['body'].apply(lambda text: len(text.split(' ')))
data['char_count'] = data['body'].apply(lambda text: len(text))

def categorize(score):
    if score < 0:
        return 'UNPOPULAR'
    elif score >= 0 and score < 100:
        return 'UNREMARKABLE'
    elif score >= 100 and score < 500:
        return 'SOMEWHAT POPULAR'
    elif score >= 500 and score < 1000:
        return 'POPULAR'
    else:
        return 'VERY POPULAR'
data['score_cat'] = data['score'].apply(lambda score: categorize(score))

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words='english', sublinear_tf=True)
train_set, test_set = train_test_split(data, test_size = 0.2)

y_train, X_train = train_set['score_cat'], train_set['body']
y_test, X_test = test_set['score_cat'], test_set['body']

X_train = tf.fit_transform(X_train)
X_test = tf.transform(X_test)

In [5]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_predict = dt.predict(X_test)

print("Accuracy of the DT: ", accuracy_score(y_test, y_predict))
print("Precision of the DT: ", precision_score(y_test, y_predict, labels=list(set(data['score_cat'])), average='weighted'))
print("Recall of the DT: ", recall_score(y_test, y_predict, labels=list(set(data['score_cat'])), average='weighted'))
print("F-Score of the DT: ", f1_score(y_test, y_predict, labels=list(set(data['score_cat'])), average='weighted'))

Accuracy of the DT:  0.8320232079356167
Precision of the DT:  0.8092472812473345
Recall of the DT:  0.8320232079356167
F-Score of the DT:  0.8201943139436558


In [7]:
data.head()

Unnamed: 0,id,parent_id,top_level,depth,created_utc,time_dff,body,score,gilds,distinguished,sentiment,word_count,char_count,score_cat
0,fmr1a1f,t3_fwog3j,True,0,1586311571,31660,Thread's not getting removed; they worked with...,1,0,moderator,2.0,13,77,UNREMARKABLE
1,fmpody3,t3_fwog3j,True,0,1586284435,4524,Just imagine being your sons boyfriend and to ...,11749,1,none,10.0,60,306,VERY POPULAR
2,fmpfvc6,t3_fwog3j,True,0,1586280101,190,I’m so happy it worked out for you. I’ve been ...,3530,0,none,12.0,34,172,VERY POPULAR
3,fmpk2gj,t3_fwog3j,True,0,1586282236,2325,>Sorry if that isn't all as exciting and groun...,17208,4,none,4.0,25,136,VERY POPULAR
4,fmpgk5l,t3_fwog3j,True,0,1586280449,538,I'm happy to hear that this worked out so well...,2032,0,none,6.0,60,318,VERY POPULAR


In [8]:
data.drop(columns=['id', 'parent_id', 'top_level', 'created_utc', 'body', 'score'], inplace=True)
data.head()

Unnamed: 0,depth,time_dff,gilds,distinguished,sentiment,word_count,char_count,score_cat
0,0,31660,0,moderator,2.0,13,77,UNREMARKABLE
1,0,4524,1,none,10.0,60,306,VERY POPULAR
2,0,190,0,none,12.0,34,172,VERY POPULAR
3,0,2325,4,none,4.0,25,136,VERY POPULAR
4,0,538,0,none,6.0,60,318,VERY POPULAR


In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

train_set, test_set = train_test_split(data, test_size = 0.2)

y_train, X_train = train_set['score_cat'], train_set.drop(columns=['score_cat'])
y_test, X_test = test_set['score_cat'], test_set.drop(columns=['score_cat'])

In [10]:
data.drop(columns=['score_cat'], inplace=True)
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), list(data.select_dtypes(include=[np.number]))),
    ('cat', OneHotEncoder(), list(data.select_dtypes(include=[np.object]).columns))
])

In [11]:
X_train = full_pipeline.fit_transform(X_train)

In [12]:
X_test = full_pipeline.transform(X_test)

In [13]:
alt = DecisionTreeClassifier()
alt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [14]:
y_predict = alt.predict(X_test)

print("Accuracy of the DT: ", accuracy_score(y_test, y_predict))
print("Precision of the DT: ", precision_score(y_test, y_predict, labels=list(set(y_test)), average='weighted'))
print("Recall of the DT: ", recall_score(y_test, y_predict, labels=list(set(y_test)), average='weighted'))
print("F-Score of the DT: ", f1_score(y_test, y_predict, labels=list(set(y_test)), average='weighted'))

Accuracy of the DT:  0.808347370391166
Precision of the DT:  0.8113593491427431
Recall of the DT:  0.808347370391166
F-Score of the DT:  0.8098371205013164
