In [1]:
# Import some common packages
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
# Load data
data = pd.read_json('../data/relationship_advice.json')
data['distinguished'] = data['distinguished'].fillna(value='none')

In [5]:
# Clean data and generate features
import afinn

afinn_nlp = afinn.Afinn(language='en', emoticons=True)
data['sentiment'] = data['body'].apply(lambda text: afinn_nlp.score(text))

data['word_count'] = data['body'].apply(lambda text: len(text.split(' ')))
data['char_count'] = data['body'].apply(lambda text: len(text))

def categorize(score):
    if score < 0:
        return 'UNPOPULAR'
    elif score >= 0 and score < 100:
        return 'UNREMARKABLE'
    elif score >= 100 and score < 500:
        return 'SOMEWHAT POPULAR'
    elif score >= 500 and score < 1000:
        return 'POPULAR'
    else:
        return 'VERY POPULAR'
data['score_cat'] = data['score'].apply(lambda score: categorize(score))

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words='english', sublinear_tf=True)
train_set, test_set = train_test_split(data, test_size = 0.8)

y_train, X_train = train_set['score_cat'], train_set['body']
y_test, X_test = test_set['score_cat'], test_set['body']

X_train = tf.fit_transform(X_train)
X_test = tf.transform(X_test)

In [40]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [41]:
dt.score(X_test, y_test)

0.8253749151907914

In [None]:
# Train/Test Split 
# Utilize Tfidf Vectorizer for comment text
# Vectorize ohter features seperately
# Classify using data solely from tfidf -> Random Forest
# Classify using metadata only -> Random Forest
# Use classifier output from tfidf as a feature for metadata classifier -> Random Forest