In [49]:
# Imports

# Essencials:

import pandas as pd
import nltk as nltk
import numpy as np
import html
import re
import math
import pickle
import joblib


# Sklearn:

import sklearn
from sklearn import *
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error


# Tensorflow

import tensorflow as tf
import keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer,SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize


from IPython.display import clear_output


In [29]:
# Displaying column width to read comment text

pd.set_option(
    'display.max_colwidth', 0
)
np.random.seed(42)

### NLP

In [30]:
# Download packages for sentiment analysis

nltk.download('subjectivity')
nltk.download('vader_lexicon')


[nltk_data] Downloading package subjectivity to
[nltk_data]     /Users/ekaterinaromanovskaya/nltk_data...
[nltk_data]   Package subjectivity is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ekaterinaromanovskaya/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [31]:
# Reading data

df = pd.read_csv('hacker_news_sample.csv',nrows=100000)

In [32]:
# Data pre-process function

def preprocess_df(df):
    df = df[df['text'].notna()]
    df['neg'], df['neu'], df['pos'], df["compound"], df["subjectivity"] = [np.nan, np.nan,np.nan,np.nan,np.nan]    
    return df


In [None]:
# Adding subjectivity column to our data

def add_sentiment_subj(df):
    sid = SentimentIntensityAnalyzer()
    for i, row in df.iterrows():
        text = row["text"]
        text = html.unescape(text)
        text =  re.sub('<[^<]+?>', '', text)
        df.at[i,"text"] = text
        ss = sid.polarity_scores(text)
        for k in ss:
            df.at[i,k] = ss[k]

In [34]:
# Pre-processing and analysing data

df = preprocess_df(df)
add_sentiment_subj(df)


In [35]:
# Preparing data for training

X_train, X_test = train_test_split(df)


In [36]:
# I choose comments with non zero compaund 

scored = X_train.loc[X_train['compound'] != 0]

In [None]:
# Making a pipeline

pipeline = Pipeline(
    [
        ('vect', CountVectorizer(analyzer='word')),
        ('tfidf', TfidfTransformer()),    
    ]
)

In [37]:
X = pipeline.fit_transform(scored["text"])

In [38]:
# Saving pipeline

joblib.dump(pipeline, 'sklearn_pipeline.pkl')

['sklearn_pipeline.pkl']

## Decision Tree

In [39]:
# Building a model

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X, scored["compound"])

DecisionTreeRegressor(random_state=42)

In [40]:
scored_test = X_test.loc[X_test['compound'] != 0]
X2 = pipeline.transform(scored_test["text"])

print(scored_test["compound"].shape)
print(X2.shape)

(17270,)
(17270, 76827)


In [41]:
# Evaluating a model

predictions = tree_reg.predict(X2)

tree_mse = mean_squared_error(scored_test["compound"], predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.6391272003934664

In [44]:
# Saving a model

joblib.dump(tree_reg, 'tree_reg.pkl')

['tree_reg.pkl']

In [45]:
# Saving relevant pipeline

joblib.dump(pipeline, 'sklearn_pipeline_tree.pkl')

['sklearn_pipeline_tree.pkl']

## Keras 

In [21]:

# Building a neural network

EPOCHS = 10

def build_model():
  model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=[X.shape[1]]),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
  ])

  optimizer = tf.keras.optimizers.RMSprop(0.001)

  model.compile(loss='mse',
                optimizer=optimizer,
                metrics=['mae', 'mse'])
  return model

In [22]:
model = build_model()
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                4887488   
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 4,891,713
Trainable params: 4,891,713
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Checkpoints to save the trained model

filepath = "keras/model.hdf5"
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=filepath,
    save_weights_only=False,
    monitor="loss",
    verbose=1,
    mode="min",
    save_best_only=True)
history = model.fit(
  X, scored["compound"],
  epochs=EPOCHS, validation_split = 0.2, verbose=1, callbacks=[checkpoint_callback])


Train on 41558 samples, validate on 10390 samples
Epoch 1/10
Epoch 00001: loss improved from inf to 0.01211, saving model to keras/model.hdf5
Epoch 2/10
Epoch 00002: loss improved from 0.01211 to 0.01083, saving model to keras/model.hdf5
Epoch 3/10
Epoch 00003: loss improved from 0.01083 to 0.00965, saving model to keras/model.hdf5
Epoch 4/10
Epoch 00004: loss improved from 0.00965 to 0.00855, saving model to keras/model.hdf5
Epoch 5/10
Epoch 00005: loss improved from 0.00855 to 0.00777, saving model to keras/model.hdf5
Epoch 6/10
Epoch 00006: loss improved from 0.00777 to 0.00704, saving model to keras/model.hdf5
Epoch 7/10
Epoch 00007: loss improved from 0.00704 to 0.00646, saving model to keras/model.hdf5
Epoch 8/10
Epoch 00008: loss improved from 0.00646 to 0.00610, saving model to keras/model.hdf5
Epoch 9/10
Epoch 00009: loss improved from 0.00610 to 0.00570, saving model to keras/model.hdf5
Epoch 10/10
Epoch 00010: loss improved from 0.00570 to 0.00524, saving model to keras/mode

In [None]:
# Making a prediction on random text

def predict_text(text, model = model):
    xtemp = pipeline.transform([text])
    predictions = model.predict(xtemp)
    print(predictions)
    return predictions



txt = "Reality is that while China blocks Facebook, Google, etc and smartly props up their own clones, it’s “aghast” at the American protectionism and xenophobic behaviour. How dare the Americans block a Chinese app?! China is not a democracy. It’s not interested in fairness. China is playing the long game. Just like the wars of the past were fought with little toy armies of a few thousand knights and noblemen marching into each other’s countries until someone decided to conscript their whole nation into battle, the West is fighting allowing China to pilfer its technology, wreak the environment, and compete with state backed organisations. Wanna compete with Huawei? Good luck sending in your company noblemen, China is sending their whole nation behind it."


preds = predict_text("Impressive! Personally, I am a few thousand places behind, but still in the top 0.2%. How? I asked and answered a few hundred questions early on, years ago, when SO was new and interesting... Now those answers are old and, like most answers on SO, out of date. Usually when I google something technical and get seemingly the exact right question asked on SO, the answers are no longer correct. Software versions change. And yet I still get a steady trickle of votes, forever increasing the gulf between outdated and no-longer-participating people like me and anyone starting on SO today.")


[[0.6620629]]


In [None]:
predictions = model.predict(X2)




In [None]:
tree_mse = mean_squared_error(scored_test["compound"], predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.39776707368686554

In [58]:
import os

os.getcwd()

'/Users/ekaterinaromanovskaya/Documents/LS/HN/DS/ML'

In [60]:
pd.read_csv('analysis/users_only_clean.csv')

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,username,saltiness_u,saltiness_words
0,11,11,yehaaa,0.000000,critic
1,65,65,ptrklly,0.095800,average user
2,68,68,crpatino,0.136356,average user
3,80,80,eigenrick,0.162533,average user
4,97,97,55873445216111,0.003902,average user
...,...,...,...,...,...
48846,498293,498293,arprocter,0.024166,average user
48847,498294,498294,simonh,0.063995,average user
48848,498295,498295,drewbug,0.067903,average user
48849,498296,498296,Causality1,0.012475,average user


In [61]:
users = pd.read_csv('analysis/users_only_clean.csv')

In [63]:
users = users.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)

In [64]:
users.head(10)

Unnamed: 0,username,saltiness_u,saltiness_words
0,yehaaa,0.0,critic
1,ptrklly,0.0958,average user
2,crpatino,0.136356,average user
3,eigenrick,0.162533,average user
4,55873445216111,0.003902,average user
5,IsabellaDavey,0.295596,average user
6,nidhaloff,0.192519,average user
7,alexxozo,0.0,critic
8,benbojangles,0.117409,average user
9,Wh1zz,0.0,critic


In [69]:
users = users.rename(columns={'saltiness_u': 'saltiness_score'})

In [70]:
users.head()

Unnamed: 0,username,saltiness_score,saltiness_words
0,yehaaa,0.0,critic
1,ptrklly,0.0958,average user
2,crpatino,0.136356,average user
3,eigenrick,0.162533,average user
4,55873445216111,0.003902,average user


In [72]:
users.saltiness_words.value_counts()

average user    34473
critic          13761
happy user      460  
salty           146  
very salty      11   
Name: saltiness_words, dtype: int64

In [75]:
salty_trolls = users[users['saltiness_words'] == 'very salty']

In [76]:
salty_trolls

Unnamed: 0,username,saltiness_score,saltiness_words
4417,miller_joe,-0.8225,very salty
7025,hn2017,-0.8126,very salty
7064,senand,-0.7402,very salty
8125,ProbablyRyaan,-0.743,very salty
19926,J4M4I5M7,-0.8979,very salty
23467,cheeselip420,-0.7783,very salty
28841,dezmou,-0.7405,very salty
28855,turbine29,-0.7184,very salty
37165,theLastVoice,-0.75492,very salty
37470,siddlv,-0.7003,very salty


In [77]:
import seaborn as sns

In [84]:
users.shape


(48851, 3)