In [1]:
# Imports

# Essentials:

import pandas as pd
import nltk as nltk
import numpy as np
import html
import re
import math
import pickle
import joblib
import time
import pickle


# Sklearn:

import sklearn
from sklearn import *
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error


# Tensorflow:

import tensorflow as tf
import keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## NLP
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer,SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from textblob import TextBlob
import spacy
spacy.load('en_core_web_lg')
from spacy.lang.en import English

import gensim
from gensim import corpora
from gensim import models


from IPython.display import clear_output
import math



Using TensorFlow backend.


In [2]:
# Displaying column width to read comment text

pd.set_option(
    'display.max_colwidth', 0
)
np.random.seed(42)

### NLP

In [3]:
# Download packages for sentiment analysis
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/stepanboltalin/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
# Reading data (I'm using sample data from 2014-2017)

df = pd.read_csv('../sh_data.csv', nrows=10000)

In [5]:
# Data pre-process function

def preprocess_df(df):
    df = df[df['comment'].notna()]
    df['neg'], df['neu'], df['pos'], df["compound"], df["subjectivity"] = [np.nan, np.nan,np.nan,np.nan,np.nan]    
    return df


In [6]:
# Adding sentiment analysis (pos / neu / neg / compound)


def add_sentiment(df):
  idx = 0
  sid = SentimentIntensityAnalyzer()
  for i, row in df.iterrows():
      text = row["comment"]
      text = html.unescape(text)
    #   text =  re.sub('<[^<]+?>', '', text)
    #   # these are needed for 2020 data
    #   text = re.sub('href=" rel="nofollow">', '', text) 
    #   text = re.sub('>', '', text) 
      ## end of 2020 additions
      df.at[i,"comment"] = re.sub('[^A-Za-z0-9\s\.\,]+', '', text)
      ss = sid.polarity_scores(text)
      for k in ss:
          df.at[i,k] = ss[k]
      if idx%100000 == 0 and (idx != 0):
        print(f"saving: {idx}")
        df.to_csv("./df_sent.csv")
      idx = idx  + 1

def apply_subj(input):
    text = str(input)
    sentiment = TextBlob(text)
    return sentiment.sentiment.subjectivity      

In [7]:
# Pre-processing and analysing data

df = preprocess_df(df)
df = df[df['comment'].notna()]
add_sentiment(df)
df["subjectivity"] = df["comment"].apply(lambda x: apply_subj(x))


In [8]:
df.head()

Unnamed: 0,username,comment,story_title,user_id,neg,neu,pos,compound,subjectivity
0,bradstewart,"Nobody in the semiconductor industry will put up with anyone in the supply chain making a 600 margin.Large Nvidia chips might not be able to move quickly, but all of the MCU, WiFi, etc chips would migrate to cheaper fabs like SMIC. Which reduces TSMCs revenue, which reduces their ability to invest in new processes. Cents and even fractions of cents matter to those chips.",‘Better Yield on 5nm Than 7nm’: TSMC Update on Defect Rates for N5,24272132,0.0,0.892,0.108,0.7469,0.526515
1,nabla9,You cant just create new fab with money.,‘Better Yield on 5nm Than 7nm’: TSMC Update on Defect Rates for N5,24272131,0.168,0.555,0.277,0.2928,0.454545
2,emteycz,they actually consider themselves to be the legitimate government of ChinaThats not the case since 1991.,‘Better Yield on 5nm Than 7nm’: TSMC Update on Defect Rates for N5,24272130,0.0,1.0,0.0,0.0,0.1
3,dgellow,"Lets say competing app stores wont be used. Then why is it an issue for Apple to allow them, given that they know they would still be the default and have the advantage Apple is in this situation at where control the platform completely, and could have the default store. But somehow thats not enough for them to consider opening the app market a little bit.Using that reasoning, they are in the best position to actually open to competition.","Apple ordered to not block Epic’s Unreal Engine, Fortnite to stay off App Store",24272129,0.0,0.903,0.097,0.7964,0.44
4,James_Henry,There is a shortage of truly random easily sampled noise though.,Challenge to scientists: does your ten-year-old code still run?,24272128,0.14,0.49,0.371,0.5106,0.666667


## Saltiness scores

In [10]:

df["saltiness"], df["saltiness_words_c"]  = [np.nan, np.nan]
df["saltiness_words_c"] = df["saltiness_words_c"].astype(str)

# Grind comments into salt
def grind_salt(compound, subj):
    return (compound * (subj*subj))

df["saltiness"] = df.apply(lambda x: grind_salt(df.compound, df.subjectivity))    

# Users only dataframe
user = pd.DataFrame()
user["saltiness_c"], user["saltiness_u"], user['saltiness_words_u'] = [[[]] * user.shape[0], np.nan, np.nan]
user["saltiness_words_u"] = user.saltiness_words_u.astype(str)


user["username"] = df['username']

# collect saltiness scores for each user
salty_users = df.groupby('username')['saltiness'].apply(list)

# not using saltiness arrays anymore
user["saltiness_c"] = user.username.apply(lambda x: salty_users[x])

# Satiness of a user
user["saltiness_u"] = user.saltiness_c.apply(lambda x: np.array(x).mean())

# Dividing users into groups by saltiness
def get_salt_words(number):
    if number < -0.7:
        return "very salty"
    if -0.4 <= number > -0.7:
        return "salty"
    if 0 <= number > -0.4:
        return "critic"
    if 0.5 <= number:
        return "happy user"
    else:
        return "average user"

user["saltiness_words_u"] = user.saltiness_u.apply(lambda x: get_salt_words(x))

# don't need Saltiness commments anymre
user = user.drop('saltiness_c', axis=1)
user = user.drop_duplicates(subset=['username'], keep='last')
#user.to_csv("users.csv")

# salty trolls
salty_tolls = user[user['saltiness_words_u'] == 'very salty']

# who are they?
salty_tolls


df = pd.merge(df, user, on="username")

# Dividing users into groups by saltiness
def get_salt_words_for_comments(number):
    if number < -0.7:
        return "mad"
    if -0.4 <= number > -0.7:
        return "annoyed"
    if 0 <= number > -0.4:
        return "critical"
    if 0.5 <= number:
        return "happy comment"
    else:
        return "regular comment"

df["saltiness_words_c"] = df.saltiness.apply(lambda x: get_salt_words_for_comments(x))


In [11]:
df.head()

Unnamed: 0,username,comment,story_title,user_id,neg,neu,pos,compound,subjectivity,saltiness,saltiness_words_c,saltiness_u_x,saltiness_words_u_x,saltiness_u_y,saltiness_words_u_y
0,bradstewart,"Nobody in the semiconductor industry will put up with anyone in the supply chain making a 600 margin.Large Nvidia chips might not be able to move quickly, but all of the MCU, WiFi, etc chips would migrate to cheaper fabs like SMIC. Which reduces TSMCs revenue, which reduces their ability to invest in new processes. Cents and even fractions of cents matter to those chips.",‘Better Yield on 5nm Than 7nm’: TSMC Update on Defect Rates for N5,24272132,0.0,0.892,0.108,0.7469,0.526515,0.207054,annoyed,0.207054,salty,0.207054,salty
1,nabla9,You cant just create new fab with money.,‘Better Yield on 5nm Than 7nm’: TSMC Update on Defect Rates for N5,24272131,0.168,0.555,0.277,0.2928,0.454545,0.060496,annoyed,0.133499,salty,0.133499,salty
2,nabla9,"Large volume manufacturing deals are negotiated sometimes years before, TSMC had no idea how much demand there was or how badly Intel fails when AMD, NVDA and Apple bought the capacity.New GIGAFAB takes time to build.",‘Better Yield on 5nm Than 7nm’: TSMC Update on Defect Rates for N5,24272108,0.231,0.769,0.0,-0.8225,0.39881,-0.130818,annoyed,0.133499,salty,0.133499,salty
3,nabla9,I have done several experiments in Reddit. I post a link with a plausible title and credible looking 404 to credible domain with made up url.In a big subreddit you easily get 30 top level comments before first comment says that link does not work or is fake. Discussion goes on and on without pause. People treat the title as a writing prompt. One of the best writing prompts is pseudophilosophical technobro life or business advice. Anything with quantum physics in the title works too.,Please read the paper before you comment,24270199,0.061,0.843,0.096,0.4664,0.496667,0.115051,annoyed,0.133499,salty,0.133499,salty
4,nabla9,"Plain English should not be confused with low literacy level. Plain English is a style that is hard to master. Only 12 of U.S. adults reach the highest PIAAC literacy proficiency levels.unnecessary verbosity is just one way to express your position in the society. Wearing impractical clothes, complex etiquette, having pale skin and speaking in certain manner was something that requires either wealth or practise.Today business jargon combined with clothes and behaviour still work today as signalling your identity. So is wearing hoodie and saying bro constantly. People learn to feel comfortable in the uniform and in the language they identify with.",How to Write in Plain English,24269879,0.032,0.861,0.107,0.7999,0.426209,0.145305,annoyed,0.133499,salty,0.133499,salty


## Decision Tree

In [11]:
# Making a pipeline

pipeline = Pipeline(
    [
        ('vect', CountVectorizer(analyzer='word')),
        ('tfidf', TfidfTransformer()),    
    ]
)

In [12]:
# Preparing data for training

X_train, X_test = train_test_split(df)


In [13]:
# I choose comments with non zero compaund 

scored = X_train.loc[X_train['saltiness'] != 0]

In [14]:
X = pipeline.fit_transform(scored["comment"])

In [15]:
# Saving pipeline
joblib.dump(pipeline, 'sklearn_pipeline.pkl')

['sklearn_pipeline.pkl']

In [16]:
# Building a model

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X, scored["saltiness"])

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=42, splitter='best')

In [17]:
scored_test = X_test.loc[X_test['saltiness'] != 0]
X2 = pipeline.transform(scored_test["comment"])

print(scored_test["saltiness"].shape)
print(X2.shape)

(193,)
(193, 6264)


In [18]:
# Evaluating a model
predictions = tree_reg.predict(X2)

tree_mse = mean_squared_error(scored_test["saltiness"], predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.21609477027652224

In [19]:
# Saving a model

joblib.dump(tree_reg, 'tree_reg.pkl')

['tree_reg.pkl']

In [20]:
# Saving relevant pipeline

joblib.dump(pipeline, 'sklearn_pipeline_tree.pkl')

['sklearn_pipeline_tree.pkl']

In [21]:
# Making a prediction on random text

def predict_saltiness(text, model):
    xtemp = pipeline.transform([text])
    predictions = model.predict(xtemp)
    print(predictions)
    return predictions


In [22]:
# predict with 

preds = predict_saltiness("Impressive! Personally, I am a few thousand places behind, but still in the top 0.2%. How? I asked and answered a few hundred questions early on, years ago, when SO was new and interesting... Now those answers are old and, like most answers on SO, out of date. Usually when I google something technical and get seemingly the exact right question asked on SO, the answers are no longer correct. Software versions change. And yet I still get a steady trickle of votes, forever increasing the gulf between outdated and no-longer-participating people like me and anyone starting on SO today.", tree_reg)


get_salt_words_for_comments(preds[0])

[-0.2112649]


'annoyed'

## Keras 

In [23]:

# Building a neural network

EPOCHS = 10

def build_model():
  model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=[X.shape[1]]),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
  ])

  optimizer = tf.keras.optimizers.RMSprop(0.001)

  model.compile(loss='mse',
                optimizer=optimizer,
                metrics=['mae', 'mse'])
  return model

In [24]:
model = build_model()
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                400960    
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 405,185
Trainable params: 405,185
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Checkpoints to save the trained model

filepath = "model.hdf5"
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=filepath,
    save_weights_only=False,
    monitor="loss",
    verbose=1,
    mode="min",
    save_best_only=True)
history = model.fit(
  X, scored["saltiness"],
  epochs=EPOCHS, validation_split = 0.2, verbose=1, callbacks=[checkpoint_callback])


Train on 473 samples, validate on 119 samples
Epoch 1/10
Epoch 00001: loss improved from inf to 0.03350, saving model to model.hdf5
Epoch 2/10
Epoch 00002: loss improved from 0.03350 to 0.01983, saving model to model.hdf5
Epoch 3/10
Epoch 00003: loss improved from 0.01983 to 0.00934, saving model to model.hdf5
Epoch 4/10
Epoch 00004: loss improved from 0.00934 to 0.00495, saving model to model.hdf5
Epoch 5/10
Epoch 00005: loss improved from 0.00495 to 0.00353, saving model to model.hdf5
Epoch 6/10
Epoch 00006: loss improved from 0.00353 to 0.00259, saving model to model.hdf5
Epoch 7/10
Epoch 00007: loss improved from 0.00259 to 0.00215, saving model to model.hdf5
Epoch 8/10
Epoch 00008: loss improved from 0.00215 to 0.00195, saving model to model.hdf5
Epoch 9/10
Epoch 00009: loss improved from 0.00195 to 0.00140, saving model to model.hdf5
Epoch 10/10
Epoch 00010: loss improved from 0.00140 to 0.00133, saving model to model.hdf5


In [26]:
# predict with Keras

preds = predict_saltiness("Reality is that while China blocks Facebook, Google, etc and smartly props up their own clones, it’s “aghast” at the American protectionism and xenophobic behaviour. How dare the Americans block a Chinese app?! China is not a democracy. It’s not interested in fairness. China is playing the long game. Just like the wars of the past were fought with little toy armies of a few thousand knights and noblemen marching into each other’s countries until someone decided to conscript their whole nation into battle, the West is fighting allowing China to pilfer its technology, wreak the environment, and compete with state backed organisations. Wanna compete with Huawei? Good luck sending in your company noblemen, China is sending their whole nation behind it.", model)

get_salt_words_for_comments(preds[0])

[[0.00545485]]


'annoyed'

In [27]:
predictions = model.predict(X2)
keras_mse = mean_squared_error(scored_test["saltiness"], predictions)
keras_mse = np.sqrt(keras_mse)
keras_mse



0.1740777910270202

## Topic analysis

In [12]:


NUM_TOPICS = 20
en_stop = set(nltk.corpus.stopwords.words('english'))
parser = English()


def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens


def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens    



def add_tokens(df):
    sid = SentimentIntensityAnalyzer()
    for i, row in df.iterrows():
        text = row["comment"]
        tokens = prepare_text_for_lda(text)
        tok = list(tokens)
        df.at[i,"tokens"] = tokens    


df['tokens'] = [[]] * df.shape[0]
add_tokens(df)        

tokenslist = df["tokens"].tolist()
dictionary = corpora.Dictionary(tokenslist)
corpus = [dictionary.doc2bow(text) for text in tokenslist]

# pickle.dump(corpus, open('/content/drive/My Drive/Colab-Notebooks/Sentim/corpus.pkl', 'wb'))
# dictionary.save('/content/drive/My Drive/Colab-Notebooks/Sentim/dictionary.gensim')

commentslist = df.comment.to_list()
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
#ldamodel.save('/content/drive/My Drive/Colab-Notebooks/Sentim/model5.gensim')
topicslda = ldamodel.print_topics()



import warnings
warnings.filterwarnings("ignore")
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
corpus_lda = ldamodel[corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

topz = []
for doc, as_text in zip(corpus_lda, commentslist):
    srt = sorted(doc, key=lambda sublist: abs(sublist[1]))
    srt.reverse()
    topz.append([srt[:3],as_text])

# Filling in 1 topic
topz_single = []
for topss in topz:
  if (len(topss[0]) > 0): 
    topz_single.append(topss[0][0])  
  else:
    topz_single.append([0,0])  

idx = 0
df['topic_top'] = np.nan
for top in topz_single:
  df.at[idx, "topic_top"] = top[0]
  idx = idx+1    


#df.to_csv("/content/drive/My Drive/Colab-Notebooks/Sentim/filled_1M.csv")  




In [13]:
df["main_topic"] = np.nan
df["main_topic"] = df["main_topic"].astype(str)

In [14]:
def this_topic(num):
    stri = re.sub('[^a-zA-Z]+', ' ',ldamodel.print_topic(num)).strip().capitalize()
    return str(stri)


df["main_topic"] = df.topic_top.apply(lambda x: this_topic(int(x)))

In [15]:
df[df["saltiness"] < -0.5]["main_topic"].apply(lambda x: print(re.sub('[^a-zA-Z]+', ' ', x).strip()))


Marriage would different think someone could something point computer console
American shell black editor movie emacs china country white third
School student would company could price state still large years
Package relnofollow type would number doctor times reason support medical
Police system building intelligence problem human trust solve humans could
Research years military framework javascript hear cancer still cause browser
People think things really would point something person better every
Would system write using language software project really change build
Package relnofollow type would number doctor times reason support medical
Article question comment answer paper someone reading sentence author actually
Article question comment answer paper someone reading sentence author actually
Package relnofollow type would number doctor times reason support medical
People would company think money problem could right business going
Research years military framework javascript hear c

116     None
809     None
1081    None
1131    None
2074    None
2169    None
2170    None
2424    None
2508    None
2792    None
3087    None
3358    None
3681    None
3747    None
3907    None
3966    None
4334    None
4872    None
4967    None
5108    None
5118    None
5253    None
5358    None
5483    None
5593    None
6335    None
6670    None
7099    None
7594    None
8081    None
8510    None
8626    None
9014    None
Name: main_topic, dtype: object

In [16]:
df2 = df[df['saltiness'] < -0.3]

In [38]:
df2.apply(lambda x: print(f"Topic: {x['main_topic']} invokes {x['saltiness_words_c'].upper()} comments from {x['saltiness_words_u'].upper()} kind of people"), axis=1) 


Topic: Would store apple linux developer still application steam game source invokes ANNOYED comments from SALTY kind of people
Topic: People think kindle amazon money stole device privacy would book invokes ANNOYED comments from SALTY kind of people
Topic: People would apple years talking every something problem point death invokes ANNOYED comments from SALTY kind of people
Topic: Would different think sentence apple article something point write pushup invokes REGULAR COMMENT comments from SALTY kind of people
Topic: Would state people tax think chips enough florida china country invokes ANNOYED comments from SALTY kind of people
Topic: Would state people tax think chips enough florida china country invokes ANNOYED comments from SALTY kind of people
Topic: Apple user revenue company think would developer article choice people invokes ANNOYED comments from SALTY kind of people
Topic: People would business manager public feature interface amazon something user invokes ANNOYED comments 

51     None
76     None
103    None
287    None
288    None
359    None
381    None
391    None
463    None
500    None
512    None
607    None
615    None
646    None
909    None
922    None
957    None
960    None
975    None
976    None
dtype: object

In [17]:
df.to_csv("clean_10k_topics.csv")

In [34]:
get_ipython().system('pip install pyLDAvis')




In [35]:
import pyLDAvis.gensim

lda_disp = pyLDAvis.gensim.prepare(ldamodel,corpus,dictionary)
pyLDAvis.display(lda_disp)
