In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle as pkl
from collections import Counter, defaultdict
from itertools import combinations

import numpy as np
import scipy as sp
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import linalg
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score
import matplotlib.pyplot as plt

from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
def classifyandscore(X, y):
    clf = MLPClassifier(alpha=1, max_iter=1000)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=i)
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print('Precision: %.3f' % precision_score(y_test, y_pred)),
    print('Recall: %.3f' % recall_score(y_test, y_pred)),
    print('Accuracy: %.3f' % accuracy_score(y_test, y_pred)),
    print('F1 Score: %.3f' % f1_score(y_test, y_pred)),
    print('BAC: %.3f' % balanced_accuracy_score(y_test, y_pred))
    return clf

In [3]:
# get vector matrix for given set of ids
vector_size = 1024
def getVectorMat(ids):
    mat = np.zeros((len(ids), vector_size))
    for i in range(len(ids)):
        mat[i, :] = id2embed[ids[i]]
        
    print(mat.shape)
    return mat

In [4]:
# partition input output into labeled unlabeled data
def getLabeledUnlabeled(labeled_ids, full_ids):
    unlabeled_ids = list(set(full_ids).difference(set(labeled_ids)))
    print("total unlabeled data: ", len(unlabeled_ids))
    return unlabeled_ids

In [5]:
import numpy as np
import scipy.stats


__author__ = "Robert Munro"
__license__ = "MIT"
__version__ = "1.0.1"


def margin_confidence(prob_dist, sorted=False):
	""" Margin of Confidence Uncertainty Sampling
	Returns the uncertainty score of a probability distribution using
	margin of confidence sampling in a 0-1 range where 1 is the most uncertain
	
	Assumes probability distribution is a numpy 1d array like: 
		[0.0321, 0.6439, 0.0871, 0.2369]
		
	Keyword arguments:
		prob_dist -- a numpy array of real numbers between 0 and 1 that total to 1.0
		sorted -- if the probability distribution is pre-sorted from largest to smallest
	"""
	if not sorted:
		prob_dist[::-1].sort() # sort probs so that largest is at prob_dist[0]		
		
	difference = (prob_dist[0] - prob_dist[1])
	margin_conf = 1 - difference 
	
	return margin_conf
	

def ratio_confidence(prob_dist, sorted=False):
	"""Ratio of Confidence Uncertainty Sampling 
 
	Returns the uncertainty score of a probability distribution using
	ratio of confidence sampling in a 0-1 range where 1 is the most uncertain
	
	Assumes probability distribution is a numpy 1d array like: 
		[0.0321, 0.6439, 0.0871, 0.2369]
		
	Keyword arguments:
		prob_dist -- a numpy array of real numbers between 0 and 1 that total to 1.0
		sorted -- if the probability distribution is pre-sorted from largest to smallest
	"""
	if not sorted:
		prob_dist[::-1].sort() # sort probs so that largest is at prob_dist[0]		
		
	ratio_conf = prob_dist[1] / prob_dist[0]
	
	return ratio_conf
	
	


def least_confidence(prob_dist, sorted=False):
	""" Least Confidence Uncertainty Sampling 
	Returns the uncertainty score of a probability distribution using
	least confidence sampling in a 0-1 range where 1 is the most uncertain
	
	Assumes probability distribution is a numpy 1d array like: 
		[0.0321, 0.6439, 0.0871, 0.2369]
		
	Keyword arguments:
		prob_dist -- a numpy array of real numbers between 0 and 1 that total to 1.0
		sorted -- if the probability distribution is pre-sorted from largest to smallest
	"""
	if sorted:
		simple_least_conf = prob_dist[0] # most confident prediction
	else:
		simple_least_conf = np.nanmax(prob_dist) # most confident prediction, ignoring NaNs
				
	num_labels = float(prob_dist.size) # number of labels
	
	normalized_least_conf = (1 - simple_least_conf) * (num_labels / (num_labels -1))
	
	return normalized_least_conf



def entropy_score(prob_dist):
	""" Entropy-Based Uncertainty Sampling 
	Returns the uncertainty score of a probability distribution using
	entropy score
	
	Assumes probability distribution is a numpy 1d array like: 
		[0.0321, 0.6439, 0.0871, 0.2369]
		
	Keyword arguments:
		prob_dist -- a numpy array of real numbers between 0 and 1 that total to 1.0
		sorted -- if the probability distribution is pre-sorted from largest to smallest
	"""
	log_probs = prob_dist * np.log2(prob_dist) # multiply each probability by its base 2 log
	raw_entropy = 0-np.sum(log_probs)

	normalized_entropy = raw_entropy / np.log2(prob_dist.size)
	
	return normalized_entropy

In [6]:
with open("../big_data/full_data_bert_embeddings.pkl", "rb") as pfile:
    embedding = pkl.load(pfile)
    
with open("../big_data/full_data_bert_idlist.pkl", "rb") as ifile:
    idlist = pkl.load(ifile)
    
data = pd.read_csv("../big_data/full_data.csv", header=0)
id2text = data.set_index('id').to_dict()['title']

In [7]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,author,subreddit,author_flair_text,title,id,parent_id,link_id,created_utc,score
0,0,0,AndrewIsMyDog,QAnonCasualties,,"I was trying to look that up, thanks. I wasn...",houed9o,t1_houdwfg,t3_ri2h30,1639696241,1
1,1,1,OrsonPresence,QAnonCasualties,,Here’s a link to the PDF version https://www.i...,houe3w9,t3_ri2cfi,t3_ri2cfi,1639696129,1
2,2,2,Dralix001,QAnonCasualties,,"&gt;QAnon, Flat Earther (and obviously NASA di...",houdwfg,t3_ri2h30,t3_ri2h30,1639696040,1
3,3,3,NothingAndNow111,QAnonCasualties,,"Hey, that's great news! I hope he sticks out out.",houdhkq,t3_rglj3x,t3_rglj3x,1639695864,1
4,4,4,NothingAndNow111,QAnonCasualties,,"I'd be really wary, but... Hopefully this is t...",houd55q,t3_rh8j8y,t3_rh8j8y,1639695715,1


In [8]:
data.subreddit.unique()

array(['QAnonCasualties', 'AAQANON', 'ReQovery'], dtype=object)

In [9]:
data = data.loc[data['subreddit']=='ReQovery']

In [10]:
id2embed = defaultdict()

for i in range(len(idlist)):
    id2embed[idlist[i]] = embedding[i, :]

In [11]:
req_ids = data['id'].tolist()
req_embed = getVectorMat(req_ids)

(6264, 1024)


In [30]:
#remerge labeled data
newly_data = data.loc[data['id'].isin(labeled_ids)]
newly_data['bin_label'] = newly_data['id'].apply(lambda x: id2label[x])
len(newly_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newly_data['bin_label'] = newly_data['id'].apply(lambda x: id2label[x])


475

In [50]:
#newly_data.sample(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,author,subreddit,author_flair_text,title,id,parent_id,link_id,created_utc,score,bin_label
536801,536801,536801,heresyourtoll_troll,ReQovery,,I know this doesn’t really answer your questio...,gwq1o8x,t3_n38j8t,t3_n38j8t,1619998357,1,0
557051,17343,17343,Careless-Slice-6424,ReQovery,,Senior Thesis Help \n\nHello! I am looking fo...,mmy66b,mmy66b,mmy66b,1617906544,1,0
557038,17330,17330,Legitimate-Local-307,ReQovery,,Cross Post From Casualties,mynf19,mynf19,mynf19,1619403553,1,0
557169,17461,17461,Mr-internet,ReQovery,,Prochaska and Diclemente's stages of change mo...,kukghp,kukghp,kukghp,1610305505,1,0
557069,17361,17361,Truthout33,ReQovery,,The attack on the US Capitol was a display of ...,m4tqsg,m4tqsg,m4tqsg,1615722036,1,0


In [31]:
newly_data.to_csv("../lite_data/first_complete_annotated_data.csv")

In [32]:
labeled_data = pd.read_csv("../lite_data/first_complete_annotated_data.csv", header=0)
labeled_data = labeled_data.drop_duplicates(subset=['id'])
print(len(labeled_data))
#labeled_data['bin_label'] = labeled_data['label'].apply(lambda x: 1 if x=='recovery' else 0)
id2label  = labeled_data.set_index('id').to_dict()['bin_label']
labeled_ids = labeled_data['id'].tolist()

475


In [33]:
labeled_data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,author,subreddit,author_flair_text,title,id,parent_id,link_id,created_utc,score,bin_label
0,533861,533861,533861,andooet,ReQovery,,"Yes, but never giving blank slates to perceive...",hoi7agm,t1_hoi2x9p,t3_rfrmyn,1639486339,1,0
1,533866,533866,533866,Banana_Cake1,ReQovery,,Conspiratorial thinking is very addictive easy...,hohfbz0,t3_rfrmyn,t3_rfrmyn,1639465044,1,0
2,533880,533880,533880,JoeCormier,ReQovery,,"All the conspiracy theory stuff aside, how is ...",hofvxdw,t3_rfrmyn,t3_rfrmyn,1639437116,1,0
3,533882,533882,533882,ScarletSpire,ReQovery,,I'm a Jewish person and I just want to say tha...,hofukf7,t3_rfrmyn,t3_rfrmyn,1639436534,1,0
4,533904,533904,533904,FoxOnTheRun33050,ReQovery,,"I have my degree in psychology, and even I hav...",ho8qqqt,t3_re8ymy,t3_re8ymy,1639314862,1,0


## data insertion delection

In [34]:
print(len(idlist), len(set(idlist)))

557270 557270


In [35]:
labeled_vector_mat = getVectorMat(labeled_ids)

(475, 1024)


## train on labeled model

In [36]:
X = labeled_vector_mat
Y = [id2label[l] for l in labeled_ids]

In [37]:
trained_classifier = classifyandscore(X, Y)

Precision: 0.000
Recall: 0.000
Accuracy: 0.888
F1 Score: 0.000
BAC: 0.492


## get uncertainty scores for unlabeled data and top 50 to label

In [19]:
# get unlabeled mat
unlabeled_ids = getLabeledUnlabeled(labeled_ids, req_ids)
unlabeled_mat = getVectorMat(unlabeled_ids)
# unlabeled_ids = getLabeledUnlabeled(labeled_ids)
# unlabeled_mat = getVectorMat(unlabeled_ids)
print(unlabeled_mat.shape)

total unlabeled data:  5839
(5839, 1024)
(5839, 1024)


In [20]:
unlabed_probmat = trained_classifier.predict_proba(unlabeled_mat)

In [21]:
unlabed_probmat.shape

(5839, 2)

In [22]:
#get_entropy_scores
entropies = []
for i in range(unlabed_probmat.shape[0]):
    entropies.append(entropy_score(unlabed_probmat[i,:]))

In [23]:
identropy = pd.DataFrame()
identropy['id'] = unlabeled_ids
identropy['entropy'] = entropies

In [24]:
#to_label = identropy.sort_values(by=['entropy'], ascending=False).head(50)['id'].tolist()

## create labeler

In [25]:
to_label = identropy.sort_values(by=['entropy'], ascending=False).head(50)
to_label['title'] = to_label['id'].apply(lambda x: id2text[x])

In [26]:
import pandas as pd
import pigeonXT as pixt

annotations = pixt.annotate(to_label, example_column='title',
    options=['recovery', 'irrelevant', 'support', 'qadjacent', 'other_revovery']
)

HTML(value='0 of 50 Examples annotated, Current Position: 0 ')

VBox(children=(HBox(children=(Button(description='recovery', style=ButtonStyle()), Button(description='irrelev…

Output()

Annotation done.


In [27]:
annotations

Unnamed: 0,id,entropy,title,changed,label
12,gzumipt,0.999995,If you are interested check out Telltale on Yo...,True,irrelevant
1691,gjg7gep,0.999984,I just had a terrible thought: my 89 year old ...,True,qadjacent
1315,gl51hpb,0.999963,"You know, it's not so black and white. Part of...",True,irrelevant
5710,hjs18xm,0.999894,Member of The Satanic Temple here. As a Satani...,True,irrelevant
204,h75f8at,0.999797,Did you actually read the rules? Because users...,True,irrelevant
2428,gkzuyli,0.999746,"My journey towards Q was a long one, and it's ...",True,recovery
2808,hnrmgfw,0.999663,I discovered I had this problem with my husban...,True,irrelevant
1024,g1jngn1,0.999658,Thank you for this. One problem I’m running i...,True,irrelevant
1754,hofvxdw,0.999608,"All the conspiracy theory stuff aside, how is ...",True,irrelevant
3563,ginjdp5,0.99944,&gt;Anger feels like power. It's not. Anger is...,True,irrelevant


In [28]:
annotations['bin_label'] = annotations['label'].apply(lambda x: 1 if x=='recovery' else 0)

In [29]:
new_labids = annotations['id'].tolist()
new_id2label = annotations.set_index('id').to_dict()['bin_label']
labeled_ids = labeled_ids + new_labids
for n in new_labids:
    id2label[n] = new_id2label[n]