In [None]:
# Pin the library versions this notebook was written against. The code below
# relies on the gensim 3.x API (`size=` in Word2Vec, `model.wv.vocab`).
!pip install nltk==3.5
!pip install pandas==0.25.0
!pip install scipy==1.4.1
!pip install gensim==3.7.3
!pip install numpy==1.16.2
!pip install scikit_learn==0.23.2


In [None]:
import pandas as pd
import gensim 
from gensim.models import Word2Vec
import numpy as np
import nltk
import time
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from scipy import spatial
from sklearn.cluster import KMeans
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

In [None]:

# Notebook-level VADER analyzer; Topbiaswords reads this global to attach a
# 'compound' sentiment score to every word it reports.
sid = SentimentIntensityAnalyzer()
def TrainModel(csv_document, csv_comment_column, outputname='outputModel', window = 4, minf=10, epochs=100, ndim=200, lemmatiseFirst = False, verbose = True):
	"""Train a Word2Vec model on one text column of a CSV file and save it.

	Parameters
	----------
	csv_document : str
		Path of the CSV file to read.
	csv_comment_column : str
		Name of the column that holds the raw text.
	outputname : str
		Path the trained model is saved to.
	window : int
		Word2Vec context window size.
	minf : int
		Minimum word frequency (Word2Vec ``min_count``).
	epochs : int
		Number of training passes over the corpus.
	ndim : int
		Dimensionality of the word vectors.
	lemmatiseFirst : bool
		If True, every token is lemmatised as a noun with WordNet before training.
	verbose : bool
		If True, print progress information and the rows that failed to parse.

	Returns
	-------
	None. The model is written to ``outputname``.
	"""

	def data_processing(path, column = csv_comment_column, nrowss=None, verbose = True):
		"""Read `column` of the CSV at `path` and return one token list per row.

		`nrowss` limits the number of CSV rows read (None reads everything).
		Rows that cannot be tokenised (e.g. missing values) are skipped.
		"""
		trpCom = pd.read_csv(path, engine='python', nrows=nrowss)
		if verbose:
			print('\tcolumns found: {}'.format(list(trpCom.columns)))

		# Build the lemmatiser once, up front, so that a missing dependency fails
		# loudly here instead of being swallowed by the per-row error handling.
		lemmatizer = None
		if lemmatiseFirst:
			from nltk.stem import WordNetLemmatizer
			nltk.download('wordnet')
			lemmatizer = WordNetLemmatizer()

		documents = []
		skipped = 0
		for i, row in enumerate(trpCom[column]):
			if i%500000 == 0 and verbose:
				print('\t...processing line {}'.format(i))
			try:
				pp = gensim.utils.simple_preprocess(row)
				if lemmatizer is not None:
					pp = [lemmatizer.lemmatize(w, pos="n") for w in pp]
			except Exception as err:
				# Best effort: a malformed row must not abort the whole run.
				skipped += 1
				if verbose:
					print('\terror with row {}'.format(row))
					print('\t\t{}: {}'.format(type(err).__name__, err))
				continue
			documents.append(pp)
		if skipped:
			print('\t{} rows could not be processed and were skipped'.format(skipped))
		print('Done reading all documents')
		return documents

	def training(documents, outputfile, ndim, window, minfreq, epochss):
		"""Fit Word2Vec on `documents` for exactly `epochss` passes and save it."""
		print('->->Starting training model {} with dimensions:{}, minf:{}, epochs:{}'.format(outputfile,ndim, minfreq, epochss))
		# Passing the corpus to the constructor already trains the model once with
		# gensim's default number of passes; build the vocabulary explicitly so the
		# single train() call below is the only training and honours `epochss`.
		model = gensim.models.Word2Vec(size=ndim, window=window, min_count=minfreq, workers=5)
		model.build_vocab(documents)
		model.train(documents, total_examples=model.corpus_count, epochs=epochss)
		model.save(outputfile)
		print('->-> Model saved in {}'.format(outputfile))

	print('->Starting with {} [{}], output {}, window {}, minf {}, epochs {}, ndim {}'.format(csv_document,csv_comment_column,outputname, window, minf, epochs, ndim))
	docs = data_processing(csv_document, csv_comment_column, nrowss=None, verbose=verbose)
	# The timer starts after the CSV has been read: it measures training only.
	starttime = time.time()
	print('-> Output will be saved in {}'.format(outputname))
	training(docs, outputname, ndim, window, minf, epochs)
	print('-> Model creation ended in {} seconds'.format(time.time()-starttime))




In [None]:
def getCosineDistance(embedding1, embedding2):
	"""Return the cosine distance between two vectors, as computed by SciPy.

	The value is 0 for vectors pointing the same way, 1 for orthogonal
	vectors and 2 for opposite vectors.
	"""
	distance = spatial.distance.cosine(embedding1, embedding2)
	return distance

In [None]:
import matplotlib.pyplot as plt
def Topbiaswords(modelpath, topk, c1, c2, pos = ['JJ','JJR','JJS'], verbose = True):
	"""Find the words of a saved Word2Vec model most biased towards two target sets.

	For every vocabulary word whose POS tag is in `pos`, the bias is
	``cosine_distance(word, centroid(c2)) - cosine_distance(word, centroid(c1))``,
	so a positive value means the word lies closer to `c1` than to `c2`.

	Parameters
	----------
	modelpath : str
		Path of a model saved with ``Word2Vec.save``.
	topk : int
		Maximum number of words returned per target set.
	c1, c2 : list of str
		Words defining the two target sets; words missing from the model are ignored.
	pos : list of str
		POS tags (as returned by ``nltk.pos_tag``) of the words to consider.
	verbose : bool
		If True, print a progress marker every 100 words.

	Returns
	-------
	[biasc1, biasc2], or None when there is not enough data to run the experiment.
	Each list holds dicts with keys 'word', 'bias', 'freq', 'pos', 'wv', 'rank'
	and 'sent', ordered from most to least biased. In `biasc2` the bias is negated,
	so larger values always mean "more biased towards that list's target set".
	"""

	def calculateCentroid(vectors, words):
		"""Mean embedding of the `words` present in `vectors`; None if there are none."""
		embeddings = [np.array(vectors[w]) for w in words if w in vectors]
		if not embeddings:
			return None
		return np.mean(np.asarray(embeddings, dtype=np.float64), axis=0)

	model = Word2Vec.load(modelpath)
	# Go through model.wv: indexing the model object itself is deprecated.
	vectors = model.wv
	words_sorted = sorted([(k, v.index, v.count) for (k, v) in vectors.vocab.items()], key=lambda x: x[1])

	# Select the interesting subset of words based on their POS tag. Each word is
	# tagged once and the tag is kept, since tagging is the expensive step.
	candidates = []
	for (word, rank, freq) in words_sorted:
		tag = nltk.pos_tag([word])[0][1]
		if tag in pos:
			candidates.append((word, rank, freq, tag))

	if len(c1) < 1 or len(c2) < 1 or len(candidates) < 1:
		print('[!] Not enough word concepts to perform the experiment')
		return None

	centroid1, centroid2 = calculateCentroid(vectors, c1), calculateCentroid(vectors, c2)
	if centroid1 is None or centroid2 is None:
		print('[!] None of the words of a target set is in the model vocabulary')
		return None

	winfo = []
	for i, (word, rank, freq, tag) in enumerate(candidates):
		wv = vectors[word]
		sent = sid.polarity_scores(word)['compound']
		# Estimate the cosine-distance difference between the two centroids.
		d1 = getCosineDistance(centroid1, wv)
		d2 = getCosineDistance(centroid2, wv)
		bias = d2-d1

		winfo.append({'word':word, 'bias':bias, 'freq':freq, 'pos':tag, 'wv':wv, 'rank':rank, 'sent':sent} )

		if(i%100 == 0 and verbose == True):
			print('...'+str(i), end="")

	# Get the topk words at each end of the bias ranking.
	limit = min(len(winfo), topk)
	biasc1 = sorted(winfo, key=lambda x: x['bias'], reverse=True)[:limit]
	# Move the c2 bias to the positive space. The entries are copied first: when
	# topk exceeds half the candidates a word appears in both lists, and negating
	# it in place would also flip its sign inside biasc1.
	biasc2 = [dict(w, bias=w['bias']*-1) for w in sorted(winfo, key=lambda x: x['bias'])[:limit]]

	return [biasc1, biasc2]


def Cluster(biasc1, biasc2, r, repeatk, verbose = True):
	"""Group two lists of biased words into clusters with k-means.

	Parameters
	----------
	biasc1, biasc2 : list of dict
		Word records holding at least a 'wv' embedding (as built by Topbiaswords).
	r : float
		Ratio used to derive the number of clusters:
		``k = int(r * (len(biasc1) + len(biasc2)) / 2)``, clamped for each list
		to the range [1, number of words in that list].
	repeatk : int
		Number of k-means runs; the partition with the highest average
		intra-cluster cosine similarity is kept for each list.
	verbose : bool
		If True, print the score of every partition tried.

	Returns
	-------
	[clusters1, clusters2], each a list of clusters, each cluster a list of the
	word records given as input. A list is empty when its input is empty or
	when `repeatk` is 0.
	"""

	def getIntraSimCluster(cluster):
		"""Mean pairwise cosine similarity inside one cluster (0 for fewer than 2 words)."""
		if(len(cluster) < 2):
			return 0
		sim = 0; c = 0
		for i in range(len(cluster)):
			w1 = cluster[i]['wv']
			for j in range(i+1, len(cluster)):
				w2 = cluster[j]['wv']
				sim+= 1-getCosineDistance(w1,w2)
				c+=1
		return sim/c

	def getIntraSim(partition):
		"""Average of the per-cluster intra-similarities of a partition."""
		return sum(getIntraSimCluster(cluster) for cluster in partition)/len(partition)

	def createPartition(embeddings, biasw, k, seed):
		"""Run k-means once with the given seed; return [score, clusters]."""
		preds = KMeans(n_clusters=k, random_state=seed).fit_predict(embeddings)
		all_clusters = [[biasw[idx] for idx in np.where(preds == i)[0]] for i in range(0, k)]
		return [getIntraSim(all_clusters), all_clusters]

	def clampK(k, nwords):
		"""KMeans needs 1 <= n_clusters <= number of samples."""
		return max(1, min(k, nwords))

	k = int(r * (len(biasc1)+len(biasc2))/2)
	k1, k2 = clampK(k, len(biasc1)), clampK(k, len(biasc2))
	emb1, emb2  = [w['wv'] for w in biasc1], [w['wv'] for w in biasc2]
	# None means "no partition seen yet"; a numeric starting score of 0 would
	# wrongly reject partitions whose intra-similarity is zero or negative.
	mis1, mis2 = None, None
	for run in range(repeatk):
		# A different seed per run, otherwise every run returns the same partition
		# and repeating is pointless. Run 0 still uses seed 0.
		p1 = createPartition(emb1, biasc1, k1, run) if biasc1 else [0, []]
		if(mis1 is None or p1[0] > mis1[0]):
			mis1 = p1
		p2 = createPartition(emb2, biasc2, k2, run) if biasc2 else [0, []]
		if(mis2 is None or p2[0] > mis2[0]):
			mis2 = p2
		if(verbose == True):
			print('New partition for ts1, intrasim: ', p1[0])
			print('New partition for ts2, intrasim: ', p2[0])

	# repeatk == 0: nothing was tried, report an empty partition with score 0.
	if mis1 is None:
		mis1 = [0, []]
	if mis2 is None:
		mis2 = [0, []]

	print('[*] Intrasim of best partition found for ts1, ', mis1[0])
	print('[*] Intrasim of best partition found for ts2, ', mis2[0])
	return [mis1[1], mis2[1]]
		



In [None]:

# Input CSV, output model path and the CSV column that holds the comment text.
# `outputpath` is reused further down to load the trained model.
csvpath = 'comments_small.csv'
outputpath = 'redpill_model'
column='body'
print('Training new model', csvpath)
# verbose = False silences the per-row progress and error messages.
TrainModel(csvpath, column, outputname = outputpath, window = 4, minf = 10, epochs = 5, ndim = 200, verbose = False)
print('Training finished, saved ', outputpath)



Training new model comments_small.csv
->Starting with comments_small.csv [body], output redpill_model, window 4, minf 10, epochs 5, ndim 200
is this line
here
<generator object TrainModel.<locals>.data_processing.<locals>.<genexpr> at 0x7f57d1cbb050>
Done reading all documents
-> Output will be saved in redpill_model
->->Starting training model redpill_model with dimensions:200, minf:10, epochs:5
->-> Model saved in redpill_model
-> Model creation ended in 723.3287553787231 seconds
Training finished, saved  redpill_model


In [None]:
print('Finding biases...')
# The two target sets whose centroids the vocabulary words are compared against.
female=["sister" , "female" , "woman" , "girl" , "daughter" , "she" , "hers" , "her", "wife", "mother", "grandmother"]
male=["brother" , "male" , "man" , "boy" , "son" , "he" , "his" , "him", "husband", "father", "grandfather"]  
# Up to 300 words per target set, restricted to the adjective tags JJ/JJR/JJS.
# NOTE: Topbiaswords returns None when it cannot run, which makes this unpacking fail.
[b1,b2] = Topbiaswords(outputpath, 300,	female,	male,	['JJ','JJR','JJS'], verbose = False)		

print('Biased towards ', female)
print([b['word'] for b in b1])

print('Biased towards ', male)
print([b['word'] for b in b2])

print()
print('Clustering words into concepts...')
# 0.15 is the ratio `r` from which Cluster derives the number of clusters;
# 10 is the number of k-means runs (`repeatk`).
[cl1,cl2] = Cluster(b1, b2, 0.15, 10,	verbose = False)	

print('Resulting clusters')
print('Clusters biased towards ', female)
for cluster in cl1:
    print( [k['word'] for k in cluster] )

print('Clusters biased towards ', male)
for cluster in cl2:
    print( [k['word'] for k in cluster] )



Finding biases...


  """
  """


Biased towards  ['sister', 'female', 'woman', 'girl', 'daughter', 'she', 'hers', 'her', 'wife', 'mother', 'grandmother']
['okcupid', 'unicorn', 'casual', 'bangable', 'fuckable', 'superficial', 'lesbian', 'unreasonable', 'anal', 'hideous', 'solipsistic', 'flirtatious', 'naked', 'hypergamous', 'visual', 'overdrive', 'interracial', 'nonsensical', 'third', 'unplanned', 'plausible', 'undressed', 'hypocritical', 'unhealthy', 'colombian', 'clitoral', 'plausable', 'broad', 'negotiable', 'sugary', 'exclusive', 'nasty', 'soluble', 'oral', 'arbitrary', 'narcissitic', 'compatible', 'unfuckable', 'taiwanese', 'virginal', 'vaginal', 'unprocessed', 'subconscious', 'outrageous', 'obese', 'sequential', 'pathological', 'radical', 'kcal', 'normal', 'voluptuous', 'polish', 'precious', 'trivial', 'sensical', 'babble', 'sloppy', 'second', 'deniable', 'gross', 'hot', 'subcutaneous', 'nonverbal', 'tolerable', 'cautious', 'unannounced', 'unsolicited', 'fewest', 'neurochemical', 'undivided', 'interested', 'stat

In [None]:
# Print the statistics of every word in b1, the list reported above as biased
# towards the `female` target set. The header line is repeated for each word.
for b in b1:
  print("bias towards women")
  # word, frequency of the word, bias measure, word sentiment
  print("word ",b['word']," freq ",b['freq']," bias ",b['bias'],"sentiment",b['sent'])

bias towards women
word  okcupid  freq  1087  bias  0.14090309689473668 sentiment 0.0
bias towards women
word  unicorn  freq  2863  bias  0.13279457382091087 sentiment 0.0
bias towards women
word  casual  freq  2639  bias  0.1321481470318121 sentiment 0.2023
bias towards women
word  bangable  freq  73  bias  0.12432564358265108 sentiment 0.0
bias towards women
word  fuckable  freq  307  bias  0.12249617770761856 sentiment 0.0
bias towards women
word  superficial  freq  760  bias  0.11986564872689831 sentiment 0.0
bias towards women
word  lesbian  freq  1477  bias  0.11532931746786146 sentiment 0.0
bias towards women
word  unreasonable  freq  711  bias  0.11308478013599832 sentiment 0.0
bias towards women
word  anal  freq  1004  bias  0.11280363329318355 sentiment 0.0
bias towards women
word  hideous  freq  292  bias  0.111121308500436 sentiment 0.0
bias towards women
word  solipsistic  freq  595  bias  0.11056562002987891 sentiment 0.0
bias towards women
word  flirtatious  freq  144  b

In [None]:
# Print the statistics of every word in b2, the list reported above as biased
# towards the `male` target set. The header line is repeated for each word.
for b in b2:
  print("bias towards men")
  # word, frequency of the word, bias measure, word sentiment
  print("word ",b['word']," freq ",b['freq']," bias ",b['bias'],"sentiment",b['sent'])

bias towards men
word  ian  freq  143  bias  0.17453040672141973 sentiment 0.0
bias towards men
word  quintessential  freq  73  bias  0.1713285236791673 sentiment 0.0
bias towards men
word  ultimate  freq  2001  bias  0.15713466220504635 sentiment 0.0
bias towards men
word  leary  freq  24  bias  0.15176854928040462 sentiment 0.0
bias towards men
word  duncan  freq  23  bias  0.14462930292111442 sentiment 0.0
bias towards men
word  prophetic  freq  22  bias  0.1445437716424205 sentiment 0.0
bias towards men
word  enigmatic  freq  16  bias  0.14316602387004151 sentiment 0.0
bias towards men
word  adrian  freq  60  bias  0.1386920025137952 sentiment 0.0
bias towards men
word  salacious  freq  10  bias  0.129340542847725 sentiment 0.0
bias towards men
word  courageous  freq  115  bias  0.1292963165464639 sentiment 0.5267
bias towards men
word  tyrese  freq  11  bias  0.12663344252716358 sentiment 0.0
bias towards men
word  visionary  freq  20  bias  0.12442429473240446 sentiment 0.5267
bi