In [300]:
import pandas as pd
from utils import read_file, clean_twitter, stem
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import spacy
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

In [301]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

In [302]:
nlp = spacy.load('en_core_web_sm')
def get_nouns(sentence):
    doc = nlp(sentence)
    sentence = " ".join(
        [token.lemma_ if token.pos_ in ['NOUN', 'PROPN'] else ''
         for token in doc])
    return sentence

In [303]:
with open('stopwords.txt', 'r') as f:
    lines = f.readlines()
stopwords = [w.strip() for w in lines]

In [304]:
data, y = read_file('../hatespeech/', True)

In [305]:
df = pd.DataFrame(columns=['text', 'label'])
df['text'] = data
df['label'] = y

In [306]:
df['processed'] = df['text'].apply(lambda x: clean_twitter(x))

In [307]:
count_vectorizer = CountVectorizer(input='content',
                                   analyzer='word',
                                   strip_accents='ascii',
                                   ngram_range=(1,1),
                                   stop_words=stopwords)

In [308]:
count = count_vectorizer.fit_transform(df['processed'])
features = np.array(count_vectorizer.get_feature_names())
freq = count.copy()
count[count > 0] = 1

In [342]:
label=3
class_docs = count[df[df.label == label].index.to_numpy()]
rel_doc_freq = np.array(class_docs.sum(axis=0)/class_docs.shape[0])[0]
avg_freq = np.array(freq[df[df.label == label].index.to_numpy()].sum(axis=0)/class_docs.shape[0])[0]

In [343]:
rankingdf = pd.DataFrame(columns=['word', 'rel_doc_freq'])
rankingdf['word'] = features
rankingdf['rel_doc_freq'] = rel_doc_freq
rankingdf['avg_freq'] = avg_freq
rankingdf['idf'] = np.log(np.array(count.shape[0]/count.sum(axis=0))[0])

In [344]:
scaler = MinMaxScaler()
scaler.fit(rankingdf[['rel_doc_freq', 'idf', 'avg_freq']])
rankingdf[['rel_doc_freq','idf', 'avg_freq']] = scaler.transform(rankingdf[['rel_doc_freq', 'idf', 'avg_freq']])
rankingdf['comb'] = np.cbrt(rankingdf['rel_doc_freq'] * rankingdf['idf'] * rankingdf['avg_freq'])

In [345]:
rankingdf.sort_values(by=['comb'], ascending=False).head(50)

Unnamed: 0,word,rel_doc_freq,avg_freq,idf,comb
28326,hate,1.0,1.0,0.210953,0.59529
42104,nigga,0.674044,0.709981,0.324828,0.537687
42109,niggas,0.651911,0.644068,0.338768,0.522005
30363,idiot,0.338028,0.340866,0.276267,0.316924
7334,ass,0.315895,0.306968,0.196449,0.267072
61129,trump,0.287726,0.291902,0.215917,0.262725
9953,bitch,0.261569,0.263653,0.223946,0.249032
57347,stupid,0.22334,0.224105,0.282979,0.241949
45369,people,0.325956,0.323917,0.129783,0.239297
30366,idiots,0.201207,0.193974,0.344289,0.237741


In [337]:
0:love,people,time,day,life,thanks,happy,please,person,help,world,hope,news,game,night
1:free,video,join,check,win,click,live,available,download,enter,visit,online,fucked,chance,follow
2:fucked,ass,bitch,bad,shit,hate,stupid,idiot,ugly,bitches,pussy,dick,nasty,annoying,sex,fuckin
3:hate,nigga,idiot,ass,trump,syria,crazy,racist,disgusting,kill,muslims,evil,islam,isis,white

SyntaxError: invalid syntax (<ipython-input-337-29b3745188a7>, line 1)

In [355]:
print(np.random.choice(np.where(y == 3)[0], 500).tolist())

[23420, 31397, 26065, 66204, 11960, 22420, 62207, 36851, 11517, 61405, 20898, 23420, 72592, 15995, 23791, 2889, 56201, 74150, 46883, 48466, 5668, 83411, 56181, 82055, 63879, 60391, 82452, 15756, 29628, 35583, 10624, 34582, 75741, 45996, 29862, 77215, 2969, 82178, 3398, 50155, 61800, 68003, 44276, 20217, 23521, 36298, 28290, 46845, 36735, 37836, 75740, 28138, 32137, 13025, 2850, 70801, 29591, 497, 13759, 20010, 61633, 3398, 25871, 62888, 16828, 5470, 8991, 71111, 25349, 54352, 47366, 9113, 31397, 51553, 9466, 60278, 1077, 12810, 38591, 23429, 17828, 40780, 45299, 59438, 32311, 39957, 33036, 33199, 57758, 14024, 66243, 55642, 3752, 77170, 67442, 25189, 56177, 16882, 55041, 9593, 8208, 41255, 30144, 37469, 49736, 2711, 16595, 43701, 52547, 44704, 63496, 50657, 66097, 15479, 61482, 76153, 65681, 2804, 8208, 8751, 74019, 36068, 59372, 31687, 79322, 72592, 73566, 21785, 57903, 68591, 19102, 32580, 9248, 17848, 53338, 50004, 73997, 62477, 55999, 3937, 32535, 68798, 1053, 66870, 33618, 61239, 