In [233]:
import pandas as pd
from utils import read_file, clean_twitter, stem
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import spacy
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

In [234]:
# Notebook display settings: never truncate cell text, show up to 100 rows.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 100

In [235]:
# Small English spaCy pipeline, loaded once and shared by get_nouns().
nlp = spacy.load('en_core_web_sm')


def get_nouns(sentence):
    """Return the lemmas of the nouns and proper nouns in ``sentence``.

    The text is run through the module-level spaCy pipeline ``nlp``; only
    tokens whose coarse part-of-speech tag is ``NOUN`` or ``PROPN`` are kept.

    Args:
        sentence: Text to analyse.

    Returns:
        The kept lemmas joined by single spaces, in document order. An empty
        string is returned when no token qualifies.
    """
    noun_tags = ('NOUN', 'PROPN')
    # Filter instead of substituting '' for rejected tokens, so the result
    # carries no runs of padding spaces or leading/trailing blanks.
    return " ".join(
        token.lemma_ for token in nlp(sentence) if token.pos_ in noun_tags)

In [236]:
# Read the stop-word list: one entry per line of stopwords.txt, with
# surrounding whitespace (including the newline) stripped from each entry.
with open('stopwords.txt', 'r') as stopword_file:
    lines = list(stopword_file)
stopwords = list(map(str.strip, lines))

In [237]:
# Load the corpus through the project helper `read_file` (imported from utils).
# It returns two values: `data` is used below as the text column, `y` as the labels.
# NOTE(review): the meaning of the second positional argument (True) is not
# visible in this notebook — confirm against utils.read_file.
data, y = read_file('../hatespeech/', True)

In [238]:
# Two-column frame: the raw text and its class label, one row per document.
# The columns are attached one after the other (text first, then label).
df = pd.DataFrame(columns=['text', 'label']).assign(text=data, label=y)

In [239]:
# Apply the project's clean_twitter helper to each raw text; keep the result in its own column.
df['processed'] = df['text'].apply(clean_twitter)

In [240]:
# Word-level unigram counter over in-memory strings, configured with ASCII
# accent stripping and the stop-word list loaded from stopwords.txt.
vectorizer_options = dict(
    input='content',
    analyzer='word',
    strip_accents='ascii',
    ngram_range=(1, 1),
    stop_words=stopwords,
)
count_vectorizer = CountVectorizer(**vectorizer_options)

In [241]:
# freq: sparse document-term matrix of raw term counts (rows = documents,
# columns = vocabulary terms).
freq = count_vectorizer.fit_transform(df['processed'])

# Vocabulary terms in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and
# fall back to the old method only on versions that predate it.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    features = np.array(count_vectorizer.get_feature_names_out())
else:
    features = np.array(count_vectorizer.get_feature_names())

# count: binary presence matrix (1 where a term occurs in a document).
# Built as a new matrix rather than by in-place masked assignment, so `freq`
# is never aliased and re-running the cell always gives the same result.
count = (freq > 0).astype(freq.dtype)

In [270]:
# Class whose characteristic vocabulary is being ranked.
label = 2

# Positional row numbers of the documents in this class. A positional mask is
# used instead of df.index labels so the selection stays correct even if `df`
# does not carry a default 0..n-1 index.
class_rows = np.flatnonzero((df['label'] == label).to_numpy())
class_docs = count[class_rows]
n_class_docs = class_docs.shape[0]

# Share of the class's documents that contain each term.
rel_doc_freq = np.asarray(class_docs.sum(axis=0) / n_class_docs).ravel()
# Mean number of occurrences of each term per document of the class.
avg_freq = np.asarray(freq[class_rows].sum(axis=0) / n_class_docs).ravel()

In [271]:
# Per-term statistics table. `idf` is log(N / document frequency), where N is
# the number of rows of `count` and the document frequency is its column sum.
n_docs = count.shape[0]
doc_freq = np.array(count.sum(axis=0))[0]
rankingdf = pd.DataFrame({
    'word': features,
    'rel_doc_freq': rel_doc_freq,
    'avg_freq': avg_freq,
    'idf': np.log(n_docs / doc_freq),
})

In [272]:
# Min-max scale the three statistics, then combine them into one score:
# the cube root of their product (i.e. their geometric mean).
scaled_cols = ['rel_doc_freq', 'idf', 'avg_freq']
scaler = MinMaxScaler()
rankingdf[scaled_cols] = scaler.fit_transform(rankingdf[scaled_cols])
score_product = rankingdf['rel_doc_freq'] * rankingdf['idf'] * rankingdf['avg_freq']
rankingdf['comb'] = np.cbrt(score_product)

In [273]:
# Show the 50 terms with the highest combined score, best first.
ranked = rankingdf.sort_values(by='comb', ascending=False)
ranked.head(50)

Unnamed: 0,word,rel_doc_freq,avg_freq,idf,comb
25146,fucked,0.288966,0.282446,0.131345,0.220495
25138,fuck,0.140186,0.141056,0.211672,0.161158
7334,ass,0.143902,0.145966,0.196449,0.160395
9953,bitch,0.12288,0.125767,0.223946,0.151262
19736,don,0.121022,0.12454,0.112985,0.119417
53989,shit,0.080256,0.078897,0.262173,0.118406
8220,bad,0.080256,0.080013,0.211113,0.110675
28326,hate,0.077236,0.078005,0.210953,0.10832
57347,stupid,0.065273,0.065171,0.282979,0.106377
30363,idiot,0.063647,0.064167,0.276267,0.104105
