In [4]:
import pandas as pd
from utils import read_file, clean_twitter, stem
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import spacy
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
import pickle

In [5]:
# Notebook display settings: never truncate cell text, show up to 100 rows.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 100

In [6]:
# Load the pickled (docs, labels, vocab) triple produced elsewhere.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('../hatespeech/pseudodocs.pkl', 'rb') as f:
    payload = pickle.load(f)
docs, labels, vocab = payload

In [12]:
# Turn every sequence of vocabulary ids in `docs` back into a space-joined string.
# An id of 0 ends the sequence (presumably the padding id — TODO confirm).
# Iterating with a bounded for-loop (instead of an unbounded index walk) means a
# row that contains no 0 at all is decoded in full rather than raising IndexError.
eng_docs = []
for word_indices in docs:
    text = []
    for word_index in word_indices:
        if word_index == 0:
            break
        text.append(vocab[word_index])
    eng_docs.append(" ".join(text))

In [16]:
# Reduce each row of `labels` to the index of its largest entry
# (presumably one-hot / per-class scores -> integer class id; TODO confirm).
# The dimensionality guard keeps the cell idempotent: re-running it after
# `labels` is already 1-D is a no-op instead of an axis error.
if np.ndim(labels) > 1:
    labels = labels.argmax(axis=1)

In [26]:
# One row per decoded document, paired with its class label.
gendf = pd.DataFrame({
    'text': eng_docs,
    'label': labels,
})

In [27]:
# Read the stop-word file, one entry per line, with surrounding whitespace removed.
with open('stopwords.txt', 'r') as f:
    lines = list(f)
stopwords = list(map(str.strip, lines))

In [28]:
# Word-level unigram counter over raw strings; accents are folded to ASCII and
# the custom stop-word list is excluded from the vocabulary.
count_vectorizer = CountVectorizer(
    input='content',
    strip_accents='ascii',
    stop_words=stopwords,
    analyzer='word',
    ngram_range=(1, 1),
)

In [29]:
# Fit the vocabulary on the documents and build the document-term matrix.
count = count_vectorizer.fit_transform(gendf['text'])

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out` when available and fall back on older releases.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    features = np.asarray(count_vectorizer.get_feature_names_out())
else:
    features = np.array(count_vectorizer.get_feature_names())

# `freq` keeps the raw term counts; `count` becomes a 0/1 presence matrix.
# Every stored entry of the vectorizer output is a positive count, so filling
# the stored data with 1 binarises it without a boolean-mask assignment on a
# sparse matrix.
freq = count.copy()
count.data.fill(1)

In [61]:
# Class whose characteristic words are being ranked.
label = 0

# Select rows by *position*: the sparse matrices are positionally aligned with
# `gendf`, so using DataFrame index labels would only be correct for a default
# RangeIndex. A boolean mask converted to positions is index-agnostic.
class_rows = np.flatnonzero(gendf['label'].to_numpy() == label)
if class_rows.size == 0:
    # Per-class averages are undefined without documents (would divide by zero).
    raise ValueError(f"no documents with label {label!r}")

class_docs = count[class_rows]
n_class_docs = class_docs.shape[0]

# Fraction of the class's documents that contain each word.
rel_doc_freq = np.asarray(class_docs.sum(axis=0)).ravel() / n_class_docs
# Mean number of occurrences of each word per document of the class.
avg_freq = np.asarray(freq[class_rows].sum(axis=0)).ravel() / n_class_docs

In [62]:
# Number of documents (over the whole corpus) containing each word.
doc_freq = np.array(count.sum(axis=0))[0]

# One row per vocabulary word with its class-level statistics and corpus-level
# inverse document frequency, log(N / document frequency).
rankingdf = pd.DataFrame({
    'word': features,
    'rel_doc_freq': rel_doc_freq,
    'avg_freq': avg_freq,
    'idf': np.log(count.shape[0] / doc_freq),
})

In [63]:
# Rescale the three statistics to [0, 1] so they are comparable.
score_columns = ['rel_doc_freq', 'idf', 'avg_freq']
scaler = MinMaxScaler()
rankingdf[score_columns] = scaler.fit_transform(rankingdf[score_columns])

# Combined score: cube root of the product of the three scaled statistics.
score_product = rankingdf['rel_doc_freq'] * rankingdf['idf'] * rankingdf['avg_freq']
rankingdf['comb'] = np.cbrt(score_product)

In [64]:
# Display the 50 words with the highest combined score.
rankingdf.sort_values(by='comb', ascending=False).iloc[:50]

Unnamed: 0,word,rel_doc_freq,avg_freq,idf,comb
2792,motivation,1.0,1.0,0.188803,0.57368
2486,life,0.942529,0.940594,0.180638,0.543044
1039,counts,0.735632,0.732673,0.241756,0.50697
3658,sadness,0.666667,0.623762,0.262453,0.477888
3249,positivity,0.643678,0.613861,0.268827,0.473591
1518,excitement,0.643678,0.613861,0.268827,0.473591
4468,uni,0.597701,0.60396,0.282288,0.467084
2692,memories,0.632184,0.574257,0.2721,0.462267
4302,time,0.632184,0.653465,0.236251,0.460412
1150,day,0.586207,0.564356,0.268827,0.446367
