In [1]:
import gc
import csv
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Percentile cut points from 1% to 99%; presumably meant for
# DataFrame.describe(percentiles=...) -- not used in the cells shown here.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Options are addressed by their fully qualified names: abbreviated keys are
# resolved by pattern matching and can become ambiguous when pandas adds new
# options with similar names.
# NOTE(review): "mode.use_inf_as_na" is deprecated in newer pandas releases;
# confirm it is still needed before upgrading.
pd.set_option("mode.use_inf_as_na", True)
# Effectively lift the display limits so wide/long frames are shown in full.
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

In [3]:
# Load the raw vocabulary: a single-column, tab-separated file.
# header=0 together with names=[...] discards the file's own header line and
# labels the column "term".
df = pd.read_csv(
    "input/hatevocabraw.tsv",
    sep="\t",  # the file is a TSV; the default "," would split terms containing commas
    header=0,
    names=["term"],
    dtype=str,  # every entry is a vocabulary string, never a number
    keep_default_na=False,  # keep literal terms such as "null" or "n/a" instead of turning them into NaN
    engine="c",
    low_memory=False,
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 658 entries, 0 to 657
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   term    658 non-null    object
dtypes: object(1)
memory usage: 5.3+ KB


In [4]:
# Preview the first rows of the vocabulary frame to sanity-check the parse.
df.head()

Unnamed: 0,term
0,african
1,african american
2,alabama hot pocket
3,alaskan pipeline
4,american


In [5]:
# Normalise the vocabulary: trim whitespace, lower-case, de-duplicate, sort.
# Missing values are dropped first (str(NaN) would otherwise add the bogus
# term "nan"), and entries that are empty after stripping are discarded.
terms = sorted(
    {str(raw_term).strip().lower() for raw_term in df["term"].dropna()} - {""}
)
print(f"len(terms)={len(terms)}\n{terms}")

len(terms)=576
['african', 'african american', 'alabama hot pocket', 'alaskan pipeline', 'american', 'anal', 'analplug', 'analsex', 'anilingus', 'anus', 'apeshit', 'arse', 'arsehole', 'asian', 'ass', 'assassin', 'asshole', 'assmunch', 'atheist', 'auto erotic', 'autoerotic', 'babeland', 'baby batter', 'baby juice', 'ball gag', 'ball gravy', 'ball kicking', 'ball licking', 'ball sack', 'ball sucking', 'balls', 'bangbros', 'bangbus', 'bareback', 'barely legal', 'barenaked', 'bastard', 'bastardo', 'bastinado', 'bbw', 'bdsm', 'beaner', 'beaners', 'beastiality', 'beaver cleaver', 'beaver lips', 'bestiality', 'bewb', 'big black', 'big breasts', 'big knockers', 'big tits', 'bimbo', 'bimbos', 'birdlock', 'bisexual', 'bitch', 'bitches', 'black', 'black cock', 'blind', 'blonde action', 'blonde on blonde action', 'bloody', 'bloodyhell', 'blow', 'blow job', 'blow your load', 'blowjob', 'blue waffle', 'blumpkin', 'bollocks', 'bondage', 'boner', 'boob', 'boobies', 'boobs', 'booty call', 'boy', 'brown

In [6]:
# Build the fitting corpus by concatenating the "text3" column of both
# parquet inputs, in the order listed.
texts = []
for parquet_path in ("input/pre_ruddit.parquet", "input/pre_val.parquet"):
    df = pd.read_parquet(parquet_path)
    texts.extend(list(df["text3"]))
# Drop the last frame and reclaim memory; the collector's return value is the
# cell output.
del df
gc.collect()

0

In [7]:
%%time
vec = TfidfVectorizer(vocabulary=terms, ngram_range=(1, 3), analyzer="word")
vec = vec.fit(texts)
print(f"idf.shape={vec.idf_.shape}")
idf = vec.idf_.tolist()
with open("output/vocab.json", "w") as f:
    json.dump({
        "term": terms,
        "idf": idf,
    }, f)

idf.shape=(576,)
Wall time: 1.63 s


In [8]:
%%time
with open('output/vocab.tsv', 'w', newline='') as f:
    w = csv.writer(f, delimiter='\t')
    w.writerow(["term", "idf"])
    for i in range(len(terms)):
        w.writerow([terms[i], idf[i]])

Wall time: 2 ms
