In [1]:
import gc
import csv
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from tqdm import tqdm

In [2]:
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()

In [3]:
nlp = spacy.load("en_core_web_lg", exclude=["textcat"])
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [4]:
df = pd.read_csv("data/vad.tsv", header=0, names=["term", "valence", "arousal", "dominance"], 
                 sep="\t", engine="c", low_memory=False)
cols = ["valence", "arousal", "dominance"]
df[cols] = df[cols].astype(np.float16)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19970 entries, 0 to 19969
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   term       19969 non-null  object 
 1   valence    19970 non-null  float16
 2   arousal    19970 non-null  float16
 3   dominance  19970 non-null  float16
dtypes: float16(3), object(1)
memory usage: 273.2+ KB


In [5]:
df.head()

Unnamed: 0,term,valence,arousal,dominance
0,aaaah,0.52002,0.63623,0.281982
1,aardvark,0.427002,0.48999,0.437012
2,aback,0.38501,0.406982,0.288086
3,abacus,0.509766,0.275879,0.485107
4,abalone,0.5,0.47998,0.412109


In [6]:
terms = []
for t in tqdm(df.itertuples()):
    v = getattr(t, "valence")
    a = getattr(t, "arousal")
    if v < 0.25 or a > 0.75:
        terms.append(getattr(t, "term"))
print(f"len(terms)={len(terms)}")

19970it [00:00, 798857.90it/s]

len(terms)=4000





In [7]:
df = pd.read_csv("input/hatevocabraw.tsv", header=0, names=["term"], engine="c", low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 658 entries, 0 to 657
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   term    658 non-null    object
dtypes: object(1)
memory usage: 5.3+ KB


In [8]:
df.head()

Unnamed: 0,term
0,african
1,african american
2,alabama hot pocket
3,alaskan pipeline
4,american


In [9]:
terms += list(df["term"])
terms = [str(t).strip() for t in terms]
tmp = []
for doc in nlp.pipe(terms):
    if len(doc) > 3:
        continue
    t = " ".join([token.lemma_ for token in doc])
    if len(t) < 3:
        continue
    t = t.lower()
    tmp.append(t)
terms = list(set(tmp))
terms.sort()
print(f"len(terms)={len(terms)}\n{terms}")

len(terms)=4098
['abandon', 'abandonment', 'abash', 'abduct', 'abduction', 'aberrant', 'aberration', 'abhor', 'abhorrence', 'abhorrent', 'ablaze', 'abnormal', 'abnormality', 'abominable', 'abominate', 'abomination', 'abort', 'abortion', 'abortive', 'abrupt', 'abruptly', 'abscess', 'absence', 'absent', 'absentee', 'absurd', 'absurdity', 'abuse', 'abusive', 'accelerant', 'accelerate', 'acceleration', 'accelerator', 'accident', 'accidental', 'acclaim', 'accurse', 'accusation', 'accusative', 'accuse', 'accuser', 'ache', 'achievement', 'achy', 'acidity', 'acne', 'acrobat', 'action', 'action figure', 'activation', 'addict', 'addicted', 'addiction', 'adrenalin', 'adrenaline', 'adulterate', 'adulterer', 'adultery', 'adventure', 'adventurer', 'adventurous', 'adversary', 'adversity', 'aerobic', 'afire', 'afraid', 'african', 'african american', 'age', 'aged', 'agglomeration', 'aggravate', 'aggravating', 'aggravation', 'aggresive', 'aggression', 'aggressive', 'aggressively', 'aggressiveness', 'agg

In [10]:
df = pd.read_parquet("input/pre_ruddit.parquet")
texts = list(df["text3"])
df = pd.read_parquet("input/pre_val.parquet")
texts += list(df["text3"])
del df
gc.collect()

767

In [11]:
%%time
vec = TfidfVectorizer(vocabulary=terms, ngram_range=(1, 3), analyzer="word")
vec = vec.fit(texts)
print(f"idf.shape={vec.idf_.shape}")
idf = vec.idf_.tolist()
with open("output/vocab.json", "w") as f:
    json.dump({
        "term": terms,
        "idf": idf,
    }, f)

idf.shape=(4098,)
Wall time: 1.74 s


In [12]:
%%time
with open('output/vocab.tsv', 'w', newline='') as f:
    w = csv.writer(f, delimiter='\t')
    w.writerow(["term", "idf"])
    for i in range(len(terms)):
        w.writerow([terms[i], idf[i]])

Wall time: 11 ms
