In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import names
from tqdm import tqdm

In [2]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('names')
nltk.data.path

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


['C:\\Users\\USER/nltk_data',
 'S:\\dev\\seahrh\\nlp-examples\\env\\nltk_data',
 'S:\\dev\\seahrh\\nlp-examples\\env\\share\\nltk_data',
 'S:\\dev\\seahrh\\nlp-examples\\env\\lib\\nltk_data',
 'C:\\Users\\USER\\AppData\\Roaming\\nltk_data',
 'C:\\nltk_data',
 'D:\\nltk_data',
 'E:\\nltk_data']

In [3]:
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()

In [4]:
df = pd.read_csv("input/formal_en.tsv", sep="\t", header=0, engine="c", low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202023 entries, 0 to 202022
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Word    202021 non-null  object
 1   Count   202023 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.1+ MB


In [5]:
black = set(w.lower() for w in names.words())
for t in df.itertuples():
    w = getattr(t, "Word")
    c = getattr(t, "Count")
    if c >= 3:
        black.add(w)
print(f"len(black)={len(black):,}")

len(black)=49,988


In [6]:
df = pd.read_csv("input/urbandict-word-defs.csv", on_bad_lines="warn", engine="c", low_memory=False)
df.info()

b'Skipping line 7197: expected 6 fields, saw 7\nSkipping line 9757: expected 6 fields, saw 7\nSkipping line 13349: expected 6 fields, saw 7\nSkipping line 19999: expected 6 fields, saw 7\nSkipping line 20087: expected 6 fields, saw 7\nSkipping line 21775: expected 6 fields, saw 8\nSkipping line 23825: expected 6 fields, saw 8\nSkipping line 25254: expected 6 fields, saw 7\nSkipping line 25642: expected 6 fields, saw 7\nSkipping line 25776: expected 6 fields, saw 7\nSkipping line 30964: expected 6 fields, saw 7\nSkipping line 35484: expected 6 fields, saw 7\nSkipping line 36021: expected 6 fields, saw 8\nSkipping line 36071: expected 6 fields, saw 7\nSkipping line 40151: expected 6 fields, saw 7\nSkipping line 40694: expected 6 fields, saw 7\nSkipping line 41941: expected 6 fields, saw 7\nSkipping line 43659: expected 6 fields, saw 7\nSkipping line 46528: expected 6 fields, saw 7\nSkipping line 48481: expected 6 fields, saw 7\nSkipping line 49276: expected 6 fields, saw 7\nSkipping line

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2580587 entries, 0 to 2580586
Data columns (total 6 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   word_id     int64 
 1   word        object
 2   up_votes    int64 
 3   down_votes  int64 
 4   author      object
 5   definition  object
dtypes: int64(3), object(3)
memory usage: 118.1+ MB


In [7]:
cols = ["up_votes", "down_votes"]
df[cols].describe(percentiles=percentiles)

Unnamed: 0,up_votes,down_votes
count,2580587.0,2580587.0
mean,65.18388,42.10474
std,563.2683,281.7101
min,-42.0,-29.0
1%,0.0,0.0
5%,0.0,0.0
10%,1.0,0.0
20%,2.0,1.0
30%,4.0,2.0
40%,5.0,3.0


In [8]:
keep = {}
for t in tqdm(df.itertuples()):
    w = str(getattr(t, "word")).strip().lower()
    if len(w) > 6:
        continue
    words = w.split()
    if len(words) > 1:
        continue
    # remove extra whitespace in the middle
    w = " ".join(words)  
    up = getattr(t, "up_votes")
    if w in black or up < 565:
        continue
    if w in keep and up <= keep[w][0]:
        continue
    keep[w] = (up, getattr(t, "definition"))
rows = []
for k, v in keep.items():
    rows.append({"word": k, "up": v[0], "definition": v[1]})

2580587it [00:04, 551831.97it/s]


In [9]:
df = pd.DataFrame.from_records(rows)
df.sort_values("word", ascending=True, inplace=True, ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3655 entries, 0 to 3654
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   word        3655 non-null   object
 1   up          3655 non-null   int64 
 2   definition  3653 non-null   object
dtypes: int64(1), object(2)
memory usage: 85.8+ KB


In [10]:
df.sample(40)

Unnamed: 0,word,up,definition
2056,midget,3135,"A tiny sub-species of the human race. Mainly raised in midget mills, for the entertainment of normal people. Midgets in the wild are attracted to sugar packets, and if captured they serve many purposes, such as, midget butlers, wrestlers, and as collectables."
3351,ursh,704,"A word used to express frustration, disappointment, or anger."
999,fbgm,839,"Fuck Bitches, Get Money. ;; A way of life. ;; Another way of saying Bros Before Hoes. ;; The word money is simply a metaphor for wealth: Fuck Bitches, Get Metaphor."
3492,womyn,3933,"This is a term used by feminists who feel that having the word 'man' in the word 'woman' makes women a subset of men. So, to make themselves a non subset, they changed the letter 'e' to a 'y'."
3157,tbag,626,To dip a hairy scrotum sack into another persons mouth.
3565,yankin,669,"A term used to refer to a vagina with multiple STDs. Often with visible blisters and/or warts, accompanied usually by a terrible stench. ;; Alternatives: yankin', yanking."
2316,numpty,1971,"Scottish usage: ;; a) Someone who (sometimes unwittingly) by speech or action demonstrates a lack of knowledge or misconception of a particular subject or situation to the amusement of others. ;; b) A good humoured admonition, a term of endearment ;; c) A reckless, absent minded or unwise person"
518,buzzed,1186,"Means that you have begun the process of entering an alcoholic beverage into your system. You are now ready to begin step two: drowning yourself in booze. After step two is complete you may feel very happy, and it might be a little hard to walk, you have now completed the process and you are officially drunk. Congratulations."
1744,keila,594,"Keila comes from Hebrew origins, (but also English, depending on spelling differences, and is mostly known to mean, 'Citadel', as in a stronghold or fortress in a city, but has also been related to 'Like God', as it is the Latin feminine form of Michael. ;; -Keila is a Godly woman, who loves God with all her heart. She is the most exquisitely beautiful, breathtaking, OUT OF THIS WORLD, angelic, drop dead gorgeous girl you know. However, Keila's beauty is only a plus to the fun-loving, intelligent, caring, generous, delightful, and vivacious personality that makes her SO incredible, and makes YOU go crazy, go out of your mind, literally skip a heartbeat. ;; -Keila is wholly delightful to be with; every second is mesmerizing. Nothing else matters. Life is refreshing again. You could die right then, and be the happiest, most content man in the world."
850,dork,20459,"Someone who has odd interests, and is often silly at times. A dork is also someone who can be themselves and not care what anyone thinks."


In [11]:
ws = {"brb", "afk", "lmfao", "wtf", "zomg", "snafu", "fubar", "aloof"}
df.loc[df["word"].isin(ws)]

Unnamed: 0,word,up,definition
122,afk,3819,Away From Keyboard.
462,brb,3476,Acronym for 'be right back'
1119,fubar,5141,F.U.B.A.R. is an acronym for: ;; 1. F*cked up beyond all reason. ;; 2. F*cked up beyond all recognition. ;; 3. F*cked up beyond all repair. ;; See [fuck] ;; All usage derived from the military.
1909,lmfao,6021,Laughing my fucking ass off
2967,snafu,3847,"One of a progression of military situational indicators: ;; 1. SNAFU - Situation Normal, All Fucked Up - Thing are running normally. ;; 2. TARFUN - Things Are Really Fucked Up Now - Houston, we have a problem. ;; 3. FUBAR - Fucked Up Beyond All Recognition - Burn it to the ground and start over from scratch; it's totally destroyed."
3509,wtf,9777,"The World Taekwondo Federation. ;; WTF, holds many competitions throughout the world."
3646,zomg,8691,"zOMG is a varient of the all-too-popular acronym 'OMG', meaning 'Oh My God'. ;; The 'z' was originally a mistake while attempting to hit the shift key with the left hand, and type 'OMG' ;; Also used in all-caps, 'ZOMG' is generally used in a sarcastic manner, more often than not a humiliating fasion. It is also used as a device for stating the obvious."


In [12]:
%%time
df.to_csv("output/urbandict.tsv", sep="\t", index=False)

Wall time: 22 ms
