In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import names
from tqdm import tqdm

In [2]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('names')
nltk.data.path

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


['C:\\Users\\USER/nltk_data',
 'S:\\dev\\seahrh\\nlp-examples\\env\\nltk_data',
 'S:\\dev\\seahrh\\nlp-examples\\env\\share\\nltk_data',
 'S:\\dev\\seahrh\\nlp-examples\\env\\lib\\nltk_data',
 'C:\\Users\\USER\\AppData\\Roaming\\nltk_data',
 'C:\\nltk_data',
 'D:\\nltk_data',
 'E:\\nltk_data']

In [3]:
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()

In [4]:
df = pd.read_csv("input/formal_en.tsv", sep="\t", header=0, engine="c", low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186741 entries, 0 to 186740
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Word    186739 non-null  object
 1   Count   186741 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.8+ MB


In [5]:
_names = set(w.lower() for w in names.words())
formal = set()
for t in df.itertuples():
    w = getattr(t, "Word")
    c = getattr(t, "Count")
    if c >= 10:
        formal.add(w)
print(f"len(formal)={len(formal):,}\nlen(names)={len(_names)}")

len(formal)=14,090
len(names)=7576


In [6]:
df = pd.read_csv("input/urbandict-word-defs.csv", on_bad_lines="warn", engine="c", low_memory=False)
df.info()

b'Skipping line 7197: expected 6 fields, saw 7\nSkipping line 9757: expected 6 fields, saw 7\nSkipping line 13349: expected 6 fields, saw 7\nSkipping line 19999: expected 6 fields, saw 7\nSkipping line 20087: expected 6 fields, saw 7\nSkipping line 21775: expected 6 fields, saw 8\nSkipping line 23825: expected 6 fields, saw 8\nSkipping line 25254: expected 6 fields, saw 7\nSkipping line 25642: expected 6 fields, saw 7\nSkipping line 25776: expected 6 fields, saw 7\nSkipping line 30964: expected 6 fields, saw 7\nSkipping line 35484: expected 6 fields, saw 7\nSkipping line 36021: expected 6 fields, saw 8\nSkipping line 36071: expected 6 fields, saw 7\nSkipping line 40151: expected 6 fields, saw 7\nSkipping line 40694: expected 6 fields, saw 7\nSkipping line 41941: expected 6 fields, saw 7\nSkipping line 43659: expected 6 fields, saw 7\nSkipping line 46528: expected 6 fields, saw 7\nSkipping line 48481: expected 6 fields, saw 7\nSkipping line 49276: expected 6 fields, saw 7\nSkipping line

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2580587 entries, 0 to 2580586
Data columns (total 6 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   word_id     int64 
 1   word        object
 2   up_votes    int64 
 3   down_votes  int64 
 4   author      object
 5   definition  object
dtypes: int64(3), object(3)
memory usage: 118.1+ MB


In [7]:
cols = ["up_votes", "down_votes"]
df[cols].describe(percentiles=percentiles)

Unnamed: 0,up_votes,down_votes
count,2580587.0,2580587.0
mean,65.18388,42.10474
std,563.2683,281.7101
min,-42.0,-29.0
1%,0.0,0.0
5%,0.0,0.0
10%,1.0,0.0
20%,2.0,1.0
30%,4.0,2.0
40%,5.0,3.0


In [8]:
keep = {}
for t in tqdm(df.itertuples()):
    w = str(getattr(t, "word")).strip().lower()
    if len(w) > 6:
        continue
    words = w.split()
    if len(words) > 1:
        continue
    # remove extra whitespace in the middle
    w = " ".join(words)  
    up = getattr(t, "up_votes")
    if w in formal or w in _names or up < 565:
        continue
    if w in keep and up <= keep[w][0]:
        continue
    keep[w] = (up, getattr(t, "definition"))
rows = []
for k, v in keep.items():
    rows.append({"word": k, "up": v[0], "definition": v[1]})

2580587it [00:04, 557902.32it/s]


In [9]:
df = pd.DataFrame.from_records(rows)
df.sort_values("up", ascending=False, inplace=True, ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4428 entries, 0 to 4427
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   word        4428 non-null   object
 1   up          4428 non-null   int64 
 2   definition  4426 non-null   object
dtypes: int64(1), object(2)
memory usage: 103.9+ KB


In [10]:
df.sample(40)

Unnamed: 0,word,up,definition
1583,banksy,1369,a sexual maneuver where you film your partner on CCTV while wearing a monkey mask and decrying the police state
3770,ncis,652,"Best T.V. show out there. Can be a little corny at times but it is not your usual dry, boring, no personality investigative show. I would advise everyone to invest some time in watching this show."
3080,hax0r,773,"An inferior breed of sub-humans that download exploitatcious programs for use in computer games to supplement their bad genetics and lack of skill. As a result of such actions, they gain self-esteem from their apparent success, even though they had nothing to do with it. ;; A person who steals megahertz."
1841,ydg,1194,"Updated Definition; ;; The origins of YDG have been discovered to be from Philip Manansala, guitarist of Of Mice & Men. ;; YDG means Yadig!?"
2437,telly,930,its the slang word for hotel
3181,rancid,750,"Wicked punk rock band with an amazing sound. Featuring Modern Punk Rock Legend Lars Frederikson. Authors of such songs as: Ruby Soho, Time Bomb."
1827,sweg,1200,"instead of using 'Swag', you use 'Sweg' to be more hipster/ indie."
3145,v-card,758,Term used to define one's virginity. All virgins have a 'V-Card' until they 'cash it in' for sex.
3596,haji,678,"Any iraqi or afghani person (really, anybody in the middle east) that are seen by US soldiers."
850,ganic,2379,Short for organic meaning better than fresh. Also means extremely cool or dope.


In [11]:
ws = {"lmfao", "wtf", "zomg", "snafu", "fubar"}
df.loc[df["word"].isin(ws)]

Unnamed: 0,word,up,definition
127,wtf,9777,"The World Taekwondo Federation. ;; WTF, holds many competitions throughout the world."
158,zomg,8691,"zOMG is a varient of the all-too-popular acronym 'OMG', meaning 'Oh My God'. ;; The 'z' was originally a mistake while attempting to hit the shift key with the left hand, and type 'OMG' ;; Also used in all-caps, 'ZOMG' is generally used in a sarcastic manner, more often than not a humiliating fasion. It is also used as a device for stating the obvious."
279,lmfao,6021,Laughing my fucking ass off
341,fubar,5141,F.U.B.A.R. is an acronym for: ;; 1. F*cked up beyond all reason. ;; 2. F*cked up beyond all recognition. ;; 3. F*cked up beyond all repair. ;; See [fuck] ;; All usage derived from the military.
482,snafu,3847,"One of a progression of military situational indicators: ;; 1. SNAFU - Situation Normal, All Fucked Up - Thing are running normally. ;; 2. TARFUN - Things Are Really Fucked Up Now - Houston, we have a problem. ;; 3. FUBAR - Fucked Up Beyond All Recognition - Burn it to the ground and start over from scratch; it's totally destroyed."


In [12]:
%%time
df.to_csv("output/urbandict.tsv", sep="\t", index=False)

Wall time: 26 ms
