In [27]:
import pandas as pd
import numpy as np
import gensim
import string
import os
import random
import plotly.express as px
from sklearn.manifold import TSNE

In [None]:
# Initially planned to use FastText pre-trained word embedding model, but did not in the end. 
# The below is the loading of the pre-trained model. 

import io

def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for i,line in enumerate(fin):
        if i > 10000:
            break
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))
    return data
    
vectors = load_vectors("./crawl-300d-2M-subword/crawl-300d-2M-subword.vec")

In [13]:
# Formatting the urban dictionary dataset, changing it from csv to a format that can be read by Gensim. 

urban_df = []
for root, dirs, files in os.walk(".", topdown=False):
    for f in files:
        if "urban" in f:
            df = pd.read_csv(f"{root}/{f}",header=None)
            urban_df.append(df)
urban_df = pd.concat(urban_df)

In [14]:
urban_df = urban_df.sort_values(0)
urban_df = urban_df.reset_index(drop=True)
urban_df

Unnamed: 0,0,1,2,3,4,5
0,A,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,allerd,Allerd is a man who is sexy with a hot bro...,Allerd is an amazing human being .
1,A,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,Anti Fre,Archaic British-English english slang (Circa ...,Howard: I've just inserted a squirrel up my a...
2,A,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,Anti Friend,When one of your friend is being a jerk and ...,""" Little Jimmy is always an Anti Friend.""\r W..."
3,A,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,Anti Fur-Furry Peace Agreement Day,"On December 21st , participants of Anti-Fur-...",Rainb0wF0x: Its Anti Fur-Furry Peace Agreement...
4,A,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,Anti furry day,Anti Furry day is on 2/ 21 . You are allowed ...,The furry was mad because it was anti fur...
...,...,...,...,...,...,...
2281492,Z,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,Zin zin,Something you say whenever you feel like re...,"Joe offers Rob a line, Rob replys "" zin zin """
2281493,Z,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,Zin Xhao,"La persona mas facherita en el universo , su ...",Tipo 1: Te pajeaste pensando en Zin Xhao ?\r ...
2281494,Z,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,zin chick,a woman who hangs out in a bar drinking nothin...,...see that pink stuff in her glass ? She...
2281495,Z,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,zinadeen,"zinadeen is a little fellow, many say he often...","‚Äú man went out with zinadeen last night, bare..."


In [17]:
# Cleaning up the data: removing terms without example sentences, remove all punctuation in example sentences. 

word_df = urban_df[~urban_df[3].isna()]
word_df = word_df[word_df[3].apply(lambda x: True if len(x.split())==1 else False)]
word_df = word_df[~word_df[5].isna()]
word_df = word_df[word_df[3].apply(has_punc)]
# word_df.to_csv("filter.csv",index=False)
urban_words = word_df[3].tolist()
word_df[5] = word_df[5].apply(lambda x: x.replace("\n"," "))
word_df[5] = word_df[5].apply(lambda x:x.translate(str.maketrans('', '', string.punctuation)))
word_df.sample(20)

Unnamed: 0,0,1,2,3,4,5
1516542,P,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,Pregive,To give someone something that you were sup...,Im going to pregive him an xbox for his birt...
1525204,P,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,prostage,A prostitute who is your hostage .,This boot would be big enough to fit a p...
2072219,T,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,Tshitzami,(Shitz'on'me)\r Massive Shit wave sometimes...,I got hit by a Tshitzami today at work \r Eq...
1292155,M,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,masil,mas¬∑il (mey-zuhl)\r Originating in Prince E...,Mike Hey Dave want to swing by the restau...
1546761,P,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,Pattybear,A cute guy that looks gay but isn't. He resem...,Im not sure if thats a Pattybear or a gay ...
1315716,M,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,MPIB,My piano is brown .,Man My Piano is brown\r Woman moans \r Man ...
1361949,M,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,Moktadir,A person who is very attractive a person who c...,You are as hot as Moktadir
1213643,L,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,Lathiefa,A stupid girl named Lathiefa,You know Lathiefa I dont like her
730287,F,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,fauxhip,The act of rocking out to trendy music while ...,Kyle was fauxhipping to the Yeah Yeah Yeahs...
1864805,S,https://www.urbandictionary.com/browse.php?cha...,https://www.urbandictionary.com/define.php?ter...,Stripostitute,"Derived from the word Stripper and Prostitute,...",Wow did you see how that stripostitute work...


In [18]:
word_df.to_csv("filtered.csv",index=False)

In [22]:
# Importing Word2Vec algorithm from Gensim to train new model. 

from gensim.test.utils import common_texts
from gensim.models import Word2Vec

sentences = [i.split() for i in urban_df[~urban_df[5].isna()][5].tolist()]
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec2.model")

In [23]:
# Creating training set for model 

word_corpus = [j for i in sentences for j in i]
counter = {}
for w in word_corpus:
    counter[w.lower()] = counter.get(w.lower(),0) + 1
counter = pd.Series(counter).sort_values(ascending=False)

In [25]:
# most_common = []
# n = 0
# for i in counter.tolist():
#     if i in urban_words:
#         most_common.append(i)
#         n += 1
#     if n > 100:
#         break

In [None]:
model = Word2Vec.load("word2vec2.model")

In [28]:
# Take a sample of 1000 words from the dataset and plot in word embedding model using TSNE. 

sample_size = 1000
sample_words = random.sample(urban_words,sample_size)
found_words = [i for i in sample_words if i in model.wv.index_to_key]
arr = model.wv[found_words]
tsne = TSNE(2)
t_arr = tsne.fit_transform(arr)
plot_df = pd.DataFrame(t_arr)
plot_df["words"] = found_words
fig = px.scatter(plot_df,x=0, y=1,hover_data= ["words"])

fig.write_html("sample_2d.html")
fig.show()



In [30]:
# Take a sample of 100 words from the dataset, and for each word, list top 10 associations; visualise all these in scatterplot. 
# TSNE was used to reduce the dimensions of the word vectors for plotting. 

sample_size = 100
sample_words = random.sample(urban_words,sample_size)
found_words = [i for i in sample_words if i in model.wv.index_to_key]
others = []
for w in found_words:
    sim_words = model.wv.most_similar(w,topn=10)
    for sw in sim_words:
        others.append(sw[0])
all_words = others + found_words
sim_words_list = []
for w in all_words:
    sim_words = model.wv.most_similar(w,topn=10)
    sim_words_list.append([i[0] for i in sim_words])
arr = model.wv[all_words]

tsne = TSNE(2)
t_arr = tsne.fit_transform(arr)
plot_df = pd.DataFrame(t_arr)
plot_df["sim_words"] = sim_words_list
plot_df["words"] = all_words
fig = px.scatter(plot_df,x=0, y=1,hover_data= ["words","sim_words"])
fig.update_layout(
    autosize=False,
    width=1000,
    height=1000,
)
fig.write_html("sample_sim.html")
fig.show()



In [31]:
sample_size = 100
sample_words = random.sample(words,sample_size)
found_words = [i for i in sample_words if i in model.wv.index_to_key]
others = []
for w in found_words:
    sim_words = model.wv.most_similar(w,topn=10)
    for sw in sim_words:
        others.append(sw[0])
all_words = others + found_words
sim_words_list = []
for w in all_words:
    sim_words = model.wv.most_similar(w,topn=10)
    sim_words_list.append([i[0] for i in sim_words])
arr = model.wv[all_words]

tsne = TSNE(2)
t_arr = tsne.fit_transform(arr)
plot_df = pd.DataFrame(t_arr)
plot_df["sim_words"] = sim_words_list
plot_df["words"] = all_words
fig = px.scatter(plot_df,x=0, y=1,hover_data= ["words","sim_words"])
fig.update_layout(
    autosize=False,
    width=1000,
    height=1000,
)
fig.write_html("fixed-sim.html")
fig.show()



NameError: name 'words' is not defined

In [None]:
model.wv.most_similar('eboy', topn=10)


[('introvert', 0.9008392691612244),
 ('oxymoron', 0.8826045393943787),
 ('incacole', 0.8730303645133972),
 ('incel', 0.86729896068573),
 ('insult,', 0.864527702331543),
 ('outsider', 0.8616935610771179),
 ('enigma', 0.8607584834098816),
 ('STI', 0.8584597110748291),
 ('elephant.', 0.8579079508781433),
 ('inveterate', 0.8575772643089294)]

In [None]:
model.wv.most_similar('cancel', topn=10)

[('babysit', 0.7563366889953613),
 ('renew', 0.752668559551239),
 ('attend', 0.7374246716499329),
 ('convert', 0.7119854092597961),
 ('restart', 0.7095818519592285),
 ('permit', 0.7087141275405884),
 ('ruin', 0.705554187297821),
 ('arrange', 0.7024335861206055),
 ('reschedule', 0.6985873579978943),
 ('repay', 0.6929827928543091)]