# Import Data

In [None]:
import json

# Read the cleaned Urban Dictionary dump. The encoding is given explicitly so the
# load does not depend on the platform's default locale encoding (JSON files are
# conventionally UTF-8, and this one may contain non-ASCII text such as emoji).
with open('urban_dict_data_cleaned_emo.json', 'r', encoding='utf-8') as file:
    urban_dict_data = json.load(file)

# Train FastText Model

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m71.7/73.4 kB[0m [31m188.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313500 sha256=1ac4eeeea9498

In [None]:
import re
import fasttext

In [None]:
def preprocess_data(urban_dict, lowercase_slang=False):
  """Flatten Urban Dictionary entries into space-separated training sentences.

  For every slang term, each entry under 'top_5_entries' yields two sentences:
  the term followed by the lower-cased word tokens of the entry's 'definition',
  and the term followed by the lower-cased word tokens of its 'example'.
  Prefixing the term puts it on the same line as the words that describe it.

  Args:
    urban_dict: Mapping of slang term -> info dict. Each info dict is expected
      to hold a 'top_5_entries' list of dicts with 'definition' and 'example'
      strings. A missing or None list is treated as empty, and a missing or
      None text as an empty string, so one incomplete record does not abort
      the whole run with KeyError/AttributeError.
    lowercase_slang: When True, the prefixed term is lower-cased like the rest
      of the sentence. Defaults to False (term kept verbatim); in that mode a
      mixed-case key such as 'PBF' is a different token from the 'pbf' that
      appears in the lower-cased text.

  Returns:
    A list of strings, two per entry, in dictionary/entry order. A text with
    no word tokens produces a sentence consisting of the term alone.
  """
  # Same tokenisation for both text fields; compiled once for the whole pass.
  word_pattern = re.compile(r'\b\w+\b')

  sentences = []
  for slang, info in urban_dict.items():
    head = slang.lower() if lowercase_slang else slang
    for entry in info.get('top_5_entries') or []:
      for field in ('definition', 'example'):
        text = entry.get(field) or ''
        tokens = word_pattern.findall(text.lower())
        sentences.append(' '.join([head] + tokens))
  return sentences

In [None]:
# Build the training corpus and print the first three sentences as a spot check.
sentences = preprocess_data(urban_dict_data)
print(sentences[:3])

['word a versatile declaration originating more or less in hip hop culture word has no single meaning but is used to convey a casual sense of affirmation acknowledgement agreement or to indicate that something has impressed you favorably its usage among young blacks has been parodied ad nauseam among clueless suburban whites', 'word come on man we re going to the store word', 'word i concur my fellow african american friends']


In [None]:
def create_input_file(sentences, output_path="ud_slang.txt"):
  """Write the sentences to a text file, one sentence per line.

  Args:
    sentences: Iterable of strings; each is written followed by a newline.
    output_path: Destination file. Defaults to "ud_slang.txt", the file that
      train_model reads. An existing file is overwritten.
  """
  # Explicit UTF-8 so non-ASCII text (accents, emoji) is written the same way on
  # every platform instead of depending on the locale's default encoding.
  with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(sentence + "\n" for sentence in sentences)

In [None]:
# Write the preprocessed sentences to ud_slang.txt, one per line, as training input.
create_input_file(sentences)

In [None]:
def train_model(input_path="ud_slang.txt", output_path="urban_slang_ft.bin", model_type='skipgram'):
  """Train an unsupervised fastText model on a text file and save it to disk.

  Args:
    input_path: Training text file; defaults to "ud_slang.txt", the file
      written by create_input_file.
    output_path: Where the trained model is saved; defaults to
      "urban_slang_ft.bin".
    model_type: Architecture name forwarded as the `model` argument of
      fasttext.train_unsupervised; defaults to 'skipgram'.

  Returns:
    None. The model is only persisted to `output_path`, so calling this as the
    last line of a cell displays nothing.
  """
  trained = fasttext.train_unsupervised(input_path, model=model_type)
  trained.save_model(output_path)

In [None]:
# Train the skip-gram model on ud_slang.txt and save it as urban_slang_ft.bin.
train_model()

In [None]:
def test_model():
  model = fasttext.load_model("urban_slang_ft.bin")

  print("Words similar to 'rizz':")
  print(model.get_nearest_neighbors("rizz"))

  print("\nWords similar to 'cap':")
  print(model.get_nearest_neighbors("cap"))

  print("\nWords similar to 'bet':")
  print(model.get_nearest_neighbors("bet"))

In [None]:
# Print the nearest neighbours of a few sample slang terms from the saved model.
test_model()

Words similar to 'rizz':
[(0.8374569416046143, 'rizzal'), (0.8199120163917542, 'Zizz'), (0.7882004976272583, 'brizz'), (0.7881596088409424, 'drizz'), (0.772638201713562, 'rizzy'), (0.7692455649375916, 'izz'), (0.7655311822891235, 'rizzo'), (0.755256175994873, 'Fizz'), (0.7501357793807983, 'mizz'), (0.7500044703483582, 'grizz')]

Words similar to 'cap':
[(0.7343732118606567, 'capy'), (0.7339431643486023, 'capn'), (0.7028482556343079, "cap'n"), (0.6489825248718262, 'hat'), (0.6382067203521729, 'capon'), (0.6374916434288025, 'hubcap'), (0.6173983216285706, 'caps'), (0.6069614887237549, 'caper'), (0.603495180606842, 'cappy2'), (0.5878653526306152, 'cappys')]

Words similar to 'bet':
[(0.6458205580711365, 'cheyenne'), (0.6376032829284668, 'lotto'), (0.6364998817443848, 'dude3'), (0.6257866621017456, 'betsie'), (0.6172948479652405, 'guess'), (0.6113016605377197, 'you'), (0.6075536012649536, 'just'), (0.6053643822669983, 'i'), (0.605131208896637, 'pbf'), (0.6045577526092529, 'PBF')]


In [None]:
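# Scratch note (dead code): no module-level `model` is defined in the cells shown,
# and the fastText call used in this notebook is model.get_nearest_neighbors(word).
#model.most_similar(slang)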

In [None]:
# google.colab is importable only inside the Colab runtime; guard the import so
# that running the notebook elsewhere ends cleanly instead of failing on this cell.
try:
  from google.colab import files
except ImportError:
  print("Not running in Colab: ud_slang.txt and urban_slang_ft.bin are in the working directory.")
else:
  # Send the training corpus and the trained model to the browser for download.
  files.download('ud_slang.txt')
  files.download('urban_slang_ft.bin')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>