In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313504 sha256=048f38f2e6c05092e90e68dd2592389169377f8c82c8883c617b191648cd7970
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

In [None]:
import fasttext
import numpy as np
from bokeh.io import output_notebook
from bokeh.layouts import column
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.plotting import figure, show
from sklearn.decomposition import PCA

In [None]:
def run_pca(slang, model_path="urban_slang_ft.bin", random_state=None):
  """Embed each word with a fastText model and project the vectors to 2-D.

  Args:
    slang: iterable of words to embed.
    model_path: path of the fastText binary model to load. Defaults to the
      file this notebook was written against.
    random_state: forwarded to `PCA`; pass an int to make the projection
      repeatable when scikit-learn picks a randomized solver.

  Returns:
    A `(pca_result, word_vectors, model)` tuple: the 2-column PCA projection,
    the full-dimensional embedding matrix (one row per word, in input order),
    and the loaded fastText model.

  Raises:
    ValueError: if fewer than two words are supplied (a 2-component PCA
      cannot be fitted on fewer samples).
  """
  words = list(slang)
  # Fail fast, before paying for the model load.
  if len(words) < 2:
    raise ValueError("run_pca needs at least two words to fit a 2-component PCA")

  model = fasttext.load_model(model_path)
  word_vectors = np.array([model.get_word_vector(word) for word in words])

  pca = PCA(n_components=2, random_state=random_state)
  pca_result = pca.fit_transform(word_vectors)

  return pca_result, word_vectors, model

In [None]:
def scatter_pca(pca, slang, n=None):
  """Render an interactive Bokeh scatter plot of 2-D PCA coordinates.

  Args:
    pca: 2-D array of projected points; column 0 is drawn on the x axis and
      column 1 on the y axis.
    slang: labels, one per row of `pca`, shown in the hover tooltip.
    n: when given, plot a random sample of `n` rows (drawn without
      replacement) instead of every row. Values larger than the number of
      rows are clamped so the whole data set is plotted.

  Returns:
    None. The figure is displayed inline in the notebook.
  """
  if n is not None:
    # Sampling without replacement cannot return more rows than exist.
    n = min(n, len(pca))
    indices = np.random.choice(len(pca), n, replace=False)
    pca = pca[indices]
    slang = np.array(slang)[indices]

  source = ColumnDataSource(data=dict(
    x=pca[:, 0],
    y=pca[:, 1],
    slang=slang
  ))

  # "hover" is deliberately left out of the tools string: naming it there
  # creates a default HoverTool, and the customised one added below would then
  # be a second inspector, so two tooltips would pop up for every point.
  p = figure(title="PCA of FastText Word Embeddings",
             x_axis_label='Principal Component 1',
             y_axis_label='Principal Component 2',
             tools="pan,box_zoom,reset")

  # Add scatter points
  p.scatter(x='x', y='y', source=source, size=10, alpha=0.6)

  # Single hover tool that shows the slang term under the cursor
  p.add_tools(
    HoverTool(
      tooltips=[("Slang", "@slang")]
    )
  )

  output_notebook()
  show(p)

In [None]:
import json

# JSON is UTF-8 by specification; state the encoding explicitly so the read
# does not depend on the platform's default locale.
with open('urban_dict_data_cleaned_emo.json', 'r', encoding='utf-8') as file:
    urban_dict_data = json.load(file)

# Iterating the loaded object yields its top-level keys (presumably the slang
# headwords — verify against the data file), in file order.
slang = list(urban_dict_data)

In [None]:
# Load the fastText model, embed every entry of `slang`, and project to 2-D.
# Note that `pca` holds the projected coordinates (not the PCA estimator);
# `word_embeddings` is the full-dimensional matrix, one row per entry.
pca, word_embeddings, model = run_pca(slang)

In [None]:
# Spot-check the model on one term. Most neighbours in the recorded output
# contain the character sequence "lit" (litre, litso, Blitz, ...), which
# suggests spelling overlap, rather than meaning, dominates these scores.
model.get_nearest_neighbors("lit")

[(0.7863661050796509, 'litre'),
 (0.7495236396789551, 'litso'),
 (0.7242756485939026, 'Blitz'),
 (0.7012696862220764, 'blit'),
 (0.6965886950492859, 'lito'),
 (0.694945752620697, 'Blitzed'),
 (0.686968982219696, 'blitzed'),
 (0.6517820954322815, 'shlitz'),
 (0.6493445634841919, 'spoofed'),
 (0.6453139781951904, 'lithe')]

In [None]:
import heapq
def nearest(slang, model, top_n=10):
  """Return the highest-scoring (word, neighbour, score) triples overall.

  Every word in `slang` is sent to `model.get_nearest_neighbors`, which is
  expected to return `(score, neighbour)` pairs. All candidates are ranked
  together by score.

  Args:
    slang: iterable of query words.
    model: object exposing `get_nearest_neighbors(word)`.
    top_n: number of triples to return (default 10, the value that used to
      be hard-coded).

  Returns:
    A list of at most `top_n` `(word, neighbour, score)` tuples, best first.
    Ties keep the order in which the candidates were produced.
  """
  candidates = (
    (word, neighbour, score)
    for word in slang
    for score, neighbour in model.get_nearest_neighbors(word)
  )
  # nlargest is documented as equivalent to sorted(..., reverse=True)[:n] but
  # only ever holds `top_n` candidates in memory instead of all of them.
  return heapq.nlargest(top_n, candidates, key=lambda triple: triple[2])

In [None]:
# Best-scoring (word, neighbour, score) triples across all of `slang`.
# This issues one nearest-neighbour query per entry, so it can take a while.
nearest(slang, model)

In [None]:
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
import numpy as np
from bokeh.io import export_png

In [None]:
# Plot every projected point; pass n=<int> to plot a random subsample instead.
scatter_pca(pca, slang)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Rank all distinct word pairs by cosine similarity and keep the best `top_n`.
def similarity(word_embeddings, slang, top_n=10):
  """Return the `top_n` most cosine-similar pairs of distinct words.

  Args:
    word_embeddings: embedding matrix, one row per entry of `slang`.
    slang: words labelling the rows of `word_embeddings`.
    top_n: how many pairs to keep.

  Returns:
    A list of `(word_1, word_2, score)` tuples sorted by descending score,
    with each score rounded to four decimal places.
  """
  # Full pairwise cosine-similarity matrix
  sim = cosine_similarity(word_embeddings)
  count = len(slang)

  # Strict upper triangle only: skips the diagonal and mirrored duplicates
  candidates = [
      (slang[a], slang[b], sim[a][b])
      for a in range(count)
      for b in range(a + 1, count)
  ]

  # Highest similarity first
  ranked = sorted(candidates, key=lambda pair: pair[2], reverse=True)

  return [(first, second, round(value, 4)) for first, second, value in ranked[:top_n]]

In [None]:
import torch
import heapq

In [None]:
def calculate_similarity(word_embeddings, slang, top_k=10, batch_size=1000):
    """Return the `top_k` most cosine-similar pairs of distinct words.

    The similarity matrix is computed in row blocks of `batch_size` so that
    the full n x n matrix never has to be held in memory at once.

    Args:
        word_embeddings: array-like with one embedding row per entry of `slang`.
        slang: sequence of words labelling the rows of `word_embeddings`.
        top_k: number of pairs to return.
        batch_size: number of rows processed per block.

    Returns:
        A list of at most `top_k` `(word_1, word_2, score)` tuples sorted by
        descending cosine similarity. Each unordered pair appears once, with
        `word_1` preceding `word_2` in `slang`; a word is never paired with
        itself.

    Raises:
        ValueError: if `slang` and `word_embeddings` have different lengths.
    """
    n = len(slang)
    if top_k <= 0 or n < 2:
        return []

    # Use the GPU when there is one, but keep working on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embeddings = torch.as_tensor(word_embeddings, dtype=torch.float32).to(device)
    if embeddings.shape[0] != n:
        raise ValueError(
            f"got {embeddings.shape[0]} embeddings for {n} words; they must match"
        )

    # Unit-length rows turn the dot product into cosine similarity.
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

    col_ids = torch.arange(n, device=device)
    results = []

    for start in range(0, n, batch_size):
        batch = embeddings[start:start + batch_size]

        # shape: (rows_in_batch, n); clamp away floating-point overshoot
        sims = torch.clamp(batch @ embeddings.T, min=-1.0, max=1.0)

        # Global row index of every row in this block. Row r of the block is
        # word `start + r`, so its self-similarity sits in column `start + r`,
        # not on the block's own diagonal.
        row_ids = torch.arange(start, start + batch.shape[0], device=device)

        # Keep the strict upper triangle only (col > row): this removes
        # self-similarity and the mirrored copy of every pair.
        sims.masked_fill_(col_ids.unsqueeze(0) <= row_ids.unsqueeze(1), float("-inf"))

        # Never ask topk for more elements than the block holds.
        k = min(top_k, sims.numel())
        top_vals, top_flat = torch.topk(sims.reshape(-1), k)

        for score, flat_idx in zip(top_vals.tolist(), top_flat.tolist()):
            if score == float("-inf"):
                # topk output is sorted descending: only masked cells remain.
                break
            row, col = divmod(flat_idx, n)
            results.append((slang[start + row], slang[col], score))

    # Merge the per-block winners into one global ranking.
    results.sort(key=lambda x: x[2], reverse=True)
    return results[:top_k]

In [None]:
# Collect the 20 highest cosine-similarity pairs as (word 1, word 2, score) tuples.
similarities = calculate_similarity(word_embeddings, slang, top_k=20)

In [None]:
import pandas as pd
# Tabulate the (word 1, word 2, score) tuples, one row per pair.
df_similarities = pd.DataFrame(similarities, columns=["Word 1", "Word 2", "Cosine Similarity"])

In [None]:
# NOTE(review): in the recorded output every score is exactly 1.0, yet most of
# the paired words look unrelated — treat these pairs as suspect and check
# how the pairs are labelled before drawing conclusions from them.
print(df_similarities)

          Word 1             Word 2  Cosine Similarity
0    more better          mo better                1.0
1      man whore           korwhore                1.0
2       'bout it                tap                1.0
3        rim job         hojillions                1.0
4       Goat Ass  dell holiday n00b                1.0
5            AIM           SHNIEZER                1.0
6      hard-core          cockfight                1.0
7           word               Lies                1.0
8        massive       Pussy Patrol                1.0
9           Hazy        Oscar Mayer                1.0
10       No Shit             snatch                1.0
11      Dry hump                net                1.0
12        A-hole           Midtowns                1.0
13  cluster fuck        grow a tail                1.0
14        Rental              Frope                1.0
15    Toss salid   Bathtub in Jello                1.0
16      cassette               chub                1.0
17        

In [None]:
def calculate_dissimilarity(word_embeddings, slang, bottom_k=10, batch_size=1000):
    """Return the `bottom_k` least cosine-similar pairs of alphabetic words.

    Only words for which `str.isalpha()` is true take part. The similarity
    matrix is computed in row blocks of `batch_size` so that the full n x n
    matrix never has to be held in memory at once.

    Args:
        word_embeddings: array-like with one embedding row per entry of `slang`.
        slang: sequence of strings labelling the rows of `word_embeddings`.
        bottom_k: number of pairs to return.
        batch_size: number of rows processed per block.

    Returns:
        A list of at most `bottom_k` `(word_1, word_2, score)` tuples sorted by
        ascending cosine similarity. Each unordered pair appears once, with
        `word_1` preceding `word_2` in `slang`.

    Raises:
        ValueError: if `slang` and `word_embeddings` have different lengths.
    """
    n = len(slang)
    if bottom_k <= 0 or n < 2:
        return []

    # Use the GPU when there is one, but keep working on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embeddings = torch.as_tensor(word_embeddings, dtype=torch.float32).to(device)
    if embeddings.shape[0] != n:
        raise ValueError(
            f"got {embeddings.shape[0]} embeddings for {n} words; they must match"
        )

    # Unit-length rows turn the dot product into cosine similarity.
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

    col_ids = torch.arange(n, device=device)
    # Exclude non-alphabetic words *before* selecting the bottom-k, so they
    # cannot use up slots that valid pairs would otherwise have taken.
    is_alpha = torch.tensor([w.isalpha() for w in slang], dtype=torch.bool, device=device)
    results = []

    for start in range(0, n, batch_size):
        batch = embeddings[start:start + batch_size]

        # shape: (rows_in_batch, n); clamp away floating-point overshoot
        sims = torch.clamp(batch @ embeddings.T, min=-1.0, max=1.0)

        # Global row index of every row in this block (row r is word start + r).
        row_ids = torch.arange(start, start + batch.shape[0], device=device)

        # Keep the strict upper triangle only (col > row) — this drops
        # self-pairs and mirrored duplicates — and drop any cell that
        # involves a non-alphabetic word.
        excluded = (
            (col_ids.unsqueeze(0) <= row_ids.unsqueeze(1))
            | ~is_alpha[row_ids].unsqueeze(1)
            | ~is_alpha.unsqueeze(0)
        )
        sims.masked_fill_(excluded, float("inf"))

        # Never ask topk for more elements than the block holds.
        k = min(bottom_k, sims.numel())
        low_vals, low_flat = torch.topk(sims.reshape(-1), k, largest=False)

        for score, flat_idx in zip(low_vals.tolist(), low_flat.tolist()):
            if score == float("inf"):
                # Output is sorted ascending: only excluded cells remain.
                break
            row, col = divmod(flat_idx, n)
            results.append((slang[start + row], slang[col], score))

    # Merge the per-block candidates into one global ranking, lowest first.
    results.sort(key=lambda x: x[2])
    return results[:bottom_k]

In [None]:
# Collect up to 20 of the lowest cosine-similarity pairs among purely alphabetic words.
dissimilarities = calculate_dissimilarity(word_embeddings, slang, bottom_k=20)

In [None]:
# Tabulate the (word 1, word 2, score) tuples, one row per pair.
df_dissimilarities = pd.DataFrame(dissimilarities, columns=["Word 1", "Word 2", "Cosine Similarity"])

In [None]:
# NOTE(review): in the recorded output several different word pairs share an
# identical score (e.g. rows 2 and 3, rows 8 and 9) — worth double-checking
# that the pair labels match the scores.
print(df_dissimilarities)

     Word 1                Word 2  Cosine Similarity
0       dhj                 ridlo          -0.726956
1        AM                  ICBM          -0.721447
2      dong                    BS          -0.714703
3       bah  sternoplydomastoidus          -0.714703
4     queaf                 ridlo          -0.710698
5     bones                    BS          -0.708681
6       bah             gastamate          -0.708044
7       POS                 ridlo          -0.705025
8     pants                    BS          -0.702459
9       bah           CRUNCHBLORT          -0.702459
10  classic                    cy          -0.701574
11     veto                    BS          -0.700147
12  backend                 ridlo          -0.699671
13      bah       McFrostyofkiriA          -0.697927
14     gank                    BS          -0.695733
15     bufu                    cy          -0.694465
16       AM                   dkv          -0.694465
17      bah                 GOUNI          -0.

In [None]:
# Persist both tables to the working directory; index=False leaves out the
# DataFrame's row-index column so the CSVs contain only the three data columns.
df_similarities.to_csv("similarities.csv", index=False)
df_dissimilarities.to_csv("dissimilarities.csv", index=False)