In [1]:
!pip install python-Levenshtein



In [2]:
!pip install -U accelerate



### Import Libraries & Modules

In [3]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import json
import re
import requests
import string
import matplotlib.pyplot as plt

import spacy
from textblob import TextBlob
tokens = spacy.load("en_core_web_sm")
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from Levenshtein import distance
from functools import reduce
from collections import Counter
from itertools import chain

import nltk
nltk.download("stopwords")
nltk.download('brown')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import norm
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.metrics import edit_distance

import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package brown to /usr/share/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Using device: cuda


### Import Data

In [4]:
datapath = '/kaggle/input/final-data/final_data-2.csv'
data = pd.read_csv(datapath)
data = data.dropna(axis=0)
print(data.shape)
data.sample(10)

(38547, 2)


Unnamed: 0,Target,Clues
18459,nothing,inconsequential conversation
26145,yet,present time
13942,gloominess,virgil great disgust
37739,sympathy vengeance,year imprisonment old boy lee guem ja boy deat...
1925,unalienable,creator certain unalienable right
33816,vettel,bull sebastian vettel double world champion race
12201,woven,woven basket
20829,burn,dress damage
37394,godfather,span italian american corleone crime family cr...
10931,ingenuous,ingenuous explanation


### Import Wordnet Data

In [5]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
replace /usr/share/nltk_data/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [6]:
noun_lemmas_in_wordnet = set(chain(*[ss.lemma_names() for ss in wn.all_synsets(pos='n')]))

def get_word_relations(word):
    synsets = wn.synsets(word, pos='n')
    antonyms = set()
    hyponyms = set()
    synonyms = set()
    hypernyms = set()
    polysems = set()
    metonyms = set()
    part_meronyms = set()
    member_meronyms = set()
    substance_meronyms = set()
    part_holonyms = set()
    member_holonyms = set()
    substance_holonyms = set()
    troponyms = set()
    entails = set()

    for synset in synsets:
        antonyms.update(lemma.name().split('.')[0] for lemma in synset.lemmas() if lemma.antonyms())
        hyponyms.update(lemma.name().split('.')[0] for lemma in synset.lemmas() if lemma.hyponyms())
        synonyms.update(lemma.name().split('.')[0] for lemma in synset.lemmas())
        hypernyms.update(hypernym.name().split('.')[0] for hypernym in synset.hypernyms())
        polysems.update(syn.name().split('.')[0] for syn in wn.synsets(word))
        metonyms.update(related.name().split('.')[0] for related in synset.part_meronyms())
        part_meronyms.update(related.name().split('.')[0] for related in synset.part_meronyms())
        member_meronyms.update(related.name().split('.')[0] for related in synset.member_meronyms())
        substance_meronyms.update(related.name().split('.')[0] for related in synset.substance_meronyms())
        part_holonyms.update(related.name().split('.')[0] for related in synset.part_holonyms())
        member_holonyms.update(related.name().split('.')[0] for related in synset.member_holonyms())
        substance_holonyms.update(related.name().split('.')[0] for related in synset.substance_holonyms())
        troponyms.update(lemma.name().split('.')[0] for lemma in synset.lemmas() if lemma.derivationally_related_forms())
        entails.update(related.name().split('.')[0] for related in synset.entailments())

    return {
        'Target': word,
        'Clues': ', '.join(set(chain(member_meronyms, member_holonyms, part_meronyms, part_holonyms, hypernyms, hyponyms)))
    }

word_relations_list = [get_word_relations(word) for word in noun_lemmas_in_wordnet]
wordnet_words = pd.DataFrame(word_relations_list)
print(wordnet_words.shape)
wordnet_words.sample(10)

(119034, 2)


Unnamed: 0,Target,Clues
84122,conic,plane_figure
29106,Plethodon_vehiculum,"salamander, plethodon"
104475,Faulkner,
36612,decile,mark
76957,Pezophaps_solitaria,"pezophaps, columbiform_bird"
65962,barbwire,"barb, wire"
74967,sensationalism,"message, journalese, philosophical_doctrine"
116044,Levi,
76863,salvinorin,hallucinogen
104582,Neolentinus,"polyporaceae, fungus_genus"


In [7]:
data = pd.concat([data, wordnet_words], ignore_index=True)
data = data[data['Target'].apply(lambda x: len(x) > 0)]
data = data[data['Clues'].apply(lambda x: len(x) > 0)]
data = data.groupby('Target', as_index=False)['Clues'].agg(', '.join)
data.to_csv('final_data.csv')
print(data.shape)
data.sample(10)

(125075, 2)


Unnamed: 0,Target,Clues
39482,catamountain,"wildcat, felis"
18588,Pinus_banksiana,pine
106033,shortfall,insufficiency
46447,cross_section,"section, sample, probability"
96592,proprietress,owner
47704,dawn_redwood,"conifer, genus_metasequoia"
106411,signal_detection,reception
100724,reverberation,"market crash indirect consequence, reflection,..."
17394,Otaria_Byronia,"sea_lion, otaria"
119167,unexampled,previous example


### Word2Vec Model with Skip-Gram

In [8]:
def tokenizer(sentence):
    sentence = sentence.replace('_', ' ')
    return word_tokenize(sentence)

tokenized_data_target = [tokenizer(str(sentence)) for sentence in data['Target']]
tokenized_data_clues = [tokenizer(str(sentence)) for sentence in data['Clues']]
tokenized_data_combined = tokenized_data_target + tokenized_data_clues

w2v_model = Word2Vec(sentences=tokenized_data_combined, vector_size=100, window=5, min_count=1, sg=1, workers=4)

In [9]:
def common_words(words, top_n=10):
    similar_words = []
    similar_score = []
    for word in words:
        try:
            if word in w2v_model.wv:
                similar_words += [similar_word for similar_word, _ in w2v_model.wv.most_similar(word, topn=top_n)]
                similar_score += [similar_score for _, similar_score in w2v_model.wv.most_similar(word, topn=top_n)]
        except KeyError:
            pass

    data = pd.DataFrame({'Similar_Words': similar_words, 'Similar_Scores': similar_score})
    ranked_data = data.groupby("Similar_Words")["Similar_Scores"].mean()
    ranked_data = ranked_data.sort_values(ascending=False)

    return ranked_data

In [10]:
input_words = ['laptop', 'pc', 'smartphone']
common_words_result = common_words(input_words)
print(common_words_result)

Similar_Words
user              0.994676
random-access     0.994217
file              0.992042
read-only         0.990551
portable          0.989835
analog            0.989219
microprocessor    0.988497
engineer          0.988422
printed           0.987369
analogue          0.986432
microsoft         0.986010
oscillator        0.984504
diode             0.984464
steele            0.984211
kahn              0.983643
filename          0.983211
push-button       0.982999
nefarious         0.982888
semiautomatic     0.982814
equilateral       0.982773
cyrillic          0.979454
fuel-air          0.978120
instantaneous     0.977929
random            0.977753
ionic             0.977746
jockey            0.977550
methane           0.977197
equation          0.977061
iambic            0.976934
Name: Similar_Scores, dtype: float64


In [11]:
input_words = ['apple', 'banana', 'grape', 'mango']
common_words_result = common_words(input_words)
print(common_words_result)

Similar_Words
mangosteen       0.994756
simarouba        0.994584
incense          0.994515
medlar           0.994426
diospyros        0.994119
olea             0.994118
dovyalis         0.994090
marmalade        0.993838
malus            0.993831
elaeocarpus      0.993776
celery           0.991166
cotton           0.990943
tuber            0.990456
salad            0.990044
snail            0.989869
castor           0.989640
juniper          0.989112
peel             0.988642
mandrake         0.988631
indigofera       0.988569
flowering        0.985094
climbing         0.978654
plum             0.978408
kauri            0.978228
bamboo           0.977985
aphid            0.977871
prairie          0.977181
fragrant         0.977081
vinifera         0.976726
ceratopteris     0.976377
pteridaceae      0.976176
custard          0.975045
angiospermous    0.970106
rose             0.967844
nut              0.967737
fig              0.967681
quandong         0.966898
hickory          0.96496

In [12]:
input_words = ['spoon', 'fork', 'butterknife']
common_words_result = common_words(input_words)
print(common_words_result)

Similar_Words
hanger       0.992066
appendage    0.991877
beater       0.991868
lining       0.991714
vitreous     0.991585
can          0.991398
sharpener    0.991372
baked        0.991286
cookie       0.991248
cutlery      0.991008
cheese       0.990893
wiper        0.990877
log          0.990710
frozen       0.990496
bubble       0.990330
boil         0.990290
stain        0.989989
carrycot     0.989599
sore         0.989501
aphasia      0.989486
Name: Similar_Scores, dtype: float64
