# Domain Name Curator

The natural language component of <i>IntelliSearch</i>. It uses both the spaCy and NLTK libraries to sort through the millions of domain names produced by the previous two components and to build a word association network that connects all the names by assigning association strength values between them (more on this later).

This section can be further broken down into 3 subsections:
- Using NLTK POS tagger to classify existing words into specific categories
- Building a word association network with spaCy
- Using the NLTK library to find synonyms for a given word

## Using NLTK POS tagger to classify existing words into specific categories


In [None]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.5.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 11.0 MB/s 
[?25hCollecting click
  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 2.5 MB/s 
Collecting regex
  Downloading regex-2020.11.13-cp37-cp37m-manylinux2014_x86_64.whl (719 kB)
[K     |████████████████████████████████| 719 kB 40.1 MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Created wheel for nltk: filename=nltk-3.5-py3-none-any.whl size=1434676 sha256=c37c2abc74da890133af6503e9767bc183ec1994c8cd59b7520ddb60eb521171
  Stored in directory: /home/jovyan/.cache/pip/wheels/45/6c/46/a1865e7ba706b3817f5d1b2ff7ce8996aabdd0d03d47ba0266
Successfully built nltk
Installing collected packages: click, regex, nltk
Successfully installed click-7.1.2 nltk-3.5 regex-2020.11.13
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.

In [None]:
import nltk
# Fetch the Brown corpus, whose tagged words are read further down with
# tagset="universal"; 'universal_tagset' is presumably the resource that
# provides that tag mapping.
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [None]:
# Frequency table keyed by lower-cased word, counting how often each
# universal POS tag is attached to that word in the Brown corpus.
wordtags = nltk.ConditionalFreqDist(
    (token.lower(), tag)
    for token, tag in nltk.corpus.brown.tagged_words(tagset="universal")
)

In [None]:
def getCategory(word):
    """Return the part-of-speech categories recorded for ``word``.

    The word is looked up in the module-level ``wordtags`` table (whose keys
    are lower-cased) and every tag found there is translated into a plural,
    human-readable category name.

    Args:
        word: The word to classify. It is lower-cased before the lookup so
            that it matches the lower-cased keys of ``wordtags``.

    Returns:
        A list of category names, in the order the tags are stored in
        ``wordtags`` and without duplicates. Words with no entry, and tags
        that are missing from the mapping (for example the ``'.'``
        punctuation tag of the universal tagset), are reported as ``'other'``.
    """
    category_map = {
        'NOUN': 'nouns',
        'VERB': 'verbs',
        'ADJ': 'adjectives',
        'ADV': 'adverbs',
        'ADP': 'adpositions',
        'PRON': 'pronouns',
        'CONJ': 'conjunctions',
        'DET': 'determiners',
        'NUM': 'numbers',
        'PRT': 'particles',
        'X': 'other'
    }

    key = word.lower()
    # Test membership before indexing so that a defaultdict-like table does
    # not grow a new empty entry for every unknown word that is queried.
    tags = list(wordtags[key]) if key in wordtags else []
    if not tags:
        return ['other']

    # Unknown tags fall back to 'other' instead of raising KeyError.
    names = [category_map.get(tag, 'other') for tag in tags]
    # Several tags can map to 'other'; keep the first occurrence only.
    return list(dict.fromkeys(names))

In [None]:
# Spot-check getCategory on a few sample words.
words = [
    'apple',
    'apply',
    'beautiful',
    'funny',
    'joke',
    'happy',
]

for word in words:
    print(word, getCategory(word))

apple ['nouns']
apply ['verbs']
beautiful ['adjectives']
funny ['adjectives', 'adverbs']
joke ['nouns', 'verbs']
happy ['adjectives', 'nouns']


## Building a word association network with spaCy

In [None]:
!pip install spacy
!python -m spacy download en_core_web_lg

You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
import csv
import spacy
import en_core_web_lg

# Load the en_core_web_lg pipeline once; `nlp` is reused by every later cell
# that tokenizes text or compares tokens.
nlp = en_core_web_lg.load()

In [None]:
# Run four sample words through the pipeline to inspect their vectors.
tokens = nlp('great steve fantastic watson')

# Attributes printed for every token:
#   text        - the word string
#   has_vector  - if it contains a vector representation in the model
#   vector_norm - the algebraic norm of the vector
#   is_oov      - if the word is out of vocabulary
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

token1, token2, token3, token4 = (tokens[k] for k in range(4))

# Similarity score for each unordered pair of the four sample tokens.
for left, right in (
    (token1, token2),
    (token1, token3),
    (token1, token4),
    (token2, token3),
    (token2, token4),
    (token3, token4),
):
    print("Similarity:", left, right, left.similarity(right))

great True 5.4395933 False
steve True 6.181552 False
fantastic True 5.561246 False
watson True 6.6602826 False
Similarity: great steve 0.21640417
Similarity: great fantastic 0.81248736
Similarity: great watson 0.07907305
Similarity: steve fantastic 0.16961657
Similarity: steve watson 0.4813081
Similarity: fantastic watson 0.05662866


In [None]:
category_list = ["adjectives", "battleships", "nouns", "verbs", "predicates", "positive", "tech", "places", "names", "gods", "stars", "collections"]

# Maps each normalised (lower-cased, stripped) word to a record of the form
# {'category': [category names], 'similarity': [filled in by a later cell]}.
word_list = {}

for category in category_list:

    txt_file = "datasets/word-files/" + category + ".txt"

    # The file is only read, so plain 'r' is enough; 'r+' would additionally
    # require write permission on the data files.
    with open(txt_file, 'r') as f:
        for line in f.read().splitlines():
            word = line.lower().strip()
            if not word:
                # A blank line would otherwise register the empty string as a word.
                continue

            if category == 'other':
                # NOTE(review): 'other' is not in category_list above, so this
                # branch is currently unreachable; it is kept in case that
                # category file is added back.
                categories = getCategory(word)
                categories.append(category)
                categories = list(dict.fromkeys(categories))
                word_list[word] = {'category': categories, 'similarity': []}
            else:
                # Create the record on first sight, then tag it with this
                # category (once, even if the file repeats the word).
                entry = word_list.setdefault(word, {'category': [], 'similarity': []})
                if category not in entry['category']:
                    entry['category'].append(category)


In [None]:
# getting all the keys of the dictionary
all_words = list(word_list)

# The tokenizer breaks some entries into several tokens (hyphenated words and
# contractions, for instance), which would leave tokens that are not keys of
# word_list. Each word is therefore tokenized on its own and dropped unless it
# comes back as exactly one token with identical text. This does not rely on a
# `tokens` value left over from an earlier run, and it does not assume that a
# split always yields exactly two tokens.
splitted_words = [
    word for word in all_words
    if [piece.text for piece in nlp.tokenizer(word)] != [word]
]
count = len(splitted_words)
print(count, splitted_words)

dropped = set(splitted_words)
all_words = [word for word in all_words if word not in dropped]

# tokenizing all the words
tokens = nlp(' '.join(all_words))

# Later cells index tokens and all_words in parallel, so they must line up.
assert len(tokens) == len(all_words), "tokens and all_words are out of step"

10 ['so-called', 'optional', 'cant', 'long-term', 'kidney', 'wed', 'gonna', "don't", 'gotta', "won't"]


In [None]:
# Makeshift priority queue function that would take the newest tuple, 
# compare it to the existing list of tuples, and if it's bigger than the smallest
# tuple in the list, bump that tuple out and append this new tuple to the list
# Then, sort the list and return

def addToList(ele, lst, num_ele):
    """Insert ``ele`` into a bounded list of (word, score) pairs kept in descending order.

    Args:
        ele: ``(word, score)`` tuple, where ``score`` is a number.
        lst: List of ``(word, score_as_string)`` tuples sorted by score from
            largest to smallest. It is modified in place.
        num_ele: Maximum number of entries the list should hold.

    Returns:
        The same ``lst`` object. The new pair is added when there is room, or
        when its score beats the smallest stored score (which is then
        evicted). A word that is already present is never added again.
    """
    word, score = ele[0], ele[1]

    # A non-positive capacity can never accept anything; returning early also
    # avoids indexing lst[-1] on an empty list.
    if num_ele <= 0:
        return lst

    # Stored scores are strings while the incoming score is a number, so a
    # plain `ele in lst` test can never match; compare the words instead.
    if any(existing[0] == word for existing in lst):
        return lst

    if len(lst) >= num_ele:
        # At capacity: only a score strictly larger than the current minimum
        # (the last element) may replace it.
        if not score > float(lst[-1][1]):
            return lst
        lst.pop(-1)

    lst.append((word, str(score)))
    lst.sort(key=lambda pair: float(pair[1]), reverse=True)
    return lst

In [None]:
# Nested loop that scores every pair of words and records, for each word, its
# strongest associations (at most MAX_ASSOCIATIONS per word).
# Note: This takes quite a long time to run (~8 hours, if not more)

MAX_ASSOCIATIONS = 100

for i in range(len(all_words)):
    token_i = tokens[i]
    word_i = str(token_i)
    entry_i = word_list[word_i]

    for j in range(i, len(all_words)):
        token_j = tokens[j]
        word_j = str(token_j)

        # One similarity call per pair; the same value is stored in both directions.
        score = token_i.similarity(token_j)

        entry_i['similarity'] = addToList((word_j, score), entry_i['similarity'], MAX_ASSOCIATIONS)

        # For the self-pair (j == i) both updates would target the same list,
        # so the second one is skipped to avoid storing the word twice.
        if j != i:
            entry_j = word_list[word_j]
            entry_j['similarity'] = addToList((word_i, score), entry_j['similarity'], MAX_ASSOCIATIONS)

    print('Done with ', all_words[i])

## Using the NLTK library to find synonyms for a given word

In [None]:
# Fetch the WordNet corpus that the synonym lookup below reads from.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.corpus import wordnet 
  
def find_synonyms(word):
    """Collect the WordNet lemma names related to ``word``.

    Every synset returned by ``wordnet.synsets(word)`` is scanned and the
    names of its lemmas are gathered, lower-cased, in first-seen order.

    Args:
        word: The word to look up.

    Returns:
        A list of unique lower-case lemma names. Names containing a digit,
        space, or any of ``: ( ) - # _`` are skipped. The looked-up word
        itself may be part of the list; if it is the *only* entry, an empty
        list is returned instead.
    """
    forbidden_chars = set('0123456789 :()-#_')

    synonyms = []
    seen = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name()
            if forbidden_chars.intersection(name):
                continue
            # Lower-case before the duplicate check so that names differing
            # only in capitalisation are stored once.
            name = name.lower()
            if name not in seen:
                seen.add(name)
                synonyms.append(name)

    # A word whose only "synonym" is itself has no real synonyms.
    if synonyms == [word.lower()]:
        return []
    return synonyms

In [None]:
# Example lookup; the returned list is shown as the cell output.
find_synonyms('funny')

['funny',
 'amusing',
 'comic',
 'comical',
 'laughable',
 'mirthful',
 'risible',
 'curious',
 'odd',
 'peculiar',
 'queer',
 'rum',
 'rummy',
 'singular',
 'fishy',
 'shady',
 'suspect',
 'suspicious']