In [1]:
# Import the essential packages
# NLTK
# SpaCy
# Scikit-learn

!pip install -q nltk
!pip install -q spacy
!pip install -q scikit-learn # Not used yet

import nltk
import spacy
import sklearn

print("Import done!")

ERROR: Invalid requirement: '#'


Import done!


In [72]:
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ew464\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [3]:
import utils
nltk.download("punkt", quiet = True)

print("Imported utils")

Imported utils


In [4]:
# Download Spacy English model
!python -m spacy download en_core_web_lg > NUL 2>&1

print("Spacy English installed")

Spacy English installed


In [5]:
from nltk.corpus import twitter_samples

nltk.download("twitter_samples")

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\ew464\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [6]:
import re

In [7]:
def GetProperNouns(doc):
    properNouns = []
    for token in doc:
        if token.pos_ == "PROPN":
            properNouns.append(token.text)
    return properNouns

In [8]:
# Returns a list of named entities 
def GetNamedEntities(doc):
    entities = []
    for ent in doc.ents:
        entities.append(ent.text)
    return entities

In [9]:
def RemoveNamedEntities(doc, namedEntities):
    document = doc
    for namedEntity in namedEntities:
        if namedEntity in doc:
            document = document.replace(namedEntity, "")
    return document

In [10]:

# The document as a string
docstring = "Epic Games Fornite skibidi toilet anita max wynn awefawefawfwf. The game's diverse world captivates players worldwide."

# Load Spacy large English model and set up Spacy
nlp = spacy.load("en_core_web_lg")
doc = nlp(docstring)

# Get proper nouns and named entities as lists
properNouns = GetProperNouns(doc)
namedEntities = GetNamedEntities(doc)
print(properNouns)
print(namedEntities)

# Remove named entities from the document
doc_noEntities = RemoveNamedEntities(docstring, namedEntities)

# Normalize the document
doc_canonized = utils.canonize(doc_noEntities)

doc_canonized

['Epic', 'Games', 'Fornite', 'skibidi', 'anita', 'max', 'wynn', 'awefawefawfwf']
['Epic Games', 'Fornite', 'wynn awefawefawfwf']


['skibidi',
 'toilet',
 'anita',
 'max',
 'The',
 'game',
 "'s",
 'diverse',
 'world',
 'captivates',
 'players',
 'worldwide']

In [11]:
FreqDist(doc_canonized)

FreqDist({'skibidi': 1, 'toilet': 1, 'anita': 1, 'max': 1, 'The': 1, 'game': 1, "'s": 1, 'diverse': 1, 'world': 1, 'captivates': 1, ...})

In [12]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
tweets = positive_tweets + negative_tweets

In [13]:
def remove_twitter_handles(tweet):
    mention_pattern = r'@[A-Za-z0-9_]+'

    # Replace all matches with an empty string
    cleaned_text = re.sub(mention_pattern, '', tweet)
    
    return cleaned_text

In [14]:
def remove_twitter_hashtags(tweet):
    hashtag_pattern = r'#[A-Za-z0-9_]+'

    cleaned_text = re.sub(hashtag_pattern, '', tweet)
    
    return cleaned_text

In [15]:
def remove_links(text):
    doc = nlp(text)
    cleaned_text = ' '.join(token.text for token in doc if not token.like_url)
    return cleaned_text

In [16]:
def normalize_tweets(tweets):
    normalized_tweets = []

    for tweet in tweets:
        tweet = remove_twitter_handles(tweet)
        tweet = remove_twitter_hashtags(tweet)
        tweet = remove_links(tweet)
        tweet = utils.canonize(tweet)
        normalized_tweets.append(tweet)
    return normalized_tweets

normalized_tweets = normalize_tweets(tweets)

In [17]:
# Returns the token count of the whole corpora as a single FreqDist
# Input: A list of lists
# Output: A single FreqDist

def get_corpus_FreqDist(corpus):
    corpus_FreqDist = FreqDist()
    for document in corpus:
        corpus_FreqDist += FreqDist(document)
    
    return corpus_FreqDist

get_corpus_FreqDist(normalized_tweets)


FreqDist({'I': 3209, 'NUM': 942, "'s": 873, "n't": 839, 'D': 674, "'m": 523, '``': 478, '...': 462, 'like': 403, 'u': 391, ...})

In [18]:
normalized_tweets

[['top', 'engaged', 'members', 'community', 'week'],
 ['Hey',
  'James',
  'How',
  'odd',
  'Please',
  'call',
  'Contact',
  'Centre',
  'NUM',
  'able',
  'assist',
  'Many',
  'thanks'],
 ['listen',
  'last',
  'night',
  'As',
  'You',
  'Bleed',
  'amazing',
  'track',
  'When',
  'Scotland'],
 ['CONGRATS'],
 ['yeaaaah',
  'yippppy',
  'accnt',
  'verified',
  'rqst',
  'succeed',
  'got',
  'blue',
  'tick',
  'mark',
  'fb',
  'profile',
  'NUM',
  'days'],
 ['This', 'one', 'irresistible'],
 ['We',
  "n't",
  'like',
  'keep',
  'lovely',
  'customers',
  'waiting',
  'long',
  'We',
  'hope',
  'enjoy',
  'Happy',
  'Friday',
  'LWWF'],
 ['On',
  'second',
  'thought',
  '‚Äô',
  'enough',
  'time',
  'DD',
  'But',
  'new',
  'shorts',
  'entering',
  'system',
  'Sheep',
  'must',
  'buying'],
 ['Jgh', 'go', 'Bayan', 'D', 'bye'],
 ['As',
  'act',
  'mischievousness',
  'calling',
  'ETL',
  'layer',
  'house',
  'warehousing',
  'app',
  'Katamari',
  'Well',
  '‚Ä¶',
  'na

### TFIDF Calculations

In [19]:
def get_tf(term, document):

    delimiter = " "
    term_normalized = delimiter.join(utils.canonize(term))

    # print(term_normalized)

    numOfAppearancesInDoc = document.count(term_normalized)
    totalNumOfTermsInDoc = len(document)

    # print("NumOfAppearancesInDoc: {}".format(numOfAppearancesInDoc))
    # print("TotalNumOfTermsInDoc: {}".format(totalNumOfTermsInDoc))
  
    if (totalNumOfTermsInDoc != 0):
        return numOfAppearancesInDoc / totalNumOfTermsInDoc
    else:
        return 0

get_tf("demo string", "This is a 'demo string' in a demo sentence")

0.023809523809523808

In [20]:
def get_idf(term, corpus):

    delimiter = " "
    term_normalized = delimiter.join(utils.canonize(term))

    numOfDocsInCorpus = len(corpus)
    numOfAppearrancesInCorpus = 0
    
    for document in corpus:
        
        if term_normalized in document:
            
            numOfAppearrancesInCorpus += 1

    # print(numOfAppearrancesInCorpus)

    if (numOfAppearrancesInCorpus != 0):
        return numOfDocsInCorpus / numOfAppearrancesInCorpus
    else:
        # Return high IDF value as the term is very uncommon
        return 10000


get_idf("Scotland", normalized_tweets)

2500.0

In [21]:
def get_tfidf(term, document, corpus):

    # print(get_tf(term, document))
    # print(get_idf(term, corpus))

    return get_tf(term, document) * get_idf(term, corpus)


In [87]:
def get_keywords(sentence, corpus, numOfKeywords = 10):

    

    # Get the most frequent named entities (Using the raw string here for better entity recognition)
    doc = nlp(sentence)
    named_entities = GetNamedEntities(doc)

    entityCounts = {}

    for entity in named_entities:
        if entity not in entityCounts:
            entityCounts[entity] = sentence.count(entity)




    # Get the most popular terms that are not entities
    normalized_sentence = utils.canonize(RemoveNamedEntities(sentence, named_entities))
    
    mostRelevantWords = {}


    for term in normalized_sentence:
        if term not in mostRelevantWords:

            pos_tag = nltk.pos_tag(word_tokenize(term))[0][1]

            mostRelevantWords[term] = [get_tfidf(term, normalized_sentence, corpus), pos_tag]
    
    return dict(sorted(mostRelevantWords.items(), key=lambda item: item[1], reverse=True)[:5]), dict(sorted(entityCounts.items(), key=lambda item: item[1], reverse=True)[:5])

In [83]:
pos_tag(word_tokenize("John"))

[('John', 'NNP')]

In [88]:
sentence = "Tried a new recipe for dinner and let's just say... ordering takeout would have been a better idea üçΩÔ∏èüòÖ #KitchenDisasters"

get_keywords(sentence, normalized_tweets)

({'ordering': [714.2857142857142, 'VBG'],
  'takeout': [714.2857142857142, 'NN'],
  'üçΩÔ∏èüòÖ': [714.2857142857142, 'NN'],
  'recipe': [238.0952380952381, 'NN'],
  'Tried': [142.85714285714286, 'VBN']},
 {'KitchenDisasters': 1})

## Test Sentence Generation

In [91]:
from nltk.parse.generate import generate, demo_grammar
from nltk import CFG

spec = """
    S -> NP VP
    NP -> DT NN | DT JJ NN | PRP | NP PP
    VP -> VBD NP | VB NP | VBD ADVP | VB ADVP | VP PP | VP CONJ VP
    PP -> IN NP
    ADVP -> RB | ADVP RB
    DT -> 'the' | 'a' | 'an' | 'this' | 'that' | 'these' | 'those' | 'my' | 'your'
    NN -> 'cat' | 'dog' | 'man' | 'woman' | 'park' | 'book'
    JJ -> 'big' | 'small' | 'green' | 'red' | 'beautiful' | 'old' | 'young'
    RB -> 'quickly' | 'happily' | 'very'
    PRP -> 'I' | 'you' | 'he' | 'she' | 'it' | 'we' | 'they'
    VBD -> 'chased' | 'saw' | 'read' | 'walked'
    VB -> 'chase' | 'see' | 'read' | 'walk'
    IN -> 'in' | 'on' | 'at' | 'by' | 'with'
    CONJ -> 'and' | 'or' | 'but'
    VBG -> 'ordering'
    NN -> 'takeout' | 'üçΩÔ∏èüòÖ' | 'recipe'
    VBN -> 'Tried'
    """

grammar = CFG.fromstring(spec)
print(grammar)

demo_grammar

Grammar with 67 productions (start state = S)
    S -> NP VP
    NP -> DT NN
    NP -> DT JJ NN
    NP -> PRP
    NP -> NP PP
    VP -> VBD NP
    VP -> VB NP
    VP -> VBD ADVP
    VP -> VB ADVP
    VP -> VP PP
    VP -> VP CONJ VP
    PP -> IN NP
    ADVP -> RB
    ADVP -> ADVP RB
    DT -> 'the'
    DT -> 'a'
    DT -> 'an'
    DT -> 'this'
    DT -> 'that'
    DT -> 'these'
    DT -> 'those'
    DT -> 'my'
    DT -> 'your'
    NN -> 'cat'
    NN -> 'dog'
    NN -> 'man'
    NN -> 'woman'
    NN -> 'park'
    NN -> 'book'
    JJ -> 'big'
    JJ -> 'small'
    JJ -> 'green'
    JJ -> 'red'
    JJ -> 'beautiful'
    JJ -> 'old'
    JJ -> 'young'
    RB -> 'quickly'
    RB -> 'happily'
    RB -> 'very'
    PRP -> 'I'
    PRP -> 'you'
    PRP -> 'he'
    PRP -> 'she'
    PRP -> 'it'
    PRP -> 'we'
    PRP -> 'they'
    VBD -> 'chased'
    VBD -> 'saw'
    VBD -> 'read'
    VBD -> 'walked'
    VB -> 'chase'
    VB -> 'see'
    VB -> 'read'
    VB -> 'walk'
    IN -> 'in'
    IN -> 'on'


"\n  S -> NP VP\n  NP -> Det N\n  PP -> P NP\n  VP -> 'slept' | 'saw' NP | 'walked' PP\n  Det -> 'the' | 'a'\n  N -> 'man' | 'park' | 'dog'\n  P -> 'in' | 'with'\n"

In [92]:
for sentence in generate(grammar, n=10):
    print(' '.join(sentence))

the cat chased the cat
the cat chased the dog
the cat chased the man
the cat chased the woman
the cat chased the park
the cat chased the book
the cat chased the takeout
the cat chased the üçΩÔ∏èüòÖ
the cat chased the recipe
the cat chased a cat
