<a href="https://colab.research.google.com/github/sammatuba/AI-NLP-Codecamp/blob/master/Advanced_Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import re

In [0]:
# N-gram

def generate_ngrams(text, n):
    """
    Build word-level n-grams from a piece of text.

    The text is lowercased, every character that is not an ASCII letter, a
    digit or whitespace is replaced by a space, and the result is split into
    tokens on runs of whitespace.

    Args:
        text: The input string.
        n: Number of consecutive tokens in each n-gram.

    Returns:
        A list of strings, each made of ``n`` tokens joined by single spaces,
        in order of appearance. The list is empty when ``n`` is smaller than 1
        or larger than the number of tokens.
    """
    # Convert to lowercase
    doc = text.lower()

    # Replace all non-alphanumeric characters with spaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', ' ', doc)

    # Split on any run of whitespace. The substitution above keeps newlines and
    # tabs, so splitting on a literal " " would leave them glued to tokens.
    tokens = doc.split()

    # zip over the token list shifted by 0..n-1 positions yields each window
    ngrams = zip(*[tokens[i:] for i in range(n)])

    # Concatenate the tokens into n-grams and return
    return [" ".join(ngram) for ngram in ngrams]

In [0]:
# Five short sentences used as the demo input for generate_ngrams.
sample_text = """He said thank you. He said bye as he walked through the door. He went to San Diego. San Diego has nice weather. It is raining in San Francisco."""

In [6]:
# NOTE(review): the saved output below lists two-word n-grams, which is not
# what n=3 produces -- the output looks stale; re-run this cell to refresh it.
generate_ngrams(sample_text, n=3)

['he said',
 'said thank',
 'thank you',
 'you he',
 'he said',
 'said bye',
 'bye as',
 'as he',
 'he walked',
 'walked through',
 'through the',
 'the door',
 'door he',
 'he went',
 'went to',
 'to san',
 'san diego',
 'diego san',
 'san diego',
 'diego has',
 'has nice',
 'nice weather',
 'weather it',
 'it is',
 'is raining',
 'raining in',
 'in san',
 'san francisco']

In [0]:
# TF/IDF/TF-IDF

import math
import re
import statistics

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [9]:
# Fetch the NLTK data packages used in this section: the stopwords corpus and
# the punkt tokenizer models (presumably what sent_tokenize / word_tokenize
# rely on -- confirm against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
def clean_text(document):
    """
    This function removes punctuation and surplus whitespace and returns the
    clean document text.

    Every character that is neither a word character (letter, digit,
    underscore) nor whitespace is deleted, each run of whitespace (newlines
    and tabs included) is collapsed into one space, and leading/trailing
    whitespace is stripped.

    Args:
        document: The text to clean.

    Returns:
        The cleaned string.
    """
    # Raw strings keep \w and \s as regex escapes instead of invalid Python
    # string escapes. Curly quotes and newlines need no separate handling:
    # the first pass deletes all punctuation, the second collapses whitespace.
    document = re.sub(r"[^\w\s]", "", document)
    document = re.sub(r"\s+", " ", document)

    return document.strip()

def clean_sentences(document):
    """
    This function splits the document into sentences with sent_tokenize, runs
    each sentence through clean_text, and returns the cleaned sentences as a
    list in their original order
    """
    cleaned = []

    for raw_sentence in sent_tokenize(document):
        cleaned.append(clean_text(raw_sentence))

    return cleaned

In [0]:
def count_words(sentence):
    """
    This function returns the number of tokens word_tokenize finds in a
    sentence; the count is used as the sentence length for the TF/IDF/TF-IDF
    computation.

    Args:
        sentence: The sentence text.

    Returns:
        The token count as an int.
    """
    # word_tokenize returns a list, so its length is the count directly
    return len(word_tokenize(sentence))


def create_doc_info(clean_sentences):
    """
    This function returns a list with one dictionary per sentence, holding the
    sentence's 1-based position ("doc_id") and its word count ("doc_length")
    as reported by count_words
    """
    return [
        {"doc_id": position, "doc_length": count_words(sentence)}
        for position, sentence in enumerate(clean_sentences, start=1)
    ]


def create_freq_dist(clean_sentences):
    """
    This function returns a list with one dictionary per sentence, holding the
    sentence's 1-based position and its word frequency distribution.

    Args:
        clean_sentences: Iterable of sentence strings.

    Returns:
        A list of ``{"doc_id": int, "freq_dict": dict}`` records, one per
        sentence and in the same order. ``freq_dict`` maps each lowercased
        token to the number of times it occurs in that sentence.
    """
    freq_dict_list = []

    for doc_id, sentence in enumerate(clean_sentences, start=1):
        freq_dict = {}

        for word in word_tokenize(sentence):
            word = word.lower()
            freq_dict[word] = freq_dict.get(word, 0) + 1

        # Build the record once per sentence, outside the word loop, so a
        # sentence without tokens gets its own empty entry instead of raising
        # or repeating the previous sentence's record.
        freq_dict_list.append({"doc_id": doc_id, "freq_dict": freq_dict})

    return freq_dict_list

In [0]:
def compute_tf(doc_info, freq_dict_list):
    """
    This function computes the term frequency of every word in every sentence:
    the word's count divided by the length of its sentence.

    The sentence length is read from doc_info at index doc_id - 1. The result
    is a flat list of {"doc_id", "tf_score", "key"} dictionaries.
    """
    tf_scores = []

    for entry in freq_dict_list:
        sentence_id = entry["doc_id"]

        # The length lookup stays inside the per-term expression so it only
        # happens for sentences that actually contain terms.
        tf_scores.extend(
            {
                "doc_id": sentence_id,
                "tf_score": occurrences / doc_info[sentence_id - 1]["doc_length"],
                "key": term,
            }
            for term, occurrences in entry["freq_dict"].items()
        )

    return tf_scores


def compute_idf(doc_info, freq_dict_list):
    """
    This function computes the inverse document frequency of every word in
    every sentence: log(number of sentences / number of sentences that
    contain the word).

    Args:
        doc_info: List with one entry per sentence; only its length is used,
            as the total number of sentences.
        freq_dict_list: List of {"doc_id", "freq_dict"} records.

    Returns:
        A flat list of {"doc_id", "idf_score", "key"} dictionaries, one per
        (sentence, word) pair, in the order of freq_dict_list.
    """
    total_docs = len(doc_info)

    # Count in how many sentences each term occurs, once up front, instead of
    # rescanning every sentence for every (sentence, term) pair.
    doc_frequency = {}
    for temp_dict in freq_dict_list:
        for term in temp_dict["freq_dict"]:
            doc_frequency[term] = doc_frequency.get(term, 0) + 1

    idf_scores = []

    for position, temp_dict in enumerate(freq_dict_list, start=1):
        # Label rows with the record's own doc_id, as compute_tf does, so the
        # two outputs can be joined on doc_id. Fall back to the 1-based
        # position when a record carries no doc_id.
        doc_id = temp_dict.get("doc_id", position)

        for term in temp_dict["freq_dict"]:
            idf_score = math.log(total_docs / doc_frequency[term])
            idf_scores.append({"doc_id": doc_id, "idf_score": idf_score, "key": term})

    return idf_scores


def compute_tfidf(tf_scores, idf_scores):
    """
    This function computes the TF-IDF value of every word in every sentence by
    multiplying matching TF and IDF scores.

    Args:
        tf_scores: List of {"doc_id", "tf_score", "key"} dictionaries.
        idf_scores: List of {"doc_id", "idf_score", "key"} dictionaries.

    Returns:
        A list of {"doc_id", "tfidf_score", "key"} dictionaries, one per IDF
        record that has a TF record with the same doc_id and key, in the order
        of idf_scores. IDF records without a matching TF record are skipped.
    """
    # Index the TF records by (doc_id, key) so each IDF record needs a single
    # lookup. If a pair occurs more than once, the last TF record wins.
    tf_lookup = {(tf["doc_id"], tf["key"]): tf for tf in tf_scores}

    tfidf_scores = []

    for idf in idf_scores:
        tf = tf_lookup.get((idf["doc_id"], idf["key"]))

        # Emit a row only for a real match; never reuse a previous row
        if tf is None:
            continue

        tfidf_scores.append({"doc_id": idf["doc_id"],
                             "tfidf_score": idf["idf_score"]*tf["tf_score"],
                             "key": tf["key"]})

    return tfidf_scores

In [0]:
# Same five sentences as the n-gram sample, but wrapped over three lines, so
# the string contains newline characters.
document = """He said thank you. He said bye as he walked through the door. He 
went to San Diego. San Diego has nice weather. It is raining in San 
Francisco."""

In [15]:
# Split the document into sentences and strip punctuation from each one.
# Despite its name, clean_document is a list of sentence strings.
clean_document = clean_sentences(document)
print(clean_document)

['He said thank you', 'He said bye as he walked through the door', 'He went to San Diego', 'San Diego has nice weather', 'It is raining in San Francisco']


In [16]:
# Per-sentence token counts and per-sentence word-frequency dictionaries;
# both lists number the sentences with the same 1-based doc_id.
doc_info = create_doc_info(clean_document)
freq_dict_list = create_freq_dist(clean_document)

print(doc_info)
print(freq_dict_list)

[{'doc_id': 1, 'doc_length': 4}, {'doc_id': 2, 'doc_length': 9}, {'doc_id': 3, 'doc_length': 5}, {'doc_id': 4, 'doc_length': 5}, {'doc_id': 5, 'doc_length': 6}]
[{'doc_id': 1, 'freq_dict': {'he': 1, 'said': 1, 'thank': 1, 'you': 1}}, {'doc_id': 2, 'freq_dict': {'he': 2, 'said': 1, 'bye': 1, 'as': 1, 'walked': 1, 'through': 1, 'the': 1, 'door': 1}}, {'doc_id': 3, 'freq_dict': {'he': 1, 'went': 1, 'to': 1, 'san': 1, 'diego': 1}}, {'doc_id': 4, 'freq_dict': {'san': 1, 'diego': 1, 'has': 1, 'nice': 1, 'weather': 1}}, {'doc_id': 5, 'freq_dict': {'it': 1, 'is': 1, 'raining': 1, 'in': 1, 'san': 1, 'francisco': 1}}]


In [0]:
# Term frequency: a word's count divided by the token count of its sentence.
tf_scores = compute_tf(doc_info, freq_dict_list)

In [0]:
# Inverse document frequency: log(sentences / sentences containing the word).
idf_scores = compute_idf(doc_info, freq_dict_list)

In [0]:
# Pair TF and IDF records on (doc_id, key) and multiply the two scores.
tfidf_scores = compute_tfidf(tf_scores, idf_scores)

In [21]:
# Inspect the TF records (the saved output below is cut off).
tf_scores

[{'doc_id': 1, 'key': 'he', 'tf_score': 0.25},
 {'doc_id': 1, 'key': 'said', 'tf_score': 0.25},
 {'doc_id': 1, 'key': 'thank', 'tf_score': 0.25},
 {'doc_id': 1, 'key': 'you', 'tf_score': 0.25},
 {'doc_id': 2, 'key': 'he', 'tf_score': 0.2222222222222222},
 {'doc_id': 2, 'key': 'said', 'tf_score': 0.1111111111111111},
 {'doc_id': 2, 'key': 'bye', 'tf_score': 0.1111111111111111},
 {'doc_id': 2, 'key': 'as', 'tf_score': 0.1111111111111111},
 {'doc_id': 2, 'key': 'walked', 'tf_score': 0.1111111111111111},
 {'doc_id': 2, 'key': 'through', 'tf_score': 0.1111111111111111},
 {'doc_id': 2, 'key': 'the', 'tf_score': 0.1111111111111111},
 {'doc_id': 2, 'key': 'door', 'tf_score': 0.1111111111111111},
 {'doc_id': 3, 'key': 'he', 'tf_score': 0.2},
 {'doc_id': 3, 'key': 'went', 'tf_score': 0.2},
 {'doc_id': 3, 'key': 'to', 'tf_score': 0.2},
 {'doc_id': 3, 'key': 'san', 'tf_score': 0.2},
 {'doc_id': 3, 'key': 'diego', 'tf_score': 0.2},
 {'doc_id': 4, 'key': 'san', 'tf_score': 0.2},
 {'doc_id': 4, 'key'

In [22]:
# Inspect the IDF records (the saved output below is cut off).
idf_scores

[{'doc_id': 1, 'idf_score': 0.5108256237659907, 'key': 'he'},
 {'doc_id': 1, 'idf_score': 0.9162907318741551, 'key': 'said'},
 {'doc_id': 1, 'idf_score': 1.6094379124341003, 'key': 'thank'},
 {'doc_id': 1, 'idf_score': 1.6094379124341003, 'key': 'you'},
 {'doc_id': 2, 'idf_score': 0.5108256237659907, 'key': 'he'},
 {'doc_id': 2, 'idf_score': 0.9162907318741551, 'key': 'said'},
 {'doc_id': 2, 'idf_score': 1.6094379124341003, 'key': 'bye'},
 {'doc_id': 2, 'idf_score': 1.6094379124341003, 'key': 'as'},
 {'doc_id': 2, 'idf_score': 1.6094379124341003, 'key': 'walked'},
 {'doc_id': 2, 'idf_score': 1.6094379124341003, 'key': 'through'},
 {'doc_id': 2, 'idf_score': 1.6094379124341003, 'key': 'the'},
 {'doc_id': 2, 'idf_score': 1.6094379124341003, 'key': 'door'},
 {'doc_id': 3, 'idf_score': 0.5108256237659907, 'key': 'he'},
 {'doc_id': 3, 'idf_score': 1.6094379124341003, 'key': 'went'},
 {'doc_id': 3, 'idf_score': 1.6094379124341003, 'key': 'to'},
 {'doc_id': 3, 'idf_score': 0.5108256237659907,

In [23]:
# Inspect the TF-IDF records (the saved output below is cut off).
tfidf_scores

[{'doc_id': 1, 'key': 'he', 'tfidf_score': 0.12770640594149768},
 {'doc_id': 1, 'key': 'said', 'tfidf_score': 0.22907268296853878},
 {'doc_id': 1, 'key': 'thank', 'tfidf_score': 0.40235947810852507},
 {'doc_id': 1, 'key': 'you', 'tfidf_score': 0.40235947810852507},
 {'doc_id': 2, 'key': 'he', 'tfidf_score': 0.11351680528133126},
 {'doc_id': 2, 'key': 'said', 'tfidf_score': 0.10181008131935056},
 {'doc_id': 2, 'key': 'bye', 'tfidf_score': 0.17882643471490003},
 {'doc_id': 2, 'key': 'as', 'tfidf_score': 0.17882643471490003},
 {'doc_id': 2, 'key': 'walked', 'tfidf_score': 0.17882643471490003},
 {'doc_id': 2, 'key': 'through', 'tfidf_score': 0.17882643471490003},
 {'doc_id': 2, 'key': 'the', 'tfidf_score': 0.17882643471490003},
 {'doc_id': 2, 'key': 'door', 'tfidf_score': 0.17882643471490003},
 {'doc_id': 3, 'key': 'he', 'tfidf_score': 0.10216512475319815},
 {'doc_id': 3, 'key': 'went', 'tfidf_score': 0.3218875824868201},
 {'doc_id': 3, 'key': 'to', 'tfidf_score': 0.3218875824868201},
 {'d

In [0]:
# Word Embeddings

import spacy

In [27]:
# Download the en_core_web_md spaCy model used by the word-vector cells below.
spacy.cli.download("en_core_web_md")

# Alternative: use a GloVe model instead

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [28]:
# Load the spaCy model that was downloaded above.
# NOTE(review): the saved output for this cell is an OSError. Presumably the
# runtime has to be restarted after the download before the model can be
# loaded -- confirm and re-run.
nlp = spacy.load('en_core_web_md')

OSError: ignored

In [0]:
# Run the loaded spaCy pipeline over the whole multi-sentence document.
doc = nlp(document)

In [0]:
# Printing the processed doc shows the original text, line breaks included.
print(doc)

He said thank you. He said bye as he walked through the door. He 
went to San Diego. San Diego has nice weather. It is raining in San 
Francisco.


In [0]:
# Show the embedding vector of the token at index 2
# (presumably "thank" -- confirm against the model's tokenization).
print(doc[2].vector)

[-3.2633e-01  2.2670e-01 -6.0390e-01 -7.9733e-02 -3.1348e-01  3.3861e-01
  1.6472e-01 -6.5467e-01 -1.3297e-01  2.4124e+00 -9.6755e-02  2.1780e-02
  3.4751e-01 -5.4402e-02 -4.6641e-01 -2.2268e-01 -3.5988e-01  5.6444e-01
 -2.4593e-01  1.6560e-01  2.9555e-03 -1.2698e-01  8.6455e-02 -1.5530e-01
 -2.7800e-01  1.2190e-01 -9.8423e-02 -1.2733e-01  1.5435e-01 -7.2860e-02
  2.6463e-01  1.4758e-01 -1.2689e-01  2.5180e-01 -3.9769e-01  9.9117e-03
 -3.3437e-01 -1.9860e-01 -1.9786e-01 -1.7713e-01  1.6869e-01  4.9406e-02
 -3.4811e-01 -1.3145e-01  2.4547e-01  4.8623e-01 -1.9794e-01  3.5635e-01
  3.5622e-01 -2.7383e-01  6.9140e-02  1.0416e-01  3.0488e-01 -8.5464e-02
  5.6392e-02 -1.4672e-01 -1.2502e-01  4.5234e-01 -5.8320e-01  2.7536e-01
 -7.0461e-01 -3.1403e-01 -6.8774e-01 -2.5076e-01  5.9797e-02 -2.6168e-01
 -8.3934e-02  2.0090e-01  5.3331e-01  2.0883e-01  3.0193e-01 -9.2298e-02
  2.1038e-01  9.6246e-02  2.8174e-01  2.6589e-01  5.9218e-01 -9.7234e-02
 -2.7597e-01  3.3770e-01  1.1721e-01  6.7690e-01 -2