## Coding Exercise #0609

In [2]:
import nltk
from numpy.random import randint, seed
from sklearn.feature_extraction.text import CountVectorizer

### 1. n-Gram based autofill:

In [3]:
# Text data for training.
my_text = """Machine learning is the scientific study of algorithms and statistical models that computer systems use to effectively perform a specific task without using explicit instructions, relying on patterns and inference instead. It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model of sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to perform the task.[1][2]:2 Machine learning algorithms are used in the applications of email filtering, detection of network intruders, and computer vision, where it is infeasible to develop an algorithm of specific instructions for performing the task. Machine learning is closely related to computational statistics, which focuses on making predictions using computers. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a field of study within machine learning, and focuses on exploratory data analysis through unsupervised learning In its application across business problems, machine learning is also referred to as predictive analytics."""

In [4]:
my_text = [my_text.lower()]                       # Convert to lowercase and make a list. => Required by the CountVectorizer().

#### 1.1. n-Gram trial run:

In [5]:
n = 3                                                            # Can be changed to a number equal or larger than 2.
n_min = n
n_max = n
n_gram_type = 'word'                                             # n-Gram with words.
vectorizer = CountVectorizer(ngram_range=(n_min,n_max), analyzer = n_gram_type)

In [6]:
n_grams = vectorizer.fit(my_text).get_feature_names()            # Get the n-Grams as a list.
n_gram_cts = vectorizer.transform(my_text).toarray()             #  The output is an array of array.
n_gram_cts = list(n_gram_cts[0])                                 # Convert into a simple list.

In [7]:
list(zip(n_grams,n_gram_cts))                                    # Make a list of tuples and show.

[('across business problems', 1),
 ('algorithm of specific', 1),
 ('algorithms and statistical', 1),
 ('algorithms are used', 1),
 ('algorithms build mathematical', 1),
 ('also referred to', 1),
 ('an algorithm of', 1),
 ('analysis through unsupervised', 1),
 ('and application domains', 1),
 ('and computer vision', 1),
 ('and focuses on', 1),
 ('and inference instead', 1),
 ('and statistical models', 1),
 ('application across business', 1),
 ('application domains to', 1),
 ('applications of email', 1),
 ('are used in', 1),
 ('artificial intelligence machine', 1),
 ('as predictive analytics', 1),
 ('as subset of', 1),
 ('as training data', 1),
 ('being explicitly programmed', 1),
 ('build mathematical model', 1),
 ('business problems machine', 1),
 ('closely related to', 1),
 ('computational statistics which', 1),
 ('computer systems use', 1),
 ('computer vision where', 1),
 ('computers the study', 1),
 ('data analysis through', 1),
 ('data in order', 1),
 ('data known as', 1),
 ('data 

#### 1.2. Train by making a dictionary based on n-Grams:

In [8]:
n = 3                                                           # Can be changed to a number equal or larger than 2.
n_min = n                              
n_max = n                              
n_gram_type = 'word'
vectorizer = CountVectorizer(ngram_range=(n_min,n_max), analyzer = n_gram_type)

In [9]:
n_grams = vectorizer.fit(my_text).get_feature_names()           # A list of n-Grams.
my_dict = {}
for a_gram in n_grams:
    words = nltk.word_tokenize(a_gram)
    a_nm1_gram = ' '.join(words[0:n-1])                         # (n-1)-Gram.
    next_word = words[-1]                                       # Word after the a_nm1_gram.
    if a_nm1_gram not in my_dict.keys():
        my_dict[a_nm1_gram] = [next_word]                       # a_nm1_gram is a new key. So, initialize the dictionary entry.
    else:
        my_dict[a_nm1_gram] += [next_word]                      # an_nm1_gram is already in the dictionary.

In [10]:
# View the dictionary.
my_dict

{'across business': ['problems'],
 'algorithm of': ['specific'],
 'algorithms and': ['statistical'],
 'algorithms are': ['used'],
 'algorithms build': ['mathematical'],
 'also referred': ['to'],
 'an algorithm': ['of'],
 'analysis through': ['unsupervised'],
 'and application': ['domains'],
 'and computer': ['vision'],
 'and focuses': ['on'],
 'and inference': ['instead'],
 'and statistical': ['models'],
 'application across': ['business'],
 'application domains': ['to'],
 'applications of': ['email'],
 'are used': ['in'],
 'artificial intelligence': ['machine'],
 'as predictive': ['analytics'],
 'as subset': ['of'],
 'as training': ['data'],
 'being explicitly': ['programmed'],
 'build mathematical': ['model'],
 'business problems': ['machine'],
 'closely related': ['to'],
 'computational statistics': ['which'],
 'computer systems': ['use'],
 'computer vision': ['where'],
 'computers the': ['study'],
 'data analysis': ['through'],
 'data in': ['order'],
 'data known': ['as'],
 'data m

#### 1.3. Predict the next word:

In [11]:
# Helper function that picks the following word.
def predict_next(a_nm1_gram):
    value_list_size = len(my_dict[a_nm1_gram])         # length of the value corresponding to the key = a_nm1_gram.
    i_pick = randint(0, value_list_size)               # A random number from the range 0 ~ value_list_size.
    return(my_dict[a_nm1_gram][i_pick])                  # Return the randomly chosen next word.

In [12]:
# Test.
input_str = 'order to'                                 # Has to be a VALID (n-1)-Gram!
predict_next(input_str)

'make'

In [13]:
# Another test.
# Repeat for 10 times and see that the next word is chosen randomly with a probability proportional to the occurrence. 
input_str = 'machine learning'                                 # Has to be a VALID (n-1)-Gram!
for i in range(10):
    print(predict_next(input_str))

algorithms
is
and
is
is
data
is
is
and
is


#### 1.4. Predict a sequence:

In [14]:
# Initialize the random seed.
seed(123)

In [15]:
# A seed string has to be input by the user.
my_seed_str = 'machine learning'                                   # Has to be a VALID (n-1)-Gram!
# my_seed_str = 'in order'                                         # Has to be a VALID (n-1)-Gram!

In [18]:
a_nm1_gram = my_seed_str
output_string = my_seed_str                                         # Initialize the output string.
while a_nm1_gram in my_dict:
    output_string += " " + predict_next(a_nm1_gram)
    words = nltk.word_tokenize(output_string)
    a_nm1_gram = ' '.join(words[-n+1:])                            # Update a_nm1_gram.

In [17]:
# Output the predicted sequence.
output_string

'machine learning data mining is field of study within machine learning data mining is field of machine learning algorithms are used in the applications of email filtering detection of network intruders and computer vision where it is infeasible to develop an algorithm of specific instructions for performing the task machine learning and focuses on making predictions using computers the study of algorithms and statistical models that computer systems use to effectively perform specific task without using explicit instructions relying on patterns and inference instead it is seen as subset of artificial intelligence machine learning and focuses on exploratory data analysis through unsupervised learning in its application across business problems machine learning and focuses on exploratory data analysis through unsupervised learning in its application across business problems machine learning and focuses on exploratory data analysis through unsupervised learning in its application across 