In [None]:
# importing libraries and creating a corpus

In [1]:
from collections import Counter, defaultdict
import numpy as np
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
corpus = [
    "Artificial intelligence (AI) is the simulation of human intelligence in machines programmed to think and mimic cognitive functions.",
    "AI can be categorized into narrow AI, which is designed for a particular task, and general AI, which aims to perform any intellectual task that a human can.",
    "The field of AI involves various subfields like machine learning, natural language processing, computer vision, and robotics.",
    "Deep learning, a subset of machine learning, involves neural networks with many layers (deep neural networks) and has been successful in various AI applications.",
    "Ethical considerations and responsible AI development are becoming increasingly important as AI technologies advance.",
    "AI is used in diverse applications, from virtual assistants and recommendation systems to autonomous vehicles and medical diagnosis.",
    "The Turing test, proposed by Alan Turing, is a measure of a machine's ability to exhibit intelligent behavior indistinguishable from that of a human.",
    "AI has the potential to revolutionize industries, but it also raises concerns about job displacement and the ethical implications of autonomous decision-making.",
    "Researchers and policymakers are working on guidelines and regulations to ensure the ethical and responsible use of AI technologies.",
    "As AI continues to evolve, interdisciplinary collaboration and public awareness play crucial roles in shaping its impact on society."
]


In [3]:
# N gram model 

In [4]:
# preprocessing 

In [10]:
def preprocess_corpus(corpus):
    return [['<s>'] + sentence.lower().split() + ['</s>'] for sentence in corpus]

In [11]:
def build_n_gram_model(corpus, n=2):
    model = defaultdict(Counter)
    for sentence in corpus:
        for i in range(len(sentence)-n+1):
            n_gram_sequence = tuple(sentence[i:i+n-1])
            next_word = sentence[i+n-1]
            model[n_gram_sequence][next_word] += 1
    return model

In [12]:
# Calculating the N-Gram probabilities

def calculate_n_gram_probabilities(model):
    probabilities = {}
    for n_gram_sequence, words in model.items():
        total_count = sum(words.values())
        probabilities[n_gram_sequence] = {word: count/total_count for word, count in words.items()}
    return probabilities

In [13]:
# Calculating the  perplexity for N-Gram model
def calculate_perplexity(model, corpus, n=2):
    N = sum(len(sentence) for sentence in corpus)
    logprob = 0
    for sentence in corpus:
        for i in range(n-1, len(sentence)):
            n_gram_sequence = tuple(sentence[i-n+1:i])
            word = sentence[i]
            probability = model.get(n_gram_sequence, {}).get(word, 1e-12)  # Smoothing for zero probabilities
            logprob += np.log2(probability)
    perplexity = 2 ** (-logprob / N)
    return perplexity

In [14]:
preprocessed_corpus = preprocess_corpus(corpus)

In [19]:
preprocessed_corpus[1]

['<s>',
 'ai',
 'can',
 'be',
 'categorized',
 'into',
 'narrow',
 'ai,',
 'which',
 'is',
 'designed',
 'for',
 'a',
 'particular',
 'task,',
 'and',
 'general',
 'ai,',
 'which',
 'aims',
 'to',
 'perform',
 'any',
 'intellectual',
 'task',
 'that',
 'a',
 'human',
 'can.',
 '</s>']

In [20]:
n_gram_model = build_n_gram_model(preprocessed_corpus, 2)
n_gram_model

defaultdict(collections.Counter,
            {('<s>',): Counter({'ai': 3,
                      'the': 2,
                      'artificial': 1,
                      'deep': 1,
                      'ethical': 1,
                      'researchers': 1,
                      'as': 1}),
             ('artificial',): Counter({'intelligence': 1}),
             ('intelligence',): Counter({'(ai)': 1, 'in': 1}),
             ('(ai)',): Counter({'is': 1}),
             ('is',): Counter({'the': 1, 'designed': 1, 'used': 1, 'a': 1}),
             ('the',): Counter({'ethical': 2,
                      'simulation': 1,
                      'field': 1,
                      'turing': 1,
                      'potential': 1}),
             ('simulation',): Counter({'of': 1}),
             ('of',): Counter({'ai': 2,
                      'a': 2,
                      'human': 1,
                      'machine': 1,
                      'autonomous': 1}),
             ('human',): Counter({'intellige

In [21]:
n_gram_probabilities = calculate_n_gram_probabilities(n_gram_model)

In [22]:
# Print example N-Gram probabilities (for brevity, print probabilities of the first few N-Grams)
print("Example N-Gram probabilities:")
for n_gram, probabilities in list(n_gram_probabilities.items())[:5]:
    print(f"{n_gram}: {probabilities}")

Example N-Gram probabilities:
('<s>',): {'artificial': 0.1, 'ai': 0.3, 'the': 0.2, 'deep': 0.1, 'ethical': 0.1, 'researchers': 0.1, 'as': 0.1}
('artificial',): {'intelligence': 1.0}
('intelligence',): {'(ai)': 0.5, 'in': 0.5}
('(ai)',): {'is': 1.0}
('is',): {'the': 0.25, 'designed': 0.25, 'used': 0.25, 'a': 0.25}


In [23]:
n_gram_perplexity = calculate_perplexity(n_gram_probabilities, preprocessed_corpus, 2)

In [24]:
# Print the perplexity of the N-Gram model
print(f"\nPerplexity of the N-Gram model: {n_gram_perplexity}")


Perplexity of the N-Gram model: None


In [25]:
#N-gram neural network model (e.g., Trigram)

In [28]:
!pip install torch

import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/1e/86/477ec85bf1f122931f00a2f3889ed9322c091497415a563291ffc119dacc/torch-2.1.2-cp311-none-macosx_11_0_arm64.whl.metadata
  Downloading torch-2.1.2-cp311-none-macosx_11_0_arm64.whl.metadata (25 kB)
Downloading torch-2.1.2-cp311-none-macosx_11_0_arm64.whl (59.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: torch
Successfully installed torch-2.1.2


In [29]:
# Define the corpus
corpus = [
    "Natural language processing (NLP) is an interdisciplinary subfield of computer science and linguistics.", 
    "It is primarily concerned with giving computers the ability to support and manipulate human language.", 
    "It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic machine learning approaches.", 
    "The goal is a computer capable of understanding the contents of documents, including the contextual nuances of the language within them.",
    "The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.",
    "Machine learning is a field of study in artificial intelligence concerned with the development of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.",
    "Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.",
    "Machine learning approaches have been applied to many fields including large language models, computer vision, and speech recognition.",
    "Machine learning is known in its application across business problems under the name predictive analytics.",
    "Although not all machine learning is statistically based, computational statistics is an important source of the field's methods."
]


In [30]:
corpus = '\n'.join(corpus).split()
vocab = set(corpus)
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {word_to_idx[word]: word for word in word_to_idx}

In [31]:
trigram = [((corpus[i], corpus[i + 1]), corpus[i + 2])
           for i in range(len(corpus) - 2)]

In [32]:
trigram[:10]

[(('Natural', 'language'), 'processing'),
 (('language', 'processing'), '(NLP)'),
 (('processing', '(NLP)'), 'is'),
 (('(NLP)', 'is'), 'an'),
 (('is', 'an'), 'interdisciplinary'),
 (('an', 'interdisciplinary'), 'subfield'),
 (('interdisciplinary', 'subfield'), 'of'),
 (('subfield', 'of'), 'computer'),
 (('of', 'computer'), 'science'),
 (('computer', 'science'), 'and')]

In [33]:
class NgramModel(nn.Module):
    def __init__(self, vocb_size, context_size, n_dim):
        super(NgramModel, self).__init__()
        self.n_word = vocb_size
        self.embedding = nn.Embedding(self.n_word, n_dim)
        self.linear1 = nn.Linear(context_size * n_dim, 128)
        self.linear2 = nn.Linear(128, self.n_word)

    def forward(self, x):
        emb = self.embedding(x)
        emb = emb.view(1, -1)
        out = self.linear1(emb)
        out = F.relu(out)
        out = self.linear2(out)
        log_prob = F.log_softmax(out)
        return log_prob

In [34]:
CONTEXT_SIZE = 2
ngrammodel = NgramModel(len(word_to_idx), CONTEXT_SIZE, 100)

In [35]:
criterion = nn.NLLLoss()
optimizer = optim.SGD(ngrammodel.parameters(), lr=1e-3)

In [36]:
for epoch in range(100):
    print('epoch: {}'.format(epoch + 1))
    print('*' * 10)
    running_loss = 0
    for data in trigram:
        word, label = data     #E.g., word = ('Natural', 'language'); label = 'processing'
        word = Variable(torch.LongTensor([word_to_idx[i] for i in word]))
        label = Variable(torch.LongTensor([word_to_idx[label]]))
        
        # forward -- for prediction and calculating the loss for each instance
        out = ngrammodel(word)
        loss = criterion(out, label)    
        running_loss += loss.item()
        
        # backward -- for gradiate update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # print the average loss
    print('Loss: {:.6f}'.format(running_loss / len(word_to_idx)))
    

epoch: 1
**********
Loss: 7.202061
epoch: 2
**********
Loss: 7.117636
epoch: 3
**********
Loss: 7.034456
epoch: 4
**********
Loss: 6.952282
epoch: 5
**********
Loss: 6.870747
epoch: 6
**********
Loss: 6.789623
epoch: 7
**********


  log_prob = F.log_softmax(out)


Loss: 6.708892
epoch: 8
**********
Loss: 6.628068
epoch: 9
**********
Loss: 6.546749
epoch: 10
**********
Loss: 6.465104
epoch: 11
**********
Loss: 6.382518
epoch: 12
**********
Loss: 6.299143
epoch: 13
**********
Loss: 6.214832
epoch: 14
**********
Loss: 6.129460
epoch: 15
**********
Loss: 6.042929
epoch: 16
**********
Loss: 5.954959
epoch: 17
**********
Loss: 5.865716
epoch: 18
**********
Loss: 5.775320
epoch: 19
**********
Loss: 5.683646
epoch: 20
**********
Loss: 5.591035
epoch: 21
**********
Loss: 5.497446
epoch: 22
**********
Loss: 5.403441
epoch: 23
**********
Loss: 5.309322
epoch: 24
**********
Loss: 5.214977
epoch: 25
**********
Loss: 5.120785
epoch: 26
**********
Loss: 5.026653
epoch: 27
**********
Loss: 4.932806
epoch: 28
**********
Loss: 4.839269
epoch: 29
**********
Loss: 4.745600
epoch: 30
**********
Loss: 4.652169
epoch: 31
**********
Loss: 4.558508
epoch: 32
**********
Loss: 4.465082
epoch: 33
**********
Loss: 4.371696
epoch: 34
**********
Loss: 4.278191
epoch: 35
*****

In [37]:

word, label = trigram[3]
print(word, '\t', label)

('(NLP)', 'is') 	 an


In [38]:
print(word_to_idx['(NLP)'], '\t', word_to_idx['is'])

89 	 36


In [39]:
word = Variable(torch.LongTensor([word_to_idx[i] for i in word]))
out = ngrammodel(word)

  log_prob = F.log_softmax(out)


In [40]:

out

tensor([[ -6.8025,  -6.6306,  -7.6776,  -8.4790,  -7.2053,  -8.0369,  -9.3851,
          -8.2271,  -8.9402,  -9.0661,  -6.9189,  -7.8263,  -7.3986,  -8.1753,
          -9.2513,  -8.3883,  -8.8400,  -6.8880,  -5.9797,  -7.0213,  -8.0637,
          -8.3375,  -8.4219,  -8.0314,  -7.8549,  -8.5214,  -7.7298,  -7.9354,
          -9.2482,  -7.1365,  -6.7070,  -6.7819,  -6.8558,  -8.9761,  -6.3744,
          -7.6374,  -7.3776,  -8.7189,  -7.6748,  -8.2060,  -8.7828,  -6.5718,
          -7.8516,  -5.5209,  -8.9169,  -7.6570,  -7.1296,  -7.9354,  -7.4967,
          -7.3807,  -8.7286,  -4.4824,  -7.3743,  -8.1332,  -8.0992,  -4.3666,
          -7.5409,  -8.7871,  -7.7525,  -8.5492,  -6.0472,  -4.5289,  -7.3960,
         -10.1769,  -6.0644,  -4.7673,  -8.6896,  -7.5080,  -8.9787,  -7.2784,
          -0.1526,  -8.1846,  -6.6605,  -8.1420,  -8.1736,  -7.8749,  -8.4260,
          -9.1426,  -7.9542,  -8.2479,  -8.7789,  -8.4692,  -7.8952,  -8.1765,
          -6.1787,  -7.9131,  -7.5189,  -8.3561,  -7

In [41]:
_, predict_label = torch.max(out, 1)


In [42]:
predict_label

tensor([70])

In [43]:
predict_word = idx_to_word[predict_label.item()]
print('real word is: {},\npredict word is: {}'.format(label, predict_word))

real word is: an,
predict word is: an
