# **Next Word Generator**

# Utility

In [None]:
!pip install nltk



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk

In [None]:
# Tokenization
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
def text_to_indices(sentence, vocab):
  """Translate a sequence of tokens into their integer ids.

  Args:
    sentence: Iterable of tokens.
    vocab: Mapping from token to integer id.

  Returns:
    A list with one id per token, in order. A token that is not a key of
    ``vocab`` is replaced by the id stored under ``'<unk>'``.
  """
  # The '<unk>' entry is only looked up when a token is actually missing.
  return [
    vocab[token] if token in vocab else vocab['<unk>']
    for token in sentence
  ]


In [None]:
# prediction

def prediction(model, vocab, text, max_len=61):
  """Extend ``text`` with the single token the model scores highest.

  Args:
    model: Callable taking a ``(1, seq_len)`` tensor of token ids and
      returning one row of scores over the vocabulary.
    vocab: Mapping from token to integer id; must contain ``'<unk>'``.
    text: Prompt to continue. It is lower-cased before tokenization.
    max_len: Length the id sequence is left-padded to with 0. The default
      of 61 is the width previously hard-coded here.

  Returns:
    ``text`` followed by a space and the predicted token.
  """
  # tokenize
  tokenized_text = word_tokenize(text.lower())

  # text -> numerical indices
  numerical_text = text_to_indices(tokenized_text, vocab)

  # left padding; a prompt longer than max_len is passed through unpadded
  pad_width = max(0, max_len - len(numerical_text))
  padded_text = torch.tensor([0] * pad_width + numerical_text, dtype=torch.long).unsqueeze(0)

  # The model may live on a GPU; the input has to be on the same device.
  try:
    model_device = next(model.parameters()).device
  except (AttributeError, StopIteration):
    # Plain callables or parameter-free modules: keep the tensor where it is.
    model_device = None
  if model_device is not None:
    padded_text = padded_text.to(model_device)

  # Inference only, so no autograd graph is needed.
  with torch.no_grad():
    output = model(padded_text)

  # predicted index
  _, index = torch.max(output, dim=1)
  predicted_id = index.item()

  # Look the token up by its stored id instead of by position in the dict.
  index_to_token = {token_id: token for token, token_id in vocab.items()}

  # merge with text
  return text + " " + index_to_token[predicted_id]

# Data

In [None]:
document = """What is your full name?
My name is Raza Mehar.

What is your current location?
I am currently based in Naples, Italy.

What are your key professional roles?
I work as a Data Scientist with expertise in Machine Learning and Deep Learning. I am also an advocate for AI for Good, leveraging artificial intelligence for meaningful and impactful applications.

What is your highest educational qualification?
I am pursuing an MS in Data Science at the University of Naples Federico II in Italy.

Have you received any special academic distinctions?
Yes, I was awarded Cum Laude in several courses, including Data Mining & Machine Learning, Statistical Learning & Data Analysis, AI System Engineering, Hardware & Software for Big Data, and Theory & Ethics of Big Data.

Do you have any other degrees apart from your MS?
Yes, I also hold an MBA from the Institute of Business Management, Pakistan, and a BS in Computer Engineering from Sir Syed University of Engineering & Technology, Pakistan.

Have you obtained any certifications?
Yes, I have certifications in Applied Machine Learning from the University of Michigan and Data Analytics from Google.

What are your programming skills?
I am proficient in Python, R, and SQL.

What frameworks and libraries do you commonly use?
I work extensively with TensorFlow, PyTorch, Scikit-Learn, Pandas, NumPy, Hugging Face, and LangChain.

What tools and technologies are you familiar with?
I have hands-on experience with Git, Docker, MLFlow, FastAPI, Postgres, and OpenAI.

What are your key areas of expertise?
My expertise includes Predictive Modeling, Natural Language Processing (NLP), and Large Language Models (LLMs).

Do you have experience with data visualization?
Yes, I use Matplotlib and Seaborn for data visualization.

What are your core skills in AI and Data Science?
My core skills include Machine Learning, Deep Learning, Generative AI, Data Analysis, and Data Mining.

What is your English language proficiency?
My English proficiency is at the C1 level.

Have you worked as a Data Science intern?
Yes, I worked as a Data Science Intern at Change2 s.r.l., where I researched emission factors, implemented web scraping and PDF parsing techniques, and conducted an in-depth analysis of sustainability in Italy’s food sector.

Have you done any freelance work?
Yes, I worked as a Machine Learning Engineer at Omdena Inc., where I developed an AI solution to predict indoor classroom temperatures in Tanzanian public schools using LSTM and ensemble methods.

What was your previous work experience before data science?
I worked as an HR Business Partner at Engro Polymer & Chemicals Limited in Pakistan, where I applied data analytics and predictive modeling, particularly Survival Analysis, to enhance HR strategies and reduce employee turnover.

What are some of the projects you have worked on?
I have worked on several projects, including a RAG-based chatbot that allows users to interact with their data, a political leaning detection model for news articles using BERT and LoRA, a plant disease detection system utilizing deep learning and clustering techniques, and a geo-analytics project for estimating market potential for Fater, a P&G subsidiary.

Can you explain your chatbot project?
Yes, I developed a conversational AI chatbot capable of reading and interacting with documents. It utilizes a Retrieval-Augmented Generation (RAG) model and incorporates few-shot learning and chain-of-thought reasoning techniques to provide accurate responses based on document contents.

What was your political leaning detection project about?
This project involved fine-tuning BERT using Low-Rank Adaptation (LoRA) to classify news articles as right-leaning, centrist, or left-leaning. I also optimized the model using post-training quantization (PTQ) to enhance its efficiency and performance.

What was the focus of your plant disease detection project?
I developed a robust system that combined non-negative matrix factorization, fuzzy clustering, and YOLO-based deep learning techniques for accurate plant disease identification.

Can you describe your market potential estimation project?
This project was aimed at assessing the diaper market potential for Fater by analyzing socio-demographic data, geographic information, and points of interest to refine revenue forecasts for Naples stores.

How did you contribute to employee turnover reduction in your HR role?
I used predictive analytics and Survival Analysis to identify at-risk employees and implemented targeted interventions, which led to a 5% reduction in turnover.

Have you worked on AI solutions for social good?
Yes, I contributed to an AI project for Open Development & Education that predicted indoor classroom temperatures in Tanzanian public schools, improving learning conditions for students.

Are you actively involved in AI for Good initiatives?
Yes, I advocate for AI for Good and have worked on various projects that leverage AI to create meaningful social impact.
"""


# Tokenizing and Building Vocabulary

In [None]:
# tokenize: lower-case the whole document, then split it into tokens with NLTK
# (punctuation marks such as '?' and '.' become tokens of their own)
tokens = word_tokenize(document.lower())

In [None]:
# build vocab
# Id 0 is reserved for unknown tokens; every other token receives the next
# free id the first time it is seen.
vocab = {'<unk>': 0}

for token in tokens:
  # setdefault leaves an already registered token untouched
  vocab.setdefault(token, len(vocab))

vocab

{'<unk>': 0,
 'what': 1,
 'is': 2,
 'your': 3,
 'full': 4,
 'name': 5,
 '?': 6,
 'my': 7,
 'raza': 8,
 'mehar': 9,
 '.': 10,
 'current': 11,
 'location': 12,
 'i': 13,
 'am': 14,
 'currently': 15,
 'based': 16,
 'in': 17,
 'naples': 18,
 ',': 19,
 'italy': 20,
 'are': 21,
 'key': 22,
 'professional': 23,
 'roles': 24,
 'work': 25,
 'as': 26,
 'a': 27,
 'data': 28,
 'scientist': 29,
 'with': 30,
 'expertise': 31,
 'machine': 32,
 'learning': 33,
 'and': 34,
 'deep': 35,
 'also': 36,
 'an': 37,
 'advocate': 38,
 'for': 39,
 'ai': 40,
 'good': 41,
 'leveraging': 42,
 'artificial': 43,
 'intelligence': 44,
 'meaningful': 45,
 'impactful': 46,
 'applications': 47,
 'highest': 48,
 'educational': 49,
 'qualification': 50,
 'pursuing': 51,
 'ms': 52,
 'science': 53,
 'at': 54,
 'the': 55,
 'university': 56,
 'of': 57,
 'federico': 58,
 'ii': 59,
 'have': 60,
 'you': 61,
 'received': 62,
 'any': 63,
 'special': 64,
 'academic': 65,
 'distinctions': 66,
 'yes': 67,
 'was': 68,
 'awarded': 69,
 

In [None]:
len(vocab)

342

In [None]:
# One entry per line of the document; blank separator lines are kept as empty strings.
input_sentences = document.split('\n')

In [None]:
# Lower-case and tokenize each line, then convert its tokens to vocabulary ids.
input_numerical_sentences = [
  text_to_indices(word_tokenize(sentence.lower()), vocab)
  for sentence in input_sentences
]


In [None]:
len(input_numerical_sentences)

75

# Generating Training Sequences

In [None]:
# Every prefix of length >= 2 of every sentence becomes one sequence,
# e.g. [1, 2, 3] yields [1, 2] and [1, 2, 3].
training_sequence = [
  sentence[:end]
  for sentence in input_numerical_sentences
  for end in range(2, len(sentence) + 1)
]

In [None]:
len(training_sequence)

831

In [None]:
training_sequence[:5]

[[1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5, 6]]

In [None]:
# Length of every training sequence; the maximum is the width to pad to.
len_list = [len(sequence) for sequence in training_sequence]

max(len_list)

62

In [None]:
training_sequence[0]

[1, 2]

In [None]:
# Left-pad every sequence with 0 so that all of them share the length of the
# longest one. The maximum is computed once rather than on every iteration.
max_sequence_length = max(len_list, default=0)

padded_training_sequence = [
  [0] * (max_sequence_length - len(sequence)) + sequence
  for sequence in training_sequence
]

In [None]:
len(padded_training_sequence[10])

62

In [None]:
padded_training_sequence = torch.tensor(padded_training_sequence, dtype=torch.long)

In [None]:
# Inputs are all columns except the last one; the target is the last column,
# i.e. the final token id of each padded sequence.
X = padded_training_sequence[:, :-1]
y = padded_training_sequence[:,-1]

# Creating Dataset Class and Custom Data Loader

In [None]:
class CustomDataset(Dataset):
  """Dataset that pairs row ``i`` of ``X`` with entry ``i`` of ``y``."""

  def __init__(self, X, y):
    # Both containers are stored as given; nothing is copied.
    self.X, self.y = X, y

  def __len__(self):
    # The number of samples is the size of X's first dimension.
    n_samples = self.X.shape[0]
    return n_samples

  def __getitem__(self, idx):
    # Return the (input, target) pair stored at position idx.
    features = self.X[idx]
    target = self.y[idx]
    return features, target

In [None]:
dataset = CustomDataset(X,y)

In [None]:
len(dataset)

831

In [None]:
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Defining the Model

In [None]:
class LSTMModel(nn.Module):
  """Embedding -> single-layer LSTM -> linear layer scoring the next token.

  Args:
    vocab_size: Number of distinct token ids; also the size of the output.
    embedding_dim: Size of each token embedding. Defaults to 100.
    hidden_dim: Size of the LSTM hidden state. Defaults to 150.
  """

  def __init__(self, vocab_size, embedding_dim=100, hidden_dim=150):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, vocab_size)

  def forward(self, x):
    """Return vocabulary scores for the token that follows each sequence.

    Args:
      x: Integer tensor of token ids with the batch dimension first.

    Returns:
      One row of ``vocab_size`` unnormalized scores per input sequence.
    """
    embedded = self.embedding(x)
    # Only the hidden state after the last time step is needed.
    _, (final_hidden_state, _) = self.lstm(embedded)
    # The leading axis indexes LSTM layers; [-1] selects the last (here the
    # only) layer, which is what squeeze(0) did for a single layer.
    output = self.fc(final_hidden_state[-1])
    return output

In [None]:
model = LSTMModel(len(vocab))

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
model.to(device)

LSTMModel(
  (embedding): Embedding(342, 100)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=342, bias=True)
)

In [None]:
# Training hyperparameters
epochs = 50
learning_rate = 0.001

# Cross-entropy between the model's vocabulary scores and the target token id
criterion = nn.CrossEntropyLoss()

# Adam optimizer over all parameters of the model
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# training loop

# calculate_accuracy() switches the model to eval mode; make sure training
# always runs in train mode, even when this cell is re-run after evaluation.
model.train()

for epoch in range(epochs):
  total_loss = 0

  for batch_x, batch_y in dataloader:

    # move the batch to the device the model lives on
    batch_x, batch_y = batch_x.to(device), batch_y.to(device)

    # clear gradients left over from the previous step
    optimizer.zero_grad()

    output = model(batch_x)

    loss = criterion(output, batch_y)

    loss.backward()

    optimizer.step()

    # accumulate the batch loss as a plain Python float
    total_loss += loss.item()

  # the reported value is the sum of the per-batch losses of this epoch
  print(f"Epoch: {epoch + 1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 149.7055
Epoch: 2, Loss: 135.2952
Epoch: 3, Loss: 123.1533
Epoch: 4, Loss: 112.9856
Epoch: 5, Loss: 102.4809
Epoch: 6, Loss: 92.8189
Epoch: 7, Loss: 83.2178
Epoch: 8, Loss: 74.1576
Epoch: 9, Loss: 65.8017
Epoch: 10, Loss: 58.1626
Epoch: 11, Loss: 51.2166
Epoch: 12, Loss: 44.9341
Epoch: 13, Loss: 39.1595
Epoch: 14, Loss: 34.2790
Epoch: 15, Loss: 29.9575
Epoch: 16, Loss: 26.3406
Epoch: 17, Loss: 23.1417
Epoch: 18, Loss: 20.5047
Epoch: 19, Loss: 18.1699
Epoch: 20, Loss: 16.3107
Epoch: 21, Loss: 14.7385
Epoch: 22, Loss: 13.3382
Epoch: 23, Loss: 12.2564
Epoch: 24, Loss: 11.2000
Epoch: 25, Loss: 10.4396
Epoch: 26, Loss: 9.7908
Epoch: 27, Loss: 9.1701
Epoch: 28, Loss: 8.6964
Epoch: 29, Loss: 8.2401
Epoch: 30, Loss: 7.8035
Epoch: 31, Loss: 7.4427
Epoch: 32, Loss: 7.1586
Epoch: 33, Loss: 6.8659
Epoch: 34, Loss: 6.6587
Epoch: 35, Loss: 6.3499
Epoch: 36, Loss: 6.2064
Epoch: 37, Loss: 6.0505
Epoch: 38, Loss: 5.8441
Epoch: 39, Loss: 5.7522
Epoch: 40, Loss: 5.6080
Epoch: 41, Loss: 5.

# Predicting

In [None]:
prediction(model, vocab, "My first name is")

'My first name is raza'

In [None]:
import time

num_tokens = 10
input_text = "Yes, I also hold"

# Generate num_tokens tokens one at a time: each prediction is printed and
# then fed back in as the prompt for the next step.
for step in range(num_tokens):
  input_text = output_text = prediction(model, vocab, input_text)
  print(output_text)
  time.sleep(0.5)


Yes, I also hold an
Yes, I also hold an mba
Yes, I also hold an mba from
Yes, I also hold an mba from the
Yes, I also hold an mba from the institute
Yes, I also hold an mba from the institute of
Yes, I also hold an mba from the institute of business
Yes, I also hold an mba from the institute of business management
Yes, I also hold an mba from the institute of business management ,
Yes, I also hold an mba from the institute of business management , pakistan


In [None]:
dataloader1 = DataLoader(dataset, batch_size=32, shuffle=False)

# Evaluation

In [None]:
# Function to calculate accuracy
def calculate_accuracy(model, dataloader, device):
    """Return the percentage of samples whose top-scoring class equals the label.

    Args:
        model: Module mapping a batch of inputs to per-class scores. It is
            switched to evaluation mode and left in that mode.
        dataloader: Iterable yielding ``(batch_x, batch_y)`` tensor pairs.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Accuracy as a float in ``[0, 100]``; ``0.0`` if ``dataloader`` yields
        no samples.
    """
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # No need to compute gradients
        # Iterate the loader that was passed in, not a module-level global.
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # Get model predictions
            outputs = model(batch_x)

            # Get the predicted word indices
            _, predicted = torch.max(outputs, dim=1)

            # Compare with actual labels
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

    # An empty loader would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0

    accuracy = correct / total * 100
    return accuracy

# Compute accuracy
# NOTE(review): the loader wraps the same dataset the model was trained on,
# so this is a training accuracy, not a held-out estimate.
accuracy = calculate_accuracy(model, dataloader, device)
print(f"Model Accuracy: {accuracy:.2f}%")


Model Accuracy: 94.22%
