<a href="https://colab.research.google.com/github/sanyagupta31/ml-projects/blob/main/next_word_prediction_kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import os
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [2]:
data=pd.read_csv('/content/medium_data.csv')
data.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30


In [3]:
tokenizer = Tokenizer()

In [7]:
tokenizer.fit_on_texts(['title'])

In [8]:
len(tokenizer.word_index)

10969

In [10]:
# Assuming 'title' is a column in your 'data' DataFrame
title = data['title'].str.cat(sep='\n')  # Concatenate all titles into a single string

input_sequences = []
for sentence in title.split('\n'):
  tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]
  for i in range(1, len(tokenized_sentence)):
    input_sequences.append(tokenized_sentence[:i+1])

In [11]:
input_sequences

[[4, 565],
 [4, 565, 60],
 [4, 565, 60, 1],
 [4, 565, 60, 1, 434],
 [4, 565, 60, 1, 434, 1310],
 [4, 565, 60, 1, 434, 1310, 14],
 [4, 565, 60, 1, 434, 1310, 14, 3507],
 [4, 565, 60, 1, 434, 1310, 14, 3507, 3508],
 [3509, 21],
 [3509, 21, 782],
 [3509, 21, 782, 111],
 [3509, 21, 782, 111, 157],
 [3509, 21, 782, 111, 157, 14],
 [3509, 21, 782, 111, 157, 14, 477],
 [3509, 21, 782, 111, 157, 14, 477, 477],
 [3509, 21, 782, 111, 157, 14, 477, 477, 1650],
 [5, 1],
 [5, 1, 62],
 [5, 1, 62, 3510],
 [5, 1, 62, 3510, 192],
 [3511, 5],
 [3511, 5, 1],
 [3511, 5, 1, 231],
 [3511, 5, 1, 231, 1073],
 [3511, 5, 1, 231, 1073, 10],
 [3511, 5, 1, 231, 1073, 10, 2216],
 [3511, 5, 1, 231, 1073, 10, 2216, 21],
 [3511, 5, 1, 231, 1073, 10, 2216, 21, 9],
 [3511, 5, 1, 231, 1073, 10, 2216, 21, 9, 3512],
 [4, 169],
 [4, 169, 63],
 [4, 169, 63, 169],
 [4, 169, 63, 169, 398],
 [4, 169, 63, 169, 398, 6],
 [4, 169, 63, 169, 398, 6, 3513],
 [4, 169, 63, 169, 398, 6, 3513, 2217],
 [4, 169, 63, 169, 398, 6, 3513, 2217

In [12]:
max_len=max([len(x)for x in input_sequences])

In [13]:
padded_input_sequences=pad_sequences(input_sequences,maxlen=max_len,padding='pre')

In [14]:
padded_input_sequences

array([[    0,     0,     0, ...,     0,     4,   565],
       [    0,     0,     0, ...,     4,   565,    60],
       [    0,     0,     0, ...,   565,    60,     1],
       ...,
       [    0,     0,     0, ...,    64,     4,   104],
       [    0,     0,     0, ...,     4,   104,    65],
       [    0,     0,     0, ...,   104,    65, 10969]], dtype=int32)

In [15]:
X=padded_input_sequences[:,:-1]
y=padded_input_sequences[:,-1]

In [16]:
X.shape

(43439, 37)

In [17]:
y.shape

(43439,)

In [19]:
from tensorflow.keras.utils import to_categorical
y=to_categorical(y,num_classes=len(tokenizer.word_index)+1)

In [20]:
y.shape

(43439, 10970)

In [25]:
model=Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,output_dim=100,input_length=max_len-1))
model.add(Bidirectional(LSTM(150,return_sequences=True)))
model.add(LSTM(100))
model.add(Dense(len(tokenizer.word_index)+1,activation='softmax'))
adam=Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy',optimizer=adam,metrics=['accuracy'])
history=model.fit(X,y,epochs=100,verbose=1)
print(model)



Epoch 1/100
[1m1358/1358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 14ms/step - accuracy: 0.0491 - loss: 7.7847
Epoch 2/100
[1m1358/1358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 14ms/step - accuracy: 0.0929 - loss: 6.9201
Epoch 3/100
[1m1358/1358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 14ms/step - accuracy: 0.1168 - loss: 6.6618
Epoch 4/100
[1m1358/1358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 14ms/step - accuracy: 0.1281 - loss: 6.4187
Epoch 5/100
[1m1358/1358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 14ms/step - accuracy: 0.1422 - loss: 6.2020
Epoch 6/100
[1m1358/1358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 14ms/step - accuracy: 0.1516 - loss: 6.0009
Epoch 7/100
[1m1358/1358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 14ms/step - accuracy: 0.1567 - loss: 5.8337
Epoch 8/100
[1m1358/1358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 14ms/step - accuracy: 0.1650 - loss: 5.6211


In [26]:
import time
text= "implementation of"
for i in range(10):
  token_text=tokenizer.texts_to_sequences([text])[0]
  padded_token_text=pad_sequences([token_text],maxlen=max_len-1,padding='pre')
  pos=np.argmax(model.predict(padded_token_text))
  for word,index in tokenizer.word_index.items():
    if index==pos:
      text=text+" "+word
      print(text)
      time.sleep(2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 270ms/step
implementation of rnn
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
implementation of rnn lstm
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
implementation of rnn lstm and gru
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
implementation of rnn lstm and gru evaluation
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
implementation of rnn lstm and gru evaluation metrics
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
implementation of rnn lstm and gru evaluation metrics every
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
implementation of rnn lstm and gru evaluation metrics every data
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
implementation of rnn lstm and gru evaluation metrics every data scientist
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[