In [117]:
import os
data_path = os.path.join('.', 'Dataset.txt')

with open(data_path, 'r') as file:
    content = file.read()
content

"Q: What is climate change?\nA: Climate change refers to long-term shifts in temperatures and weather patterns, primarily due to human activities like burning fossil fuels.\nQ: What are the main causes of climate change?\nA: The main causes include burning fossil fuels, deforestation, industrial processes, and some agricultural practices.\nQ: How does burning fossil fuels contribute to climate change?\nA: Burning fossil fuels releases large amounts of carbon dioxide (CO2) and other greenhouse gases into the atmosphere, trapping heat and causing global temperatures to rise.\nQ: What are greenhouse gases?\nA: Greenhouse gases are gases that can trap heat in the Earth's atmosphere, such as carbon dioxide, methane, nitrous oxide, and fluorinated gases.\nQ: How does deforestation impact climate change?\nA: Deforestation reduces the number of trees that can absorb CO2, increasing the concentration of CO2 in the atmosphere.\nQ: What are the effects of climate change on the environment?\nA: Ef

---
The training logic is to take the 'current' word to be input and the 'next' word to be the label of input. <br>
### Strategy:
1. Hi my name is Soham
2. I live in BBSR<br>

|Input (text -> vector)| Output |
|---|---|
|Hi|my|
|Hi my| name|
|Hi my name|is|
|Hi my name is|Soham|
|I|live|
|I live|in|
|I live in|BBSR|


---
## Text preprocessing


In [118]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts([content])
tokenizer.word_index

{'and': 1,
 'climate': 2,
 'a': 3,
 'change': 4,
 'to': 5,
 'q': 6,
 'the': 7,
 'of': 8,
 'is': 9,
 'what': 10,
 'can': 11,
 'in': 12,
 'how': 13,
 'carbon': 14,
 'energy': 15,
 'are': 16,
 'does': 17,
 'greenhouse': 18,
 'weather': 19,
 'reduce': 20,
 'for': 21,
 'reducing': 22,
 'include': 23,
 'gases': 24,
 'renewable': 25,
 'footprint': 26,
 'impacts': 27,
 'sustainable': 28,
 'global': 29,
 'that': 30,
 'on': 31,
 'affect': 32,
 'emissions': 33,
 'patterns': 34,
 'some': 35,
 'practices': 36,
 'co2': 37,
 'such': 38,
 'as': 39,
 'ecosystems': 40,
 'from': 41,
 'by': 42,
 'water': 43,
 'fossil': 44,
 'fuels': 45,
 'causes': 46,
 'contribute': 47,
 'atmosphere': 48,
 'rise': 49,
 'impact': 50,
 'support': 51,
 'sources': 52,
 'help': 53,
 'international': 54,
 'extreme': 55,
 'loss': 56,
 'food': 57,
 'waste': 58,
 'leading': 59,
 'increased': 60,
 'adapt': 61,
 'temperatures': 62,
 'human': 63,
 'burning': 64,
 'heat': 65,
 'sea': 66,
 'events': 67,
 'through': 68,
 'do': 69,
 'com

In [119]:
input_sequences = []
output_sequences = []
for sentence in content.split('\n'):
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]
    print("This is the seq:", tokenized_sentence)
    
    for ind, token in enumerate(tokenized_sentence):
        if ind == 0:
            continue
        n_gram = tokenized_sentence[:ind]
        input_sequences.append(n_gram)
        
        output = tokenized_sentence[ind]
        output_sequences.append(output)
        print(n_gram, output)

This is the seq: [6, 10, 9, 2, 4]
[6] 10
[6, 10] 9
[6, 10, 9] 2
[6, 10, 9, 2] 4
This is the seq: [3, 2, 4, 194, 5, 195, 196, 119, 12, 62, 1, 19, 34, 197, 82, 5, 63, 120, 83, 64, 44, 45]
[3] 2
[3, 2] 4
[3, 2, 4] 194
[3, 2, 4, 194] 5
[3, 2, 4, 194, 5] 195
[3, 2, 4, 194, 5, 195] 196
[3, 2, 4, 194, 5, 195, 196] 119
[3, 2, 4, 194, 5, 195, 196, 119] 12
[3, 2, 4, 194, 5, 195, 196, 119, 12] 62
[3, 2, 4, 194, 5, 195, 196, 119, 12, 62] 1
[3, 2, 4, 194, 5, 195, 196, 119, 12, 62, 1] 19
[3, 2, 4, 194, 5, 195, 196, 119, 12, 62, 1, 19] 34
[3, 2, 4, 194, 5, 195, 196, 119, 12, 62, 1, 19, 34] 197
[3, 2, 4, 194, 5, 195, 196, 119, 12, 62, 1, 19, 34, 197] 82
[3, 2, 4, 194, 5, 195, 196, 119, 12, 62, 1, 19, 34, 197, 82] 5
[3, 2, 4, 194, 5, 195, 196, 119, 12, 62, 1, 19, 34, 197, 82, 5] 63
[3, 2, 4, 194, 5, 195, 196, 119, 12, 62, 1, 19, 34, 197, 82, 5, 63] 120
[3, 2, 4, 194, 5, 195, 196, 119, 12, 62, 1, 19, 34, 197, 82, 5, 63, 120] 83
[3, 2, 4, 194, 5, 195, 196, 119, 12, 62, 1, 19, 34, 197, 82, 5, 63, 120, 83]

In [120]:
## Padding to be done
sent_len = max([len(x) for x in input_sequences]) # 31

In [121]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_sequences = pad_sequences(input_sequences, maxlen=sent_len, padding='pre')

In [122]:
## Creating the final dataset after tokenizing
X = padded_sequences
y = output_sequences

---
## Model Selection
#### Regression:
* Words created are discrete and hence, if we use regression model, there are high chances the output will give words that don't exist in our vocabulary

#### Classification:
* Best, as the words to be generated are treated to be `discrete`
* We will use `Multiclass` classification

In [123]:
voc_size = len(tokenizer.word_index) + 1 ## as the indexing in the tokenizer starts from 1, we add 1 to include '0'
voc_size

470

In [124]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y, num_classes=voc_size)
y.shape

(1409, 470)

---
# Architecture

In [125]:
# Embedding -> LSTM -> Dense(softmax)

from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential

model = Sequential()

model.add(Embedding(voc_size, 100, input_shape=(31,)))
model.add(LSTM(150))
model.add(Dense(voc_size, activation='softmax'))

model.compile(
    loss='categorical_crossentropy', 
    optimizer='adam', 
    metrics=['accuracy']
)
print(model.summary())

  super().__init__(**kwargs)


None


In [126]:
model.fit(X,y, epochs=100)

Epoch 1/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.0386 - loss: 5.9834
Epoch 2/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.0533 - loss: 5.3414
Epoch 3/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.1011 - loss: 5.2047
Epoch 4/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.1099 - loss: 4.9616
Epoch 5/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.1292 - loss: 4.9143
Epoch 6/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.1394 - loss: 4.7289
Epoch 7/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.1616 - loss: 4.4610
Epoch 8/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.1980 - loss: 4.2701
Epoch 9/100
[1m45/45[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x318d60890>

In [137]:
import numpy as np
text = "greenhouse"

# tokenize
def tokenize(text):
    return tokenizer.texts_to_sequences([text])[0]
# padding
def padding(text):
    token_text = tokenize(text)
    return pad_sequences([token_text], maxlen=31, padding='pre')
# predict
def prediction(text, num_of_words):
    for i in range(num_of_words):
        pos = np.argmax(model.predict(padding(text)))
        for word, index in tokenizer.word_index.items():
            if index == pos:
                text = text + " " + word
                print(text)

prediction(text, 10)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
greenhouse gases
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
greenhouse gases are
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
greenhouse gases are gases
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
greenhouse gases are gases that
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
greenhouse gases are gases that can
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
greenhouse gases are gases that can trap
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
greenhouse gases are gases that can trap heat
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
greenhouse gases are gases that can trap heat in
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
greenhouse gases are gases that can trap heat in the
[1m1/1[0m [32m━━━━━━━━━━━━━