In [None]:
import kagglehub

# Kaggle handle of the conversation corpus used throughout this notebook.
DATASET_HANDLE = "projjal1/human-conversation-training-data"

# Fetch the latest published version; `path` is the local location of the
# downloaded dataset files and is reused by the loading cell.
path = kagglehub.dataset_download(DATASET_HANDLE)

print(f"Path to dataset files: {path}")

Path to dataset files: /kaggle/input/human-conversation-training-data


In [None]:
import os

# Number of leading characters removed from every line of the transcript
# (presumably the "Human N: " speaker tag -- confirm against the raw file).
SPEAKER_PREFIX_LEN = 9

file_path = os.path.join(path, "human_chat.txt")

# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default locale encoding.
with open(file_path, 'r', encoding='utf-8') as f:
    file_content = f.readlines()

# Strip the prefix from each line and join once, rather than growing a string
# inside a loop. Line terminators are kept, so `data` stays newline-separated.
data = ''.join(line[SPEAKER_PREFIX_LEN:] for line in file_content)


In [None]:
import numpy as np

In [None]:
# Preview only the beginning of the corpus: echoing the entire string floods
# the cell output and bloats the saved notebook.
data[:500]

'Hi!\nWhat is your favorite holiday?\none where I get to meet lots of different people.\nWhat was the most number of people you have ever met during a holiday?\nHard to keep a count. Maybe 25.\nWhich holiday was that?\nI think it was Australia\nDo you still talk to the people you met?\nNot really. The interactions are usually short-lived but it\'s fascinating to learn where people are coming from and what matters to them\nYea, me too. I feel like God often puts strangers in front of you, and gives you an opportunity to connect with them in that moment in deeply meaningful ways. Do you ever feel like you know things about strangers without them telling you?\nwhat do you mean?\nI think it\'s like a 6th sense, often seen as "cold readings" to people, but can be remarkably accurate. I once sat next to a man in a coffee and I felt a pain in my back. I asked the stranger if he had a pain. It turns out that he did in the exact spot, and said he pulled a muscle while dancing at a party. I had 

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# The whole corpus is handed to the tokenizer as a single document so that the
# vocabulary is learned from every line at once.
corpus = [data]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [None]:
# Word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index

# Size of the id space: one slot per known word plus one extra, because the
# Keras tokenizer never assigns id 0 to a word (it is used for padding here).
total_words = 1 + len(word_index)
print(total_words)

2809


In [None]:
input_sequences = []  # model inputs (x): leading slices of each tokenised line
output = []           # targets (y): the token id that follows each slice

for raw_line in data.split('\n'):
  token_ids = tokenizer.texts_to_sequences([raw_line])[0]
  # For every split point, the tokens before it form the input and the token
  # at the split point is the word to predict. Lines with fewer than two
  # tokens contribute nothing.
  for cut in range(1, len(token_ids)):
    input_sequences.append(token_ids[:cut])
    output.append(token_ids[cut])


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Length of the longest input sequence; shorter ones are left-padded with
# zeros up to this width so that all rows have the same length.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)

In [None]:
from keras.utils import to_categorical

# One-hot encode the next-word ids: one row per training sample and
# `total_words` columns.
categorical_output = to_categorical(output, num_classes=total_words)
print(categorical_output.shape[0])

18070


In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.regularizers import l1_l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Elastic-net penalty strengths applied to the LSTM's input and recurrent
# weight matrices.
L1_PENALTY = 1e-5
L2_PENALTY = 1e-4

# Recurrent core: only the final state is returned (return_sequences=False),
# with dropout on both the inputs and the recurrent connections.
recurrent_core = LSTM(
    units=128,
    return_sequences=False,
    dropout=0.3,
    recurrent_dropout=0.2,
    kernel_regularizer=l1_l2(l1=L1_PENALTY, l2=L2_PENALTY),
    recurrent_regularizer=l1_l2(l1=L1_PENALTY, l2=L2_PENALTY),
)

model = Sequential([
    # 64-dimensional word embeddings; mask_zero=True makes downstream layers
    # ignore the zero padding ids.
    Embedding(input_dim=total_words, output_dim=64, mask_zero=True),
    # The LSTM is run over the sequence in both directions.
    Bidirectional(recurrent_core),
    # Small dense layer between the recurrent output and the classifier.
    Dense(32, activation='relu'),
    # Probability distribution over the vocabulary (one unit per word id).
    Dense(units=total_words, activation='softmax'),
])

# The targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# The training inputs are padded to exactly `max_sequence_len` tokens, so the
# model is built for that width. Building with `max_sequence_len - 1` made the
# summary describe an input shape that differs from the data passed to fit().
model.build(input_shape=(None, max_sequence_len))
model.summary()

In [None]:

# First training run: 20 epochs in mini-batches of 256, with 20% of the
# samples held out for validation (Keras takes them from the end of the
# arrays).
model.fit(
    x=input_sequences,
    y=categorical_output,
    batch_size=256,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 2s/step - accuracy: 0.0295 - loss: 7.7918 - val_accuracy: 0.0277 - val_loss: 6.7790
Epoch 2/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 2s/step - accuracy: 0.0281 - loss: 6.4887 - val_accuracy: 0.0277 - val_loss: 6.7373
Epoch 3/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 2s/step - accuracy: 0.0308 - loss: 6.3135 - val_accuracy: 0.0310 - val_loss: 6.7335
Epoch 4/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 2s/step - accuracy: 0.0395 - loss: 6.2381 - val_accuracy: 0.0296 - val_loss: 6.6931
Epoch 5/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 2s/step - accuracy: 0.0382 - loss: 6.1654 - val_accuracy: 0.0423 - val_loss: 6.6590
Epoch 6/20
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 2s/step - accuracy: 0.0381 - loss: 6.0880 - val_accuracy: 0.0418 - val_loss: 6.7123
Epoch 7/20
[1m57/57[0m [32m━━━━

<keras.src.callbacks.history.History at 0x7c3318dba9f0>

In [None]:
# Second call to fit() on the same model: training continues from the weights
# left by the previous run for another 20 epochs. Afterwards `model.history`
# refers to this call only.
training_args = dict(epochs=20, validation_split=0.2, batch_size=256)
model.fit(input_sequences, categorical_output, **training_args)

Epoch 21/50
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 2s/step - accuracy: 0.1179 - loss: 5.1076 - val_accuracy: 0.1060 - val_loss: 7.2250
Epoch 22/50
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 2s/step - accuracy: 0.1269 - loss: 5.0352 - val_accuracy: 0.1093 - val_loss: 7.2387
Epoch 23/50
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 2s/step - accuracy: 0.1257 - loss: 4.9890 - val_accuracy: 0.1151 - val_loss: 7.3030
Epoch 24/50
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 2s/step - accuracy: 0.1295 - loss: 4.9410 - val_accuracy: 0.1148 - val_loss: 7.3155
Epoch 25/50
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 2s/step - accuracy: 0.1341 - loss: 4.8875 - val_accuracy: 0.1168 - val_loss: 7.4150
Epoch 26/50
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 2s/step - accuracy: 0.1346 - loss: 4.8303 - val_accuracy: 0.1140 - val_loss: 7.5711
Epoch 27/50
[1m57/57[0m [

<keras.src.callbacks.history.History at 0x7c32442753d0>

In [None]:
# Number of words appended to the seed text.
NUM_WORDS_TO_GENERATE = 10

# Seed text typed by the user; it is extended greedily one word at a time.
test_sentense = input()
for _ in range(NUM_WORDS_TO_GENERATE):
  tokenised_sentense = tokenizer.texts_to_sequences([test_sentense])[0]
  # Pad to the same width the training inputs were padded to, so inference
  # sees sequences shaped (and truncated) exactly like the training data.
  input_test_sequence = pad_sequences(
      [tokenised_sentense], maxlen=max_sequence_len, padding='pre'
  )
  # verbose=0 suppresses the per-call progress bar that otherwise interleaves
  # with the generated words.
  probabilities = model.predict(input_test_sequence, verbose=0)
  predicted_word = int(np.argmax(probabilities))
  # Direct id -> word lookup instead of scanning `word_index` for every word.
  next_word = tokenizer.index_word.get(predicted_word)
  if next_word is None:
    # No word has this id (e.g. the padding id 0). The sentence would stay
    # unchanged and every later step would repeat the same prediction, so stop.
    break
  test_sentense += " " + next_word
  print(next_word)

print(test_sentense)


hi
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
i
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
am
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
a
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
lot
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
of
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
work
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
but
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
i
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
think
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
to
hi i am a lot of work but i think to


In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by the most recent fit() call on `model`.
metrics = model.history.history

# Training vs. validation accuracy on a single set of axes.
fig, ax = plt.subplots()
ax.plot(metrics['accuracy'])
ax.plot(metrics['val_accuracy'])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()

TypeError: 'History' object is not subscriptable

In [None]:
!pip install tensorflow