In [None]:
!pip install -q kaggle

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download priyamchoksi/1-million-reddit-jokes-rjokes

Dataset URL: https://www.kaggle.com/datasets/priyamchoksi/1-million-reddit-jokes-rjokes
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading 1-million-reddit-jokes-rjokes.zip to /content
  0% 0.00/92.9M [00:00<?, ?B/s]
100% 92.9M/92.9M [00:00<00:00, 1.06GB/s]


In [None]:
!unzip 1-million-reddit-jokes-rjokes.zip -d reddit_jokes

Archive:  1-million-reddit-jokes-rjokes.zip
  inflating: reddit_jokes/one-million-reddit-jokes.csv  


In [None]:
import pandas as pd
import re

In [None]:
df = pd.read_csv("/content/reddit_jokes/one-million-reddit-jokes.csv")
df.head()

Unnamed: 0,type,id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,domain,url,selftext,title,score
0,post,ftbp1i,2qh72,jokes,False,1585785543,https://old.reddit.com/r/Jokes/comments/ftbp1i...,self.jokes,,My corona is covered with foreskin so it is no...,I am soooo glad I'm not circumcised!,2
1,post,ftboup,2qh72,jokes,False,1585785522,https://old.reddit.com/r/Jokes/comments/ftboup...,self.jokes,,It's called Google Sheets.,Did you know Google now has a platform for rec...,9
2,post,ftbopj,2qh72,jokes,False,1585785508,https://old.reddit.com/r/Jokes/comments/ftbopj...,self.jokes,,The vacuum doesn't snore after sex.\n\n&amp;#x...,What is the difference between my wife and my ...,15
3,post,ftbnxh,2qh72,jokes,False,1585785428,https://old.reddit.com/r/Jokes/comments/ftbnxh...,self.jokes,,[removed],My last joke for now.,9
4,post,ftbjpg,2qh72,jokes,False,1585785009,https://old.reddit.com/r/Jokes/comments/ftbjpg...,self.jokes,,[removed],The Nintendo 64 turns 18 this week...,134


In [None]:
print(df.columns)

Index(['type', 'id', 'subreddit.id', 'subreddit.name', 'subreddit.nsfw',
       'created_utc', 'permalink', 'domain', 'url', 'selftext', 'title',
       'score'],
      dtype='object')


In [None]:
df['selftext'] = df['selftext'].fillna('')

df['text'] = df['title'] + ' ' + df['selftext']
df = df[['text']]

df.head()

Unnamed: 0,text
0,I am soooo glad I'm not circumcised! My corona...
1,Did you know Google now has a platform for rec...
2,What is the difference between my wife and my ...
3,My last joke for now. [removed]
4,The Nintendo 64 turns 18 this week... [removed]


In [None]:
df = df[df['text'].str.strip() != '']

In [None]:
def clean_text(text):
  text = text.lower()
  text = re.sub(r'[^a-z0-9\s]', '', text)
  text = re.sub(r'\s+', ' ', text)
  text = text.strip()
  return text

In [None]:
df['clean_text'] = df['text'].apply(clean_text)
df.sample(5)

Unnamed: 0,text,clean_text
633779,Orgre ale is superior to other beers For Shrek...,orgre ale is superior to other beers for shrek...
723086,Mariah Carey bombs at NYC New Years celebratio...,mariah carey bombs at nyc new years celebratio...
210360,The 6th grade science teacher asks her class a...,the 6th grade science teacher asks her class a...
779218,You mom is so fat [removed],you mom is so fat removed
883446,Donald Trump is visiting a school [removed],donald trump is visiting a school removed


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np
import glob

In [None]:
tokenizer = Tokenizer()

In [None]:
tokenizer.fit_on_texts(df['clean_text'])
print(len(tokenizer.word_index))
print(tokenizer.word_index)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

193708


In [None]:
input_sequences = []

for idx, sen in enumerate(df['clean_text']):
  tokenized_sen = tokenizer.texts_to_sequences([sen])[0]

  for i in range(1, len(tokenized_sen)):
    input_sequences.append(tokenized_sen[:i+1])

  if idx % 1000 ==0 and idx>0:
    np.save(f"seq_batch_{idx}.npy", np.array(input_sequences, dtype =object))
    print(f"saved batch: {idx}")
    input_sequences = []

saved batch: 1000
saved batch: 2000
saved batch: 3000
saved batch: 4000
saved batch: 5000
saved batch: 6000
saved batch: 7000
saved batch: 8000
saved batch: 9000
saved batch: 10000
saved batch: 11000
saved batch: 12000
saved batch: 13000
saved batch: 14000
saved batch: 15000
saved batch: 16000
saved batch: 17000
saved batch: 18000
saved batch: 19000
saved batch: 20000
saved batch: 21000
saved batch: 22000
saved batch: 23000
saved batch: 24000
saved batch: 25000
saved batch: 26000
saved batch: 27000
saved batch: 28000
saved batch: 29000
saved batch: 30000
saved batch: 31000
saved batch: 32000
saved batch: 33000
saved batch: 34000
saved batch: 35000
saved batch: 36000
saved batch: 37000
saved batch: 38000
saved batch: 39000
saved batch: 40000
saved batch: 41000
saved batch: 42000
saved batch: 43000
saved batch: 44000
saved batch: 45000
saved batch: 46000
saved batch: 47000
saved batch: 48000
saved batch: 49000
saved batch: 50000
saved batch: 51000
saved batch: 52000
saved batch: 53000
sa

In [None]:
files = sorted(glob.glob("seq_batch_*.npy"))

In [None]:
max_seq_len =0;
for file in files:
  data = np.load(file, allow_pickle = True)
  for seq in data:
      max_seq_len = max(max_seq_len, len(seq))

print(max_seq_len)

vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

7576
193708


In [None]:
model = Sequential()

model.add(Embedding(vocab_size, 128))
# model.add(LSTM(150, return_sequences=True))
model.add(LSTM(150, dropout =0.3))
model.add(Dense(vocab_size, activation='softmax'))

In [None]:
model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])

In [None]:
import gc

In [None]:
for file in files:
  print(f"\n1 processing batch: {file}")
  sequences = np.load(file, allow_pickle=True)
  print(f"\n2 processing batch: {file}")

  padded_sequence = pad_sequences(sequences, maxlen= max_seq_len, padding= 'pre')
  print(f"\n3 processing batch: {file}")

  x = padded_sequence[:, :-1]
  y = padded_sequence[:, -1]

  print(f"\n4 processing batch: {file}")

  model.fit(x,y, epochs =1, batch_size=25)
  print(f"\n5 processing batch: {file}")

  del x, y, padded_sequence, sequences
  gc.collect()
  print(f"\n6 processing batch: {file}")



1 processing batch: seq_batch_1000.npy

2 processing batch: seq_batch_1000.npy

3 processing batch: seq_batch_1000.npy

4 processing batch: seq_batch_1000.npy
[1m  67/1097[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m5:49:12[0m 20s/step - accuracy: 0.0343 - loss: 11.4646

KeyboardInterrupt: 

In [None]:
# for file in files:
#   print(f"\n1 processing batch: {file}")
#   sequences = np.load(file, allow_pickle=True)

#   padded_sequence = pad_sequences(sequences, maxlen= max_seq_len, padding= 'pre')

#   x = padded_sequence[:, :-1]
#   y = padded_sequence[:, -1]

#   del x, y, padded_sequence, sequences
#   gc.collect()
#   print(f"\n1 processing batch: {file}, deleted successfully")


1 processing batch: seq_batch_1000.npy

1 processing batch: seq_batch_1000.npy, deleted successfully

1 processing batch: seq_batch_10000.npy

1 processing batch: seq_batch_10000.npy, deleted successfully

1 processing batch: seq_batch_100000.npy

1 processing batch: seq_batch_100000.npy, deleted successfully

1 processing batch: seq_batch_101000.npy

1 processing batch: seq_batch_101000.npy, deleted successfully

1 processing batch: seq_batch_102000.npy

1 processing batch: seq_batch_102000.npy, deleted successfully

1 processing batch: seq_batch_103000.npy

1 processing batch: seq_batch_103000.npy, deleted successfully

1 processing batch: seq_batch_104000.npy

1 processing batch: seq_batch_104000.npy, deleted successfully

1 processing batch: seq_batch_105000.npy

1 processing batch: seq_batch_105000.npy, deleted successfully

1 processing batch: seq_batch_106000.npy

1 processing batch: seq_batch_106000.npy, deleted successfully

1 processing batch: seq_batch_107000.npy

1 process

In [26]:
print(model.summary())

None
