<a href="https://colab.research.google.com/github/rahulbedjavalge/columbia-ml-course/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Input, Dense, Flatten, Conv2D, MaxPooling2D, Activation
from tensorflow.keras.applications import MobileNet
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
import numpy as np

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Fetch the 20 Newsgroups dataset with scikit-learn's default arguments.
# NOTE(review): this object is deleted a few cells later without being
# read — confirm the download is actually needed.
full_data = fetch_20newsgroups()

In [None]:
# Restrict the task to two newsgroups so the labels are binary.
my_categories = ['sci.space','sci.med']

In [None]:
# Release the full dataset; only the category-filtered subset is used below.
del(full_data)

In [None]:
# Fetch only the posts that belong to the selected categories.
data = fetch_20newsgroups(categories=my_categories)

In [None]:
# Raw message texts, one string per post.
X = data['data']

In [None]:
# Integer class label for each post, aligned with X.
y = data['target']

In [None]:
import re

In [None]:
def cleaning_text(text):
    """Tokenize each document into a list of word tokens.

    A token is a run of two or more word characters delimited by word
    boundaries, so punctuation and single-character tokens are dropped.
    Case is preserved.

    Args:
        text: iterable of strings, one per document.

    Returns:
        A list holding one list of token strings per input document, in
        input order.
    """
    token_pattern = re.compile('(?u)\\b\\w\\w+\\b')
    return [token_pattern.findall(document) for document in text]

In [None]:
# Tokenize every email, then report the corpus size and the token count of
# the first email as a quick sanity check.
clean_X = cleaning_text(X)
print(f'number of emails: {len(clean_X)} number of words in first email: {len(clean_X[0])}')

number of emails: 1187 number if words in emails: 91


In [None]:
# Collect every distinct token across all emails. Sorting makes the
# vocabulary order (and therefore each word's integer id) identical on every
# run; iterating a bare set of strings gives an arbitrary, run-dependent order.
vocab_list = sorted({word for email in clean_X for word in email})
len(vocab_list)

31696

In [None]:
# Build the two lookup tables between tokens and integer ids.
# Ids start at 1, which leaves 0 unused by any word.
num_to_word = dict(enumerate(vocab_list, start=1))
word_to_num = {token: idx for idx, token in num_to_word.items()}


In [None]:
# Preview a handful of entries instead of dumping the whole mapping into
# the notebook output.
dict(list(word_to_num.items())[:10])

{'do': 1,
 'PM10': 2,
 'Greenville': 3,
 'Ciba': 4,
 'composites': 5,
 'partway': 6,
 'extruded': 7,
 'flora': 8,
 'Dinner': 9,
 'industrially': 10,
 'ekcolor': 11,
 'Rutherford': 12,
 'Sternberg': 13,
 'Lab': 14,
 'Hobday': 15,
 'immature': 16,
 'GETS': 17,
 'tooke': 18,
 'zwarte': 19,
 'feel': 20,
 '4620': 21,
 'Derive': 22,
 'fd': 23,
 'SYang': 24,
 'thoughts': 25,
 'employee': 26,
 'astronaut_733694515': 27,
 '42211': 28,
 'Nutrasweet': 29,
 'articulated': 30,
 'unsubscribed': 31,
 'sterren': 32,
 'Burroughs': 33,
 'REVENUES': 34,
 'sbrun': 35,
 'incompatible': 36,
 'Doctoral': 37,
 'unauthorized': 38,
 'delab': 39,
 'npm': 40,
 'pulling': 41,
 'inordinate': 42,
 'INFO': 43,
 'gabe': 44,
 'MNRAS': 45,
 'operation': 46,
 'Dismutase': 47,
 'supports': 48,
 'sasghm': 49,
 'p17': 50,
 'dumbells': 51,
 'PH': 52,
 'Range': 53,
 'antiquated': 54,
 'investigated': 55,
 '03': 56,
 '1K9': 57,
 'PROPONENT': 58,
 'Mazur': 59,
 '642': 60,
 'judith': 61,
 'Cherkauer': 62,
 'Europeenne': 63,
 'te

In [None]:
# Encode every email as the list of integer ids of its tokens.
word_vecs = [
    [word_to_num[token] for token in tokens]
    for tokens in clean_X
]



In [None]:
# Find the token count of the longest email and the index of the first email
# that reaches it (both are 0 when there are no emails).
email_lengths = [len(tokens) for tokens in clean_X]
max_len = max(email_lengths, default=0)
position = email_lengths.index(max_len) if email_lengths else 0
print(f"length of longest email = {max_len}, in email number ={position}")

length of longest email = 5937, in email number =1169


In [None]:
# Left-pad every sequence to the length of the longest email so that all
# rows have the same length.
pad_word_vecs = sequence.pad_sequences(word_vecs,
                                       maxlen=max_len,
                                       padding='pre')

In [None]:
# Inspect one padded sequence; shorter emails are filled at the front
# because padding='pre' was used.
pad_word_vecs[1168]

array([    0,     0,     0, ..., 31252,  3689, 24292], dtype=int32)

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Hold out a test set (scikit-learn's default test size). A fixed seed makes
# the split reproducible, and stratifying on y keeps the class balance the
# same in the train and test parts.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    pad_word_vecs, y, random_state=42, stratify=y
)

In [None]:
# Start an empty sequential model; layers are appended in the cells below.
model = Sequential()

In [None]:
# Sanity-check the training matrix shape: (number of samples, max_len).
Xtrain.shape

(890, 5937)

In [None]:
# Word ids run from 1 to len(vocab_list) and 0 is used for padding, so the
# embedding table needs len(vocab_list) + 1 rows (largest index + 1).
# Without the +1 the highest word id falls outside the table.
model.add(Embedding(len(vocab_list) + 1, 16, input_length=max_len))



In [None]:
# Recurrent layer with 64 units that reads the embedded token sequence.
model.add(LSTM(64))

In [None]:
# Single-unit output for binary classification. Sigmoid maps the logit to a
# probability in (0, 1); softmax over one unit would always output 1.0 and
# give the network nothing to learn from.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer architecture of the model.
model.summary()

In [None]:
# Configure training: RMSprop optimizer, binary cross-entropy loss for the
# two-class target, and accuracy reported as the metric.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train on the training split for 10 epochs with mini-batches of 10 samples.
model.fit(Xtrain, ytrain, epochs=10, batch_size=10)

Epoch 1/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 135ms/step - accuracy: 0.5008 - loss: 0.0615
Epoch 2/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 134ms/step - accuracy: 0.5384 - loss: 0.0596
Epoch 3/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 145ms/step - accuracy: 0.5040 - loss: 0.0224
Epoch 4/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 135ms/step - accuracy: 0.5027 - loss: 0.0075
Epoch 5/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 134ms/step - accuracy: 0.5143 - loss: 0.0201
Epoch 6/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 135ms/step - accuracy: 0.5072 - loss: 0.0140
Epoch 7/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 135ms/step - accuracy: 0.5033 - loss: 5.3423e-04
Epoch 8/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 135ms/step - accuracy: 0.5132 - loss: 0.0165
Epoch 9/10
[1m89/89[0m [3

<keras.src.callbacks.history.History at 0x7fd31d402150>

In [None]:
# Score the model on the held-out split. evaluate() returns the loss followed
# by the compiled metrics, which here is accuracy only.
loss, accuracy = model.evaluate(Xtest, ytest, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")



Test Loss: 0.2669
Test Accuracy: 0.4882


In [None]:
# Persist the trained model to disk as an HDF5 (.h5) file.
# NOTE(review): recent Keras releases recommend the native .keras format —
# confirm whether the legacy .h5 format is required here.
model.save("my_model.h5")




In [None]:
# Reload the saved model. load_model is never imported on its own in this
# notebook, so reach it through the `keras` module imported at the top.
model = keras.models.load_model('my_model.h5')


