In [36]:
!pip uninstall tensorflow -y


Found existing installation: tensorflow 2.15.0
Uninstalling tensorflow-2.15.0:
  Successfully uninstalled tensorflow-2.15.0


In [37]:
!pip install tensorflow


Collecting tensorflow
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting h5py>=3.10.0 (from tensorflow)
  Downloading h5py-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes~=0.3.1 (from tensorflow)
  Downloading ml_dtypes-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.17,>=2.16 (from tensorflow)
  Downloading tensorboard-2.16.2-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCo

In [1]:
import tensorflow as tf
print(tf.__version__)


2.16.1


In [3]:
!pip install keras



In [7]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint


In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Load the dataset
df = pd.read_csv("Dataset.csv", header=None)

In [10]:
# Renaming columns
df.columns = ['email_type', 'content']


In [11]:
# Check for invalid label values
invalid_labels = df[~df['email_type'].isin(range(1, 19))]
if not invalid_labels.empty:
    print("Invalid label values found:")
    print(invalid_labels)

Invalid label values found:
   email_type content
0           0       1


In [12]:
# Remove rows with invalid label values
df = df[df['email_type'].isin(range(1, 19))]

In [13]:
# Data preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters (retain numbers)
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])  # Remove stopwords
    return text

df['content'] = df['content'].apply(preprocess_text)

In [14]:
# Tokenization and Padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['content'])
X_seq = tokenizer.texts_to_sequences(df['content'])
max_length = 500
X_pad = pad_sequences(X_seq, maxlen=max_length, padding='post')


In [15]:
# Update labels
y = df['email_type'].values - 1  # Convert labels to 0-based indices

In [16]:
# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

In [18]:
# Define the model
embedding_dim = 300
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.5))
model.add(Dense(18, activation='softmax'))


In [19]:
# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [21]:
# Train the model
early_stop = EarlyStopping(monitor='val_loss', patience=3)
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)
history = model.fit(X_train, y_train, epochs=45, batch_size=32, validation_data=(X_test, y_test),
                    callbacks=[early_stop, checkpoint])

Epoch 1/45
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 3s/step - accuracy: 0.1882 - loss: 2.7358 - val_accuracy: 0.6056 - val_loss: 1.5809
Epoch 2/45
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 3s/step - accuracy: 0.6735 - loss: 1.2027 - val_accuracy: 0.9444 - val_loss: 0.3690
Epoch 3/45
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 2s/step - accuracy: 0.9415 - loss: 0.3288 - val_accuracy: 0.9639 - val_loss: 0.1637
Epoch 4/45
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 2s/step - accuracy: 0.9839 - loss: 0.1114 - val_accuracy: 0.9639 - val_loss: 0.1156
Epoch 5/45
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 2s/step - accuracy: 0.9869 - loss: 0.0744 - val_accuracy: 0.9722 - val_loss: 0.0909
Epoch 6/45
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 3s/step - accuracy: 0.9926 - loss: 0.0427 - val_accuracy: 0.9694 - val_loss: 0.1022
Epoch 7/45
[1m45/45[0m [32m━━━━

In [22]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 604ms/step - accuracy: 0.9800 - loss: 0.0644
Test Loss: 0.09080126881599426, Test Accuracy: 0.9722222089767456


In [23]:
# Save tokenizer and model
model.save("email_classification_model.h5")
tokenizer_json = tokenizer.to_json()
with open("tokenizer.json", "w") as json_file:
    json_file.write(tokenizer_json)




ValueError: Unable to synchronously create dataset (name already exists)