**Dataset Loading**

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("/content/spam_ham_dataset.csv")

# Exploratory Data Analysis (EDA)
print("Data Shape:", data.shape)
print("Columns:", data.columns)
print("Sample Data:")
print(data.head())

Data Shape: (5171, 4)
Columns: Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')
Sample Data:
   Unnamed: 0 label                                               text  \
0         605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1        2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2        3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3        4685  spam  Subject: photoshop , windows , office . cheap ...   
4        2030   ham  Subject: re : indian springs\r\nthis deal is t...   

   label_num  
0          0  
1          0  
2          0  
3          1  
4          0  


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Data Cleaning**

In [11]:
data.drop_duplicates(inplace=True)
# Handle missing values
data.dropna(inplace=True)
# Text Cleaning
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords and apply stemming
    tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    # Join tokens back into a single string
    cleaned_text = ' '.join(tokens)
    return cleaned_text

**Data Spliting**

In [12]:
data['cleaned_text'] = data['text'].apply(clean_text)
# Feature Engineering
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['cleaned_text'])
# Encoding target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Building the TensorFlow Model& Model Evalution**

In [30]:
import pandas as pd
import numpy as np
import scipy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv("/content/spam_ham_dataset.csv")

# Text preprocessing
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the TensorFlow model
model = Sequential([
    Dense(64, input_shape=(X_train.shape[1],), activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
print("Type of X_train:", type(X_train))
print("Type of y_train:", type(y_train))
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
if type(X_train) == scipy.sparse.csr.csr_matrix:
    X_train = X_train.toarray()

if type(y_train) == np.ndarray:
    y_train = y_train.reshape(-1, 1)
model.fit(X_train, y_train, epochs=10, batch_size=32)

Type of X_train: <class 'scipy.sparse._csr.csr_matrix'>
Type of y_train: <class 'numpy.ndarray'>
Shape of X_train: (4136, 50447)
Shape of y_train: (4136,)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78a2896b1390>

In [34]:
X_test = scipy.sparse.csr_matrix(X_test.toarray())
y_pred = model.predict(X_test)
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, mean_absolute_error

if type(y_test) == type(y_pred) == np.ndarray and np.isin(y_test, [0, 1]).all() and np.isin(y_pred, [0, 1]).all():
    accuracy = accuracy_score(y_test, y_pred)
elif type(y_test) == np.ndarray and np.isin(y_test, [0, 1]).all() and not np.isin(y_pred, [0, 1]).all():
    accuracy = roc_auc_score(y_test, y_pred)
else:
    accuracy = mean_squared_error(y_test, y_pred)

print("Test Accuracy:", accuracy)

Test Accuracy: 0.9994526370017387


**Analyzing Model Coefficients**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv("spam_ham_dataset.csv")

# Text preprocessing
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the TensorFlow model
model = Sequential([
    Dense(64, input_shape=(X_train.shape[1],), activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
print("Type of X_train:", type(X_train))
print("Type of y_train:", type(y_train))
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
if type(X_train) == scipy.sparse.csr.csr_matrix:
    X_train = X_train.toarray()

if type(y_train) == np.ndarray:
    y_train = y_train.reshape(-1, 1)
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Extracting the weights of the first layer
weights_first_layer = model.layers[0].get_weights()[0]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
print("Shape of X_train:", X_train.shape)
print("Shape of weights_first_layer:", weights_first_layer.shape)
if type(X_train) == scipy.sparse.csr.csr_matrix:
    X_train = X_train.toarray()
if type(y_train) == np.ndarray:
    y_train = y_train.reshape(-1, 1)
weights_first_layer = weights_first_layer.ravel()
plt.figure(figsize=(10, 6))
plt.bar(range(len(weights_first_layer)), weights_first_layer.ravel())
plt.xlabel('Feature Index')
plt.ylabel('Weight')
plt.title('Importance of Features (First Layer)')
plt.show()

Shape of X_train: (4136, 50447)
Shape of weights_first_layer: (50447, 64)
