In [7]:
import pandas as pd
import numpy as np
data = pd.read_csv('spam.csv', encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [8]:
data.shape

(5572, 5)

In [9]:
label_distribution = data['v1'].value_counts()
label_distribution

ham     4825
spam     747
Name: v1, dtype: int64

In [10]:
non_null_counts = data[['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']].notnull().sum()
non_null_values = data[data['Unnamed: 2'].notnull() | data['Unnamed: 3'].notnull() | data['Unnamed: 4'].notnull()][['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']]
non_null_counts, non_null_values.head()


(Unnamed: 2    50
 Unnamed: 3    12
 Unnamed: 4     6
 dtype: int64,
                                             Unnamed: 2             Unnamed: 3  \
 95                                         PO Box 5249   MK17 92H. 450Ppw 16"   
 281   the person is definitely special for u..... B...       why to miss them   
 444   HOWU DOIN? FOUNDURSELF A JOBYET SAUSAGE?LOVE ...                    NaN   
 671   wanted to say hi. HI!!!\" Stop? Send STOP to ...                    NaN   
 710    this wont even start........ Datz confidence.."                    NaN   
 
                          Unnamed: 4  
 95                              NaN  
 281   just Keep-in-touch\" gdeve.."  
 444                             NaN  
 671                             NaN  
 710                             NaN  )

In [11]:
from collections import Counter


def compute_tf(text):
    words = text.split()
    total_words = len(words)
    word_counts = Counter(words)
    tf = {word: count / total_words for word, count in word_counts.items()}

    return tf

data['tf'] = data['v2'].apply(compute_tf)
data[['v2', 'tf']].head()


Unnamed: 0,v2,tf
0,"Go until jurong point, crazy.. Available only ...","{'Go': 0.05, 'until': 0.05, 'jurong': 0.05, 'p..."
1,Ok lar... Joking wif u oni...,"{'Ok': 0.16666666666666666, 'lar...': 0.166666..."
2,Free entry in 2 a wkly comp to win FA Cup fina...,"{'Free': 0.03571428571428571, 'entry': 0.07142..."
3,U dun say so early hor... U c already then say...,"{'U': 0.18181818181818182, 'dun': 0.0909090909..."
4,"Nah I don't think he goes to usf, he lives aro...","{'Nah': 0.07692307692307693, 'I': 0.0769230769..."


In [12]:
from sklearn.feature_extraction.text import CountVectorizer  
subset_data = data['v2'].head(100)
vectorizer = CountVectorizer()
word_counts = vectorizer.fit_transform(subset_data)
tf_df = pd.DataFrame(word_counts.toarray(), columns=vectorizer.get_feature_names())
tf_df = tf_df.divide(tf_df.sum(axis=1), axis=0)

tf_df.head()


Unnamed: 0,000,07732584351,0800,08000930705,08002986030,08452810075over18,09061209465,09061701461,09066364589,10,...,yesterday,yo,you,your,yours,yourself,yummy,yup,ì_,ì¼1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.037037,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
tf_df.shape

(100, 724)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['v2'])
tfidf_data = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names())
tfidf_data.head()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,ó_,û_,û_thanks,ûªm,ûªt,ûªve,ûï,ûïharry,ûò,ûówell
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['v2'])

X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, data['v1'], test_size=0.2, random_state=42)

lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)

y_pred = lr_classifier.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

accuracy, conf_matrix


(0.9623318385650225,
 array([[965,   0],
        [ 42, 108]]))

In [16]:
def extract_top_terms(message_index, tfidf_matrix, feature_names, N=5):
    row_data = tfidf_matrix[message_index].toarray().flatten()

    top_indices = row_data.argsort()[-N:][::-1]

    top_terms = [(feature_names[i], row_data[i]) for i in top_indices]

    return top_terms

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['v2'])


feature_names = tfidf_vectorizer.get_feature_names()

top_terms_data = {i: extract_top_terms(i, tfidf_matrix, feature_names) for i in range(10)}

top_terms_data


{0: [('jurong', 0.3264252905795869),
  ('amore', 0.3264252905795869),
  ('buffet', 0.3116082237740733),
  ('bugis', 0.2757654045621182),
  ('cine', 0.2757654045621182)],
 1: [('oni', 0.5465881710238072),
  ('joking', 0.5236458071582338),
  ('wif', 0.4316010362639011),
  ('lar', 0.4082988561907181),
  ('ok', 0.27211951321382544)],
 2: [('fa', 0.46025256453051905),
  ('entry', 0.3527103027641593),
  ('08452810075over18', 0.23012628226525952),
  ('21st', 0.2223624014303424),
  ('2005', 0.2223624014303424)],
 3: [('say', 0.588532244886041),
  ('hor', 0.48845710205212745),
  ('early', 0.3528609993425001),
  ('dun', 0.3250496221664022),
  ('already', 0.293626081506221)],
 4: [('he', 0.4317278554771633),
  ('lives', 0.38590759651744017),
  ('nah', 0.34795073992273934),
  ('usf', 0.3437619549300075),
  ('goes', 0.3065400844986221)],
 5: [('chgs', 0.3058662925725672),
  ('rcv', 0.2919824379160986),
  ('tb', 0.27449084967991694),
  ('darling', 0.26824783161338317),
  ('std', 0.24451322531066766)

In [18]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

n_topics = 5

vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
doc_term_matrix = vectorizer.fit_transform(data['v2'])


lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(doc_term_matrix)

top_terms_per_topic = {}
for topic_idx, topic in enumerate(lda.components_):
    top_features = topic.argsort()[-10:][::-1]
    top_terms = [vectorizer.get_feature_names()[i] for i in top_features]
    top_terms_per_topic[topic_idx] = top_terms

top_terms_per_topic


{0: ['ll',
  'later',
  'ok',
  'got',
  'sorry',
  'know',
  'home',
  'ur',
  'want',
  'text'],
 1: ['going',
  'lor',
  'come',
  'today',
  'claim',
  'wat',
  'good',
  'prize',
  'da',
  'just'],
 2: ['day',
  'just',
  'need',
  'yes',
  'love',
  'stop',
  'good',
  'free',
  'happy',
  'babe'],
 3: ['gt', 'lt', 'ur', 'ok', 'just', 'know', 'sent', 'good', 'send', 'text'],
 4: ['free',
  'pls',
  'number',
  'ur',
  'nokia',
  'new',
  'time',
  'just',
  'tone',
  'lol']}

In [19]:
data = data[['v1', 'v2']]
data.columns = ['label', 'message']
missing_values = data.isnull().sum()

missing_values

label      0
message    0
dtype: int64

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=5000)


tfidf_features = tfidf_vectorizer.fit_transform(data['message'])

tfidf_features.shape


(5572, 3956)

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Assuming 'data' is your DataFrame and 'text' and 'label' are column names

# Create a TfidfVectorizer instance
tfidf_vectorizer = TfidfVectorizer()

# Generate TF-IDF features
tfidf_features = tfidf_vectorizer.fit_transform(data['text'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, data['label'], test_size=0.2, random_state=42)

# Initialize the SVM with a linear kernel
svm_linear = SVC(kernel='linear', C=1)

# Train the model
svm_linear.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_linear.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, output_dict=True)

# Convert classification report into DataFrame
classification_report_df = pd.DataFrame(classification_rep).transpose()

# Output accuracy and classification report DataFrame
accuracy, classification_report_df


(0.979372197309417,
               precision    recall  f1-score      support
 ham            0.978659  0.997927  0.988199   965.000000
 spam           0.984733  0.860000  0.918149   150.000000
 accuracy       0.979372  0.979372  0.979372     0.979372
 macro avg      0.981696  0.928964  0.953174  1115.000000
 weighted avg   0.979476  0.979372  0.978775  1115.000000)

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Re-load the dataset since the previous environment was reset
data_path = 'spam.csv'
data = pd.read_csv(data_path, encoding='latin-1')
data = data[['v1', 'v2']]  # We keep only the necessary columns
data.columns = ['label', 'text']  # Rename the columns for convenience

# Initialize a Bag of Words vectorizer
bow_vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=5000)

# Fit and transform the data
bow_features = bow_vectorizer.fit_transform(data['text'])

# Split the data into training and testing sets
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(bow_features, data['label'], test_size=0.2, random_state=42)

# Initialize the SVM with a linear kernel
svm_linear_bow = SVC(kernel='linear', C=1)

# Train the model
svm_linear_bow.fit(X_train_bow, y_train_bow)

# Make predictions on the test set
y_pred_bow = svm_linear_bow.predict(X_test_bow)

# Calculate accuracy
accuracy_bow = accuracy_score(y_test_bow, y_pred_bow)
# Generate classification report
classification_rep_bow = classification_report(y_test_bow, y_pred_bow, output_dict=True)

# Convert classification report into DataFrame
classification_report_bow_df = pd.DataFrame(classification_rep_bow).transpose()

accuracy_bow, classification_report_bow_df


(0.979372197309417,
               precision    recall  f1-score      support
 ham            0.980612  0.995855  0.988175   965.000000
 spam           0.970370  0.873333  0.919298   150.000000
 accuracy       0.979372  0.979372  0.979372     0.979372
 macro avg      0.975491  0.934594  0.953737  1115.000000
 weighted avg   0.979234  0.979372  0.978909  1115.000000)

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'data' is your DataFrame and 'text' and 'label' are column names

# Load the dataset (if not already loaded)
data_path = 'spam.csv'  # Update this path if necessary
data = pd.read_csv(data_path, encoding='latin-1')
data = data[['v1', 'v2']]  # Keep only the necessary columns
data.columns = ['label', 'text']  # Rename the columns for convenience

# Encode the labels
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Generate TF-IDF features (assuming this step is needed here)
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(data['text'])

# Instantiate the scaler
scaler = StandardScaler(with_mean=False)  # Set with_mean to False for sparse matrix compatibility

# Create an instance of TruncatedSVD
svd = TruncatedSVD(n_components=100)

# Apply SVD on the TF-IDF features
svd_features = svd.fit_transform(tfidf_features)

# Scale the data after applying SVD
svd_features_scaled = scaler.fit_transform(svd_features)

# Split the data into training and testing sets
X_train_svd, X_test_svd, y_train_svd, y_test_svd = train_test_split(svd_features_scaled, data['label'], test_size=0.2, random_state=42)

# Initialize and train the SVM model
svm_linear_svd = SVC(kernel='linear', C=1)
svm_linear_svd.fit(X_train_svd, y_train_svd)

# Predict on the test set
y_pred_svd = svm_linear_svd.predict(X_test_svd)

# Evaluate the model's performance
accuracy_svd = accuracy_score(y_test_svd, y_pred_svd)
classification_rep_svd = classification_report(y_test_svd, y_pred_svd, output_dict=True)

# Convert classification report into DataFrame
classification_report_svd_df = pd.DataFrame(classification_rep_svd).transpose()

# Output the accuracy and classification report DataFrame
accuracy_svd, classification_report_svd_df


(0.968609865470852,
               precision    recall  f1-score     support
 0              0.979381  0.984456  0.981912   965.00000
 1              0.896552  0.866667  0.881356   150.00000
 accuracy       0.968610  0.968610  0.968610     0.96861
 macro avg      0.937967  0.925561  0.931634  1115.00000
 weighted avg   0.968238  0.968610  0.968384  1115.00000)

In [36]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import TruncatedSVD

# Assuming 'data' is your DataFrame with 'text' and 'label' columns
# and necessary preprocessing steps (like label encoding) have been done

# Feature extraction for each model (TF-IDF, BoW, and SVD)
tfidf_vectorizer = TfidfVectorizer()
bow_vectorizer = CountVectorizer()
svd_transformer = TruncatedSVD(n_components=100)
scaler = StandardScaler(with_mean=False)

tfidf_features = tfidf_vectorizer.fit_transform(data['text'])
bow_features = bow_vectorizer.fit_transform(data['text'])
svd_features = scaler.fit_transform(svd_transformer.fit_transform(tfidf_features))

# Split the data into training and testing sets for each feature set
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, data['label'], test_size=0.2, random_state=42)
X_train_bow, X_test_bow, _ , _ = train_test_split(bow_features, data['label'], test_size=0.2, random_state=42)
X_train_svd, X_test_svd, _ , _ = train_test_split(svd_features, data['label'], test_size=0.2, random_state=42)

# Train SVM models
svm_linear = SVC(kernel='linear', C=1).fit(X_train, y_train)
svm_linear_bow = SVC(kernel='linear', C=1).fit(X_train_bow, y_train)
svm_linear_svd = SVC(kernel='linear', C=1).fit(X_train_svd, y_train)

# Make predictions using all three models
y_pred_tfidf = svm_linear.predict(X_test)
y_pred_bow = svm_linear_bow.predict(X_test_bow)
y_pred_svd = svm_linear_svd.predict(X_test_svd)

# Ensemble predictions using majority voting
ensemble_predictions = []
for tfidf, bow, svd in zip(y_pred_tfidf, y_pred_bow, y_pred_svd):
    votes = [tfidf, bow, svd]
    majority_vote = max(set(votes), key=votes.count)
    ensemble_predictions.append(majority_vote)

# Evaluate the ensemble model's performance
ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)
ensemble_classification_report = classification_report(y_test, ensemble_predictions)

ensemble_accuracy, ensemble_classification_report


(0.979372197309417,
 '              precision    recall  f1-score   support\n\n           0       0.98      1.00      0.99       965\n           1       0.98      0.87      0.92       150\n\n    accuracy                           0.98      1115\n   macro avg       0.98      0.93      0.95      1115\nweighted avg       0.98      0.98      0.98      1115\n')

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Compute the decision function scores for each model
score_tfidf = svm_linear.decision_function(X_test)
score_bow = svm_linear_bow.decision_function(X_test_bow)
score_svd = svm_linear_svd_array.decision_function(X_test_svd_array)

# Compute FPR, TPR, and AUC for each model
fpr_tfidf, tpr_tfidf, _ = roc_curve(y_test, score_tfidf, pos_label='spam')
fpr_bow, tpr_bow, _ = roc_curve(y_test_bow, score_bow, pos_label='spam')
fpr_svd, tpr_svd, _ = roc_curve(y_test_svd_array, score_svd, pos_label='spam')
roc_auc_tfidf = auc(fpr_tfidf, tpr_tfidf)
roc_auc_bow = auc(fpr_bow, tpr_bow)
roc_auc_svd = auc(fpr_svd, tpr_svd)

In [None]:
# Function to plot ROC curve for a given model
def plot_roc_curve(fpr, tpr, roc_auc, model_name):
    plt.figure(figsize=(7, 5))
    plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic (ROC) Curve - {model_name}')
    plt.legend(loc="lower right")
    plt.show()

# Plot ROC curve for each model
plot_roc_curve(fpr_tfidf, tpr_tfidf, roc_auc_tfidf, "TF-IDF")
plot_roc_curve(fpr_bow, tpr_bow, roc_auc_bow, "BoW")
plot_roc_curve(fpr_svd, tpr_svd, roc_auc_svd, "SVD")


In [2]:
# Compute ensemble scores based on the number of models predicting "spam"
ensemble_scores = []
for tfidf, bow, svd in zip(score_tfidf, score_bow, score_svd):
    score = sum([1 if model_score > 0 else 0 for model_score in [tfidf, bow, svd]])
    ensemble_scores.append(score)

# Compute FPR, TPR, and AUC for the ensemble
fpr_ensemble, tpr_ensemble, _ = roc_curve(y_test, ensemble_scores, pos_label='spam')
roc_auc_ensemble = auc(fpr_ensemble, tpr_ensemble)

# Plot the ROC curve for the ensemble
plot_roc_curve(fpr_ensemble, tpr_ensemble, roc_auc_ensemble, "Ensemble")


NameError: name 'score_tfidf' is not defined

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenize your text data using the existing tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['message'])

# Find the number of unique words (vocabulary size)
num_words = len(tokenizer.word_index) + 1  # +1 accounts for the special padding or out-of-vocabulary token
num_words



NameError: name 'data' is not defined

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, GlobalAveragePooling1D, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping

bow_vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=5000)

# Fit and transform the data
bow_features = bow_vectorizer.fit_transform(data['message'])

# Split the data into training and testing sets
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(bow_features.toarray(), data['label'], test_size=0.2, random_state=42)

# Encoding the labels ('ham' and 'spam') into numerical format (0 and 1)
label_encoder = LabelEncoder()

# Fit and transform the labels for both training and test data
y_train_bow = label_encoder.fit_transform(y_train_bow)
y_test_bow = label_encoder.transform(y_test_bow)

# Convert the encoded labels into float type for compatibility with TensorFlow/Keras
y_train_bow = y_train_bow.astype(float)
y_test_bow = y_test_bow.astype(float)
num_words = 5000  # Example value, adjust based on your data
sequence_length = 3956
# Define the neural network model with attention
input_text = Input(shape=(sequence_length,))  # Adjust sequence_length based on your data
embedding_layer = Embedding(input_dim=num_words, output_dim=100)(input_text)  # Adjust embedding_dim
attention_layer = Attention()([embedding_layer, embedding_layer])  # Apply self-attention
pooled_attention = GlobalAveragePooling1D()(attention_layer)  # Pooling layer to summarize the attention weights

# Further processing with dense layers
dense1 = Dense(128, activation='relu')(pooled_attention)
dense2 = Dense(64, activation='relu')(dense1)

# Output layer with sigmoid activation function for binary classification
output = Dense(1, activation='sigmoid')(dense2)

# Define the model
model = Model(inputs=input_text, outputs=output)

from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_bow), y=y_train_bow)
class_weight_dict = dict(enumerate(class_weights))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(
    X_train_bow, y_train_bow, 
    epochs=50,  # Increased the number of epochs
    batch_size=32, 
    class_weight=class_weight_dict, 
    validation_split=0.1, 
    callbacks=[early_stopping]
)



# Make predictions on the test set
predictions = model.predict(X_test_bow).ravel()

# Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test_bow, predictions)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Calculate the confusion matrix to check false negatives and false positives
conf_matrix = confusion_matrix(y_test_bow, (predictions > 0.5).astype(int))
tn, fp, fn, tp = conf_matrix.ravel()

print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")


In [None]:
import os
# Set the GPU you want to use (e.g., "0" or "1" or "0,1" for multiple GPUs)
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
