In [None]:
import json

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, GlobalMaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

# Reproducibility: one seed drives the train/test split and, via
# set_random_seed, Python's `random`, NumPy and TensorFlow (weight init,
# batch shuffling). Full determinism on GPU may need extra TF settings.
SEED = 42
set_random_seed(SEED)

# Load the dataset (tab-separated; must provide the 'text', 'label' and
# 'Pattern Category' columns read below)
file_path = '/content/dataset.tsv'
data = pd.read_csv(file_path, delimiter='\t')

# Constants
max_len = 100  # Maximum length of text sequences
max_words = 10000  # Maximum number of words to consider in the vocabulary

# Split the data into features and labels
X = data['text'].values
y_binary = data['label'].values  # Binary classification labels
y_category = data['Pattern Category'].values  # Multi-class classification labels

# Encode the multi-class labels as integer ids (fitting on all rows only
# fixes the set of class names; it does not look at any text)
label_encoder = LabelEncoder()
y_category_encoded = label_encoder.fit_transform(y_category)

# Choose the train/test rows BEFORE any text statistics are learned, so that
# nothing derived from the test texts can influence preprocessing.
train_idx, test_idx = train_test_split(range(len(X)), test_size=0.2, random_state=SEED)

# Tokenize the text. The vocabulary is fitted on the training texts only;
# words that occur only in test texts are dropped by texts_to_sequences,
# which is how genuinely unseen input will be treated at inference time.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X[train_idx])
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=max_len)

# Split the data using the row indices chosen above
X_train, X_test = X_pad[train_idx], X_pad[test_idx]
y_binary_train, y_binary_test = y_binary[train_idx], y_binary[test_idx]
y_category_train, y_category_test = y_category_encoded[train_idx], y_category_encoded[test_idx]

# Model Creation
# Shared trunk: token ids -> 50-d embeddings -> LSTM output at every time
# step (return_sequences=True) -> element-wise max over the time steps.
input_text = Input(shape=(max_len,))
embedding = Embedding(input_dim=max_words, output_dim=50)(input_text)
lstm = LSTM(units=64, return_sequences=True)(embedding)
global_max_pool = GlobalMaxPooling1D()(lstm)

# Head 1 - binary classification output: a single sigmoid unit fed
# directly from the pooled features.
binary_pred = Dense(
    units=1,
    activation='sigmoid',
    name='binary_output',
)(global_max_pool)

# Head 2 - multi-class classification output: one hidden ReLU layer, then a
# softmax with one unit per class known to the label encoder.
category_dense = Dense(units=64, activation='relu')(global_max_pool)
category_pred = Dense(
    units=len(label_encoder.classes_),
    activation='softmax',
    name='category_output',
)(category_dense)

model = Model(inputs=input_text, outputs=[binary_pred, category_pred])

# Compile the model
# Losses are keyed by output-layer name; the sparse variant is used for the
# category head because its targets are integer class ids, not one-hot rows.
output_losses = {
    'binary_output': 'binary_crossentropy',
    'category_output': 'sparse_categorical_crossentropy',
}
model.compile(optimizer='adam', loss=output_losses, metrics=['accuracy'])

# Model Training
# Targets are keyed by output-layer name so each head receives its own labels.
train_targets = {'binary_output': y_binary_train, 'category_output': y_category_train}
test_targets = {'binary_output': y_binary_test, 'category_output': y_category_test}

# Keep the returned History so per-epoch losses/accuracies can be inspected
# or plotted afterwards. validation_split=0.2 makes Keras hold out the last
# 20% of the training arrays for validation.
history = model.fit(X_train, train_targets, epochs=15, validation_split=0.2)

# Evaluation
# return_dict=True labels every value (total loss, per-output loss and
# per-output accuracy) instead of returning a bare list of floats.
model.evaluate(X_test, test_targets, return_dict=True)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


[0.7564239501953125,
 0.21794772148132324,
 0.5384761691093445,
 0.9216101765632629,
 0.8622881174087524]

In [None]:
# Persist the trained network. The original HDF5 file name is kept so that
# anything already loading 'Dark_Pattern_buster.h5' keeps working.
# NOTE(review): the truncated warning in this cell's output is presumably
# Keras's notice that HDF5 is a legacy format - consider migrating to the
# native '.keras' format once downstream loaders are updated.
model.save('Dark_Pattern_buster.h5')

# The network alone is not enough for inference: new text must be tokenized
# with the same vocabulary, and predicted class ids must be mapped back to
# category names. Save both next to the model.
with open('Dark_Pattern_buster_tokenizer.json', 'w', encoding='utf-8') as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())

with open('Dark_Pattern_buster_labels.json', 'w', encoding='utf-8') as labels_file:
    # Position i in this list is the category name for class id i.
    json.dump(label_encoder.classes_.tolist(), labels_file)

  saving_api.save_model(
