In [7]:
# Import the required libraries
# Import the required libraries
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from imblearn.under_sampling import RandomUnderSampler


# Load the preprocessed data
data1=pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=100)

# Undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Calculate class weights
class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

#class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)



# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(32))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy = model.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


Epoch 1/20
Epoch 2: val_loss improved from 0.39962 to 0.39810, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_loss did not improve from 0.39810
Epoch 4/20
Epoch 4: val_loss did not improve from 0.39810
Epoch 5/20
Epoch 5: val_loss did not improve from 0.39810
Epoch 5: early stopping
Test accuracy: 0.8238193988800049


In [42]:
# Load the pickled file
with open('/kaggle/input/models/Lstm_with_tuning_1.pkl', 'rb') as f:
    l_model = pickle.load(f)

Keras model archive loading:
File Name                                             Modified             Size
config.json                                    2023-04-22 06:17:14         3345
variables.h5                                   2023-04-22 06:17:14     15682264
metadata.json                                  2023-04-22 06:17:14           64
Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers
......dense
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
......embedding
.........vars
............0
......lstm
.........cell
............vars
...............0
...............1
...............2
.........vars
......lstm_1
.........cell
............vars
...............0
...............1
...............2
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.

In [60]:
# Load the pickled file
with open('/kaggle/input/models-b/biLstm_without_tuning.pkl', 'rb') as f:
    b_model = pickle.load(f)

Keras model archive loading:
File Name                                             Modified             Size
config.json                                    2023-04-22 12:00:12         3636
variables.h5                                   2023-04-22 12:00:12     16040536
metadata.json                                  2023-04-22 12:00:12           64
Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers
......bidirectional
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......bidirectional_1
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............ce

In [62]:
y_predb=b_model.predict(x_test3)



In [78]:
testba1=pd.read_csv('/kaggle/input/unbalanced/testu.csv')

In [79]:
testba1.dropna(inplace=True)

In [80]:
# Define the columns for which to make predictions
cols_to_predict = [col for col in testba1.columns if col not in ['comment_text', 'processed_comment_text', 'comment_text_processed','severe_toxicity','obscene','sexual_explicit','identity_attack','insult','threat','other','gender','religion','race','disability']]

# Add predicted values to the test dataset
for i, col in enumerate(cols_to_predict):
    testba1[col + '_pred'] = y_pred_ba[:, i]

# Export the test dataset with predicted values to a CSV file
testba1.to_csv('test_with_predictions_decimals_bilstm_attention_single.csv', index=False)

In [81]:
pred_ba=pd.read_csv('/kaggle/working/test_with_predictions_decimals_bilstm_attention_single.csv')

In [82]:
pred_ba.head()

Unnamed: 0,comment_text,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,other,gender,religion,race,disability,Target,processed_comment_text,comment_text_processed,Target_pred
0,So between the 2 civil lawyers going for the j...,0,0,0,0,0,0,1,0,0,0,0,0,"['civil', 'lawyer', 'going', 'job', 'one', 'st...",civil lawyer going job one stellar reputation ...,0.065515
1,Hope they have bullet proof glass and bomb bar...,0,0,0,0,0,0,1,0,0,0,0,0,"['hope', 'bullet', 'proof', 'glass', 'bomb', '...",hope bullet proof glass bomb barrier well armed,0.488264
2,"""...They realize the inter-connectedness betwe...",0,0,0,0,0,0,1,0,0,0,0,0,"['they', 'realize', 'interconnectedness', 'nat...",they realize interconnectedness nation world n...,0.043872
3,"I'm a Raider fan, but I agree with Finley. Th...",0,0,0,0,0,0,1,0,0,0,0,0,"['raider', 'fan', 'agree', 'finley', 'these', ...",raider fan agree finley these player sit anthe...,0.203256
4,I voted for Trump and it was not for any reaso...,0,0,0,0,0,0,1,0,0,0,0,0,"['voted', 'trump', 'reason', 'article', 'faceb...",voted trump reason article facebook what mains...,0.07317


In [48]:
test4

Unnamed: 0,comment_text,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,other,gender,religion,race,disability,Target,processed_comment_text,comment_text_processed,Target_pred
0,So between the 2 civil lawyers going for the j...,0,0,0,0,0,0,1,0,0,0,0,0,"['civil', 'lawyer', 'going', 'job', 'one', 'st...",civil lawyer going job one stellar reputation ...,0.203585
1,Hope they have bullet proof glass and bomb bar...,0,0,0,0,0,0,1,0,0,0,0,0,"['hope', 'bullet', 'proof', 'glass', 'bomb', '...",hope bullet proof glass bomb barrier well armed,0.358292
2,"""...They realize the inter-connectedness betwe...",0,0,0,0,0,0,1,0,0,0,0,0,"['they', 'realize', 'interconnectedness', 'nat...",they realize interconnectedness nation world n...,0.029618
3,"I'm a Raider fan, but I agree with Finley. Th...",0,0,0,0,0,0,1,0,0,0,0,0,"['raider', 'fan', 'agree', 'finley', 'these', ...",raider fan agree finley these player sit anthe...,0.499466
4,I voted for Trump and it was not for any reaso...,0,0,0,0,0,0,1,0,0,0,0,0,"['voted', 'trump', 'reason', 'article', 'faceb...",voted trump reason article facebook what mains...,0.020921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134051,Of course the flyer was male. Males are bette...,0,0,0,0,0,0,1,1,0,0,0,0,"['course', 'flyer', 'male', 'males', 'better',...",course flyer male males better suicide female ...,0.857142
134052,you are spot on - i wouldn't want to be a dec...,0,0,0,0,0,0,1,0,0,0,0,0,"['spot', 'would', 'nt', 'want', 'decider', 'pe...",spot would nt want decider people cop would ha...,0.313334
134053,Sorry - but I think the DNC has already establ...,0,0,0,0,0,0,1,0,0,0,0,0,"['sorry', 'think', 'dnc', 'already', 'establis...",sorry think dnc already established position t...,0.818399
134054,"Hi, Amira...been ages since your last column, ...",0,0,0,0,0,0,1,0,0,0,0,0,"['amira', 'age', 'since', 'last', 'column', 'p...",amira age since last column post truth environ...,0.280629


In [8]:
test1= pd.read_csv('/kaggle/input/unbalanced/testu.csv')

In [10]:
test1.dropna(inplace=True)

In [11]:
x_test=test1['comment_text_processed']

In [12]:
y_test=test1['Target']

In [13]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_test)
x_test = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(x_test, maxlen=100)

In [14]:
model.evaluate(x_test,y_test)



[0.7243576049804688, 0.678726315498352]

In [16]:
# Save your model
import pickle
filename = 'Lstm_without_tuning.pkl'
pickle.dump(model, open(filename, 'wb'))

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
......embedding
.........vars
............0
......lstm
.........cell
............vars
...............0
...............1
...............2
.........vars
......lstm_1
.........cell
............vars
...............0
...............1
...............2
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........17
.........18
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2023-04-22 05:3

In [37]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from imblearn.under_sampling import RandomUnderSampler


# Load the preprocessed data
#data1=pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
#data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Calculate class weights
class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the LSTM model
model1 = Sequential()
model1.add(Embedding(input_dim=10000, output_dim=128))
model1.add(LSTM(32, return_sequences=True))
model1.add(Dropout(0.5))
#model.add(LSTM(32, return_sequences=True))
model1.add(Dropout(0.5))
model1.add(LSTM(16))
model1.add(Dropout(0.5))
model1.add(Dense(1, activation='sigmoid'))

# Compile the model
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model1.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model1.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy = model1.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


Epoch 1/20
Epoch 2: val_loss improved from 0.39984 to 0.39835, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_loss did not improve from 0.39835
Epoch 4/20
Epoch 4: val_loss did not improve from 0.39835
Epoch 5/20
Epoch 5: val_loss did not improve from 0.39835
Epoch 5: early stopping
Test accuracy: 0.8255447149276733


In [27]:
test3= pd.read_csv('/kaggle/input/unbalanced/testu.csv')

In [28]:
test3.dropna(inplace=True)

In [29]:
x_test2=test3['comment_text_processed']

In [30]:
y_test2=test3['Target']

In [31]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_test2)
x_test2 = tokenizer.texts_to_sequences(x_test2)
x_test2 = pad_sequences(x_test2, maxlen=100)

In [39]:
model1.evaluate(x_test3,y_test3)



[0.7149592041969299, 0.6761287450790405]

In [36]:
# Save your model
import pickle
filename = 'Lstm_with_tuning.pkl'
pickle.dump(model, open(filename, 'wb'))

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
......dropout_2
.........vars
......embedding
.........vars
............0
......lstm
.........cell
............vars
...............0
...............1
...............2
.........vars
......lstm_1
.........cell
............vars
...............0
...............1
...............2
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........17
.........18
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                     

In [34]:
# Import the required libraries
# Import the required libraries
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from imblearn.under_sampling import RandomUnderSampler


# Load the preprocessed data
#data1=pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
#data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Calculate class weights
class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

#class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)



# Build the LSTM model
model1 = Sequential()
model1.add(Embedding(input_dim=10000, output_dim=128))
model1.add(LSTM(32, return_sequences=True))
model1.add(Dropout(0.5))
model1.add(LSTM(16))
model1.add(Dropout(0.5))
model1.add(Dense(1, activation='sigmoid'))

# Compile the model
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model1.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model1.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy = model1.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.39612, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss did not improve from 0.39612
Epoch 3/20
Epoch 3: val_loss did not improve from 0.39612
Epoch 4/20
Epoch 4: val_loss did not improve from 0.39612
Epoch 4: early stopping
Test accuracy: 0.8284038305282593


In [3]:
test4= pd.read_csv('/kaggle/input/unbalanced/testu.csv')

In [4]:
test4.dropna(inplace=True)

In [5]:
x_test3=test4['comment_text_processed']

In [6]:
y_test3=test4['Target']

In [7]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_test3)
x_test3 = tokenizer.texts_to_sequences(x_test3)
x_test3 = pad_sequences(x_test3, maxlen=145)


In [40]:
model1.evaluate(x_test3,y_test3)



[0.6547689437866211, 0.7034103870391846]

In [41]:
#predict
y_pred=model1.predict(x_test3)



In [49]:
#predict
model.evaluate(x_test3,y_test3)



[0.6783618330955505, 0.6947071552276611]

In [43]:
# Calculate F1 score
from sklearn.metrics import f1_score
f1 = f1_score(y_test3, y_pred.round(), average='macro')
print('F1 score:', f1)

F1 score: 0.5290231841155956


In [47]:
# Calculate F1 score
from sklearn.metrics import f1_score
f1 = f1_score(y_test3, y_pred1.round(), average='macro')
print('F1 score:', f1)

F1 score: 0.5302414300997712


In [50]:
# Save your model
import pickle
filename = 'Lstm_with_tuning_1.pkl'
pickle.dump(model1, open(filename, 'wb'))

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
......embedding
.........vars
............0
......lstm
.........cell
............vars
...............0
...............1
...............2
.........vars
......lstm_1
.........cell
............vars
...............0
...............1
...............2
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........17
.........18
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2023-04-22 06:1

In [85]:
# Import the required libraries
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from imblearn.under_sampling import RandomUnderSampler


# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Download the GloVe word embeddings
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove.6B.zip

# Parse the GloVe word embeddings file
embedding_dict = {}
with open('glove.6B.100d.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = vector
f.close()

# Create an embedding matrix for the tokenizer
num_words = min(10000, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, 100))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embedding_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Calculate class weights
class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the LSTM model
model3 = Sequential()
model3.add(Embedding(num_words, 100, weights=[embedding_matrix], input_length=145, trainable=False))
model3.add(LSTM(32, return_sequences=True))
model3.add(Dropout(0.5))
model3.add(LSTM(32, return_sequences=True))
model3.add(Dropout(0.5))
model3.add(LSTM(16))
model3.add(Dropout(0.5))
model3.add(Dense(1, activation='sigmoid'))

# Compile the model
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model3.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model3.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy = model3.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.44366, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss improved from 0.44366 to 0.41172, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_loss improved from 0.41172 to 0.40369, saving model to best_model.h5
Epoch 4/20
Epoch 4: val_loss improved from 0.40369 to 0.39885, saving model to best_model.h5
Epoch 5/20
Epoch 5: val_loss improved from 0.39885 to 0.39632, saving model to best_model.h5
Epoch 6/20
Epoch 6: val_loss did not improve from 0.39632
Epoch 7/20
Epoch 7: val_loss improved from 0.39632 to 0.39351, saving model to best_model.h5
Epoch 8/20
Epoch 8: val_loss improved from 0.39351 to 0.39254, saving model to best_model.h5
Epoch 9/20
Epoch 9: val_loss improved from 0.39254 to 0.39119, saving model to best_model.h5
Epoch 10/20
Epoch 10: val_loss did not improve from 0.39119
Epoch 11/20
Epoch 11: val_loss did not improve from 0.39119
Epoch 12/20
Epoch 12: val_loss did not improve from 0.39119
Epoch 12: early stoppi

In [None]:
# Import the required libraries
# Load pre-trained GloVe embeddings
#from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from gensim.models import KeyedVectors

from imblearn.under_sampling import RandomUnderSampler

# Download pre-trained GloVe embeddings
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove.6B.zip -d glove

# Load the preprocessed data
data1=pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=300)


glove_model = KeyedVectors.load_word2vec_format('glove.6B.300d.txt', binary=False, no_header=True)


# Create an embedding matrix for the tokenizer vocabulary
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 300))
for word, i in tokenizer.word_index.items():
    if word in glove_model:
        embedding_matrix[i] = glove_model[word]

# Undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Calculate class weights
class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the LSTM model with pre-trained GloVe embeddings
model3 = Sequential()
model3.add(Embedding(len(tokenizer.word_index) + 1, 300, weights=[embedding_matrix], input_length=300, trainable=False))
model3.add(LSTM(128, return_sequences=True))
model3.add(Dropout(0.5))
model3.add(LSTM(64))
model3.add(Dropout(0.5))
model3.add(Dense(1, activation='sigmoid'))

# Compile the model
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model3.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model3.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy = model3.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.40235, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss improved from 0.40235 to 0.39319, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_loss improved from 0.39319 to 0.39099, saving model to best_model.h5
Epoch 4/20
Epoch 4: val_loss improved from 0.39099 to 0.39056, saving model to best_model.h5
Epoch 5/20
Epoch 5: val_loss did not improve from 0.39056
Epoch 6/20
Epoch 6: val_loss did not improve from 0.39056
Epoch 7/20
   3/2536 [..............................] - ETA: 1:48 - loss: 0.2594 - accuracy: 0.8958

In [5]:
test7= pd.read_csv('/kaggle/input/unbalanced/testu.csv')
test7.dropna(inplace=True)

In [6]:
x_test6=test7['comment_text_processed']

In [7]:
y_test6=test7['Target']

In [8]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_test6)
x_test6 = tokenizer.texts_to_sequences(x_test6)
x_test6 = pad_sequences(x_test6, maxlen=145)

In [23]:
model.evaluate(x_test6,y_test6)



[0.6751114130020142, 0.7115090489387512]

In [126]:
# Import the required libraries
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.utils.class_weight import compute_class_weight
from imblearn.under_sampling import RandomUnderSampler

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Calculate class weights
class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the bidirectional LSTM model
model2 = Sequential()
model2.add(Embedding(input_dim=10000, output_dim=128))
model2.add(Bidirectional(LSTM(256, return_sequences=True)))
model2.add(Dropout(0.5))
model2.add(Bidirectional(LSTM(128,return_sequences=True)))
model2.add(Dropout(0.5))
model2.add(Bidirectional(LSTM(64)))
model2.add(Dense(1, activation='sigmoid'))

# Compile the model
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model2.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model2.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model2.load_weights('best_model2.h5')

# Evaluate the model on the testing set
loss, accuracy = model2.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.40280, saving model to best_model2.h5
Epoch 2/20
Epoch 2: val_loss did not improve from 0.40280
Epoch 3/20
Epoch 3: val_loss did not improve from 0.40280
Epoch 4/20
Epoch 4: val_loss did not improve from 0.40280
Epoch 4: early stopping
Test accuracy: 0.8228827714920044


In [58]:
# Import the required libraries
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Oversample the minority class and undersample the majority class
#smote = SMOTE(random_state=42)
rus = RandomUnderSampler(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X, y)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the bidirectional LSTM model
model2 = Sequential()
model2.add(Embedding(input_dim=10000, output_dim=128))
model2.add(Bidirectional(LSTM(32, return_sequences=True)))
model2.add(Dropout(0.5))
#model2.add(Bidirectional(LSTM(16, return_sequences=True)))
#model2.add(Dropout(0.5))
model2.add(Bidirectional(LSTM(16)))
model2.add(Dropout(0.5))
model2.add(Dense(1, activation='sigmoid'))

# Compile the model
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model2.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model2.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model2.load_weights('best_model2.h5')

# Evaluate the model on the testing set
loss, accuracy = model2.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.40654, saving model to best_model2.h5
Epoch 2/20
Epoch 2: val_loss improved from 0.40654 to 0.40521, saving model to best_model2.h5
Epoch 3/20
Epoch 3: val_loss did not improve from 0.40521
Epoch 4/20
Epoch 4: val_loss did not improve from 0.40521
Epoch 5/20
Epoch 5: val_loss did not improve from 0.40521
Epoch 5: early stopping
Test accuracy: 0.8250517845153809


In [59]:
model2.evaluate(x_test3,y_test3)



[0.7503971457481384, 0.6873549222946167]

In [18]:

# Save your model
import pickle
filename = 'biLstm_without_tuning.pkl'
pickle.dump(model2, open(filename, 'wb'))


Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......bidirectional
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......bidirectional_1
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......dense
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
......embedding
.........vars
............0
...metrics

In [17]:
import numpy as np
import pandas as pd
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, TimeDistributed, Activation, Flatten, RepeatVector, Permute, Multiply, Lambda, Input
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from keras import backend as K

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Oversample the minority class and undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the bidirectional LSTM model with attention layer
inputs = Input(shape=(145,))
embedding_layer = Embedding(input_dim=10000, output_dim=128)(inputs)
lstm_layer1 = Bidirectional(LSTM(32, return_sequences=True))(embedding_layer)
dropout_layer1 = Dropout(0.5)(lstm_layer1)
lstm_layer2 = Bidirectional(LSTM(16, return_sequences=True))(dropout_layer1)
dropout_layer2 = Dropout(0.5)(lstm_layer2)
attention = Dense(1, activation='relu')(dropout_layer2)
attention = Flatten()(attention)
attention = Activation('softmax')(attention)
attention = RepeatVector(32)(attention)
attention = Permute([2, 1])(attention)
weighted = Multiply()([dropout_layer2, attention])
output_layer = Dense(1, activation='sigmoid')(weighted)

model = Model(inputs=inputs, outputs=output_layer)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy = model.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.57035, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss improved from 0.57035 to 0.49731, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_loss improved from 0.49731 to 0.45897, saving model to best_model.h5
Epoch 4/20
Epoch 4: val_loss improved from 0.45897 to 0.45594, saving model to best_model.h5
Epoch 5/20
Epoch 5: val_loss improved from 0.45594 to 0.44733, saving model to best_model.h5
Epoch 6/20
Epoch 6: val_loss improved from 0.44733 to 0.43868, saving model to best_model.h5
Epoch 7/20
Epoch 7: val_loss did not improve from 0.43868
Epoch 8/20
Epoch 8: val_loss improved from 0.43868 to 0.43540, saving model to best_model.h5
Epoch 9/20
Epoch 9: val_loss did not improve from 0.43540
Epoch 10/20
Epoch 10: val_loss did not improve from 0.43540
Epoch 11/20
Epoch 11: val_loss did not improve from 0.43540
Epoch 11: early stopping
Test accuracy: 0.8140367269515991


In [1]:
!pip install keras-self-attention


Collecting keras-self-attention
  Downloading keras-self-attention-0.51.0.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: keras-self-attention
  Building wheel for keras-self-attention (setup.py) ... [?25ldone
[?25h  Created wheel for keras-self-attention: filename=keras_self_attention-0.51.0-py3-none-any.whl size=18913 sha256=016e1c364377e04e7eebe0483d806b3b6923bbf2958e7143a43d9736207e2605
  Stored in directory: /root/.cache/pip/wheels/cb/26/00/2d79e29156bddf85b6c2bccecf43fcb024fb935e3d7a933684
Successfully built keras-self-attention
Installing collected packages: keras-self-attention
Successfully installed keras-self-attention-0.51.0
[0m

In [22]:
import numpy as np
import pandas as pd
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Dropout
from keras.layers import Embedding, GlobalAveragePooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from keras import backend as K
from keras_self_attention import SeqSelfAttention

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Oversample the minority class and undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the transformer model with self-attention
inputs = Input(shape=(145,))
embedding_layer = Embedding(input_dim=10000, output_dim=128)(inputs)
lstm_layer1 = Dropout(0.5)(embedding_layer)
lstm_layer2 = SeqSelfAttention(attention_activation='sigmoid')(lstm_layer1)
pooling_layer = GlobalAveragePooling1D()(lstm_layer2)
output_layer = Dense(1, activation='sigmoid')(pooling_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy = model.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


  f"The initializer {self.__class__.__name__} is unseeded "


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.46652, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss improved from 0.46652 to 0.43226, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_loss did not improve from 0.43226
Epoch 4/20
Epoch 4: val_loss did not improve from 0.43226
Epoch 5/20
Epoch 5: val_loss did not improve from 0.43226
Epoch 5: early stopping
Test accuracy: 0.8172138333320618


In [26]:
import numpy as np
import pandas as pd
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, TimeDistributed, Activation, Flatten, RepeatVector, Permute, Multiply, Lambda, Input
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from keras import backend as K
from keras_self_attention import SeqSelfAttention

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Oversample the minority class and undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the bidirectional LSTM model with attention layer
inputs = Input(shape=(145,))
embedding_layer = Embedding(input_dim=10000, output_dim=128)(inputs)
lstm_layer1 = Bidirectional(LSTM(32, return_sequences=True))(embedding_layer)
attention = SeqSelfAttention(attention_activation='relu')(lstm_layer1)
dropout_layer1 = Dropout(0.5)(attention)
lstm_layer2 = Bidirectional(LSTM(16))(dropout_layer1)
dropout_layer2 = Dropout(0.5)(lstm_layer2)
pooling_layer = GlobalAveragePooling1D()(attention)
output_layer = Dense(1, activation='sigmoid')(pooling_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy = model.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


  f"The initializer {self.__class__.__name__} is unseeded "


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.40114, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss improved from 0.40114 to 0.40084, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_loss did not improve from 0.40084
Epoch 4/20
Epoch 4: val_loss did not improve from 0.40084
Epoch 5/20
Epoch 5: val_loss did not improve from 0.40084
Epoch 5: early stopping
Test accuracy: 0.8256433010101318


In [27]:
model.evaluate(x_test6,y_test6)



[0.6820387244224548, 0.7000440359115601]

In [29]:

import pickle
filename = 'biLstm+attention_without_tuning.pkl'
pickle.dump(model, open(filename, 'wb'))


Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......bidirectional
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......dense
.........vars
............0
............1
......embedding
.........vars
............0
......global_average_pooling1d
.........vars
......input_layer
.........vars
......seq_self_attention
.........vars
............0
............1
............2
............3
............4
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16


In [31]:
import numpy as np
import pandas as pd
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, TimeDistributed, Activation, Flatten, RepeatVector, Permute, Multiply, Lambda, Input, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from keras import backend as K
from keras_self_attention import SeqSelfAttention

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Oversample the minority class and undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the bidirectional LSTM model with attention layer
# Build the bidirectional LSTM model with attention layer
inputs = Input(shape=(145,))
embedding_layer = Embedding(input_dim=10000, output_dim=128)(inputs)
lstm_layer1 = Bidirectional(LSTM(32, return_sequences=True))(embedding_layer)
attention = SeqSelfAttention(attention_activation='relu')(lstm_layer1)
dropout_layer1 = Dropout(0.5)(attention)
lstm_layer2 = Bidirectional(LSTM(16))(dropout_layer1)
dropout_layer2 = Dropout(0.5)(lstm_layer2)
pooling_layer = GlobalAveragePooling1D()(attention)
batch_norm_layer = BatchNormalization()(pooling_layer)
output_layer = Dense(1, activation='sigmoid')(batch_norm_layer)

model4 = Model(inputs=inputs, outputs=output_layer)

# Compile the model
model4.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model4.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model4.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy = model4.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


  f"The initializer {self.__class__.__name__} is unseeded "


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.40129, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss improved from 0.40129 to 0.39823, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_loss did not improve from 0.39823
Epoch 4/20
Epoch 4: val_loss did not improve from 0.39823
Epoch 5/20
Epoch 5: val_loss did not improve from 0.39823
Epoch 5: early stopping
Test accuracy: 0.8247559666633606


In [34]:
model.evaluate(x_test6,y_test6)



[0.6810097694396973, 0.7067468762397766]

In [72]:
import numpy as np
import pandas as pd
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, TimeDistributed, Activation, Flatten, RepeatVector, Permute, Multiply, Lambda, Input, Reshape
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from keras import backend as K
from keras_self_attention import SeqSelfAttention

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Oversample the minority class and undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the bidirectional LSTM model with attention layer
inputs = Input(shape=(145,))
embedding_layer = Embedding(input_dim=10000, output_dim=128)(inputs)
lstm_layer1 = Bidirectional(LSTM(32, return_sequences=True))(embedding_layer)
attention = SeqSelfAttention(attention_activation='relu')(lstm_layer1)
dropout_layer1 = Dropout(0.5)(attention)
lstm_layer2 = Bidirectional(LSTM(16, return_sequences=True))(dropout_layer1)
dropout_layer2 = Dropout(0.5)(lstm_layer2)
lstm_layer3 = Bidirectional(LSTM(8))(dropout_layer2)
dropout_layer3 = Dropout(0.5)(lstm_layer3)
flatten_layer = Flatten()(dropout_layer3)
reshaped_layer = Reshape((1, 16))(flatten_layer)
pooling_layer = GlobalAveragePooling1D()(reshaped_layer)
output_layer = Dense(1, activation='sigmoid')(pooling_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy = model.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


  f"The initializer {self.__class__.__name__} is unseeded "


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.39482, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss did not improve from 0.39482
Epoch 3/20
Epoch 3: val_loss did not improve from 0.39482
Epoch 4/20
Epoch 4: val_loss did not improve from 0.39482
Epoch 4: early stopping
Test accuracy: 0.8246080875396729


In [73]:
model.evaluate(x_test3,y_test3)



[0.6582249402999878, 0.7163906097412109]

In [74]:
y_pred_ba=model.predict(x_test3)



In [36]:
import pickle
filename = 'biLstm+attention_with_tuning_1.pkl'
pickle.dump(model, open(filename, 'wb'))


Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......bidirectional
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......bidirectional_1
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......bidirectional_2
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
......

In [42]:
import numpy as np
import pandas as pd
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, TimeDistributed, Activation, Flatten, RepeatVector, Permute, Multiply, Lambda, Input, GlobalAveragePooling1D, BatchNormalization, Reshape
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from keras import backend as K
from keras_self_attention import SeqSelfAttention

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Oversample the minority class and undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the bidirectional LSTM model with attention layer
inputs = Input(shape=(145,))
embedding_layer = Embedding(input_dim=10000, output_dim=128)(inputs)
lstm_layer1 = Bidirectional(LSTM(32, return_sequences=True))(embedding_layer)
attention = SeqSelfAttention(attention_activation='relu')(lstm_layer1)
dropout_layer1 = Dropout(0.5)(attention)
lstm_layer2 = Bidirectional(LSTM(16))(dropout_layer1)
dropout_layer2 = Dropout(0.5)(lstm_layer2)
reshape_layer = Reshape((1, 32))(dropout_layer2)
pooling_layer = GlobalAveragePooling1D()(reshape_layer)
batch_norm_layer = BatchNormalization()(pooling_layer)
output_layer = Dense(1, activation='sigmoid')(batch_norm_layer)

model6 = Model(inputs=inputs, outputs=output_layer)

# Compile the model
model6.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model6.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict,callbacks=[early_stop, model_checkpoint])

# Load the best model
model6.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss,accuracy = model6.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


  f"The initializer {self.__class__.__name__} is unseeded "


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.40246, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss did not improve from 0.40246
Epoch 3/20
Epoch 3: val_loss did not improve from 0.40246
Epoch 4/20
Epoch 4: val_loss did not improve from 0.40246
Epoch 4: early stopping
Test accuracy: 0.8216010928153992


In [43]:
model6.evaluate(x_test6,y_test6)



[0.7686564922332764, 0.6310749053955078]

In [38]:
import numpy as np
import pandas as pd
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, TimeDistributed, Activation, Flatten, RepeatVector, Permute, Multiply, Lambda, Input, Reshape, GlobalAveragePooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from keras import backend as K
from keras_self_attention import SeqSelfAttention

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Oversample the minority class and undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the bidirectional LSTM model with attention layer
inputs = Input(shape=(145,))
embedding_layer = Embedding(input_dim=10000, output_dim=128)(inputs)
lstm_layer1 = Bidirectional(LSTM(32, return_sequences=True))(embedding_layer)
attention = SeqSelfAttention(attention_activation='relu')(lstm_layer1)
dropout_layer1 = Dropout(0.5)(attention)
lstm_layer2 = Bidirectional(LSTM(16, return_sequences=True))(dropout_layer1)
dropout_layer2 = Dropout(0.5)(lstm_layer2)
lstm_layer3 = Bidirectional(LSTM(8, return_sequences=True))(dropout_layer2)
dropout_layer3 = Dropout(0.5)(lstm_layer3)
pooling_layer = GlobalAveragePooling1D()(dropout_layer3)
output_layer = Dense(1, activation='sigmoid')(pooling_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy = model.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


  f"The initializer {self.__class__.__name__} is unseeded "


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.39595, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss did not improve from 0.39595
Epoch 3/20
Epoch 3: val_loss did not improve from 0.39595
Epoch 4/20
Epoch 4: val_loss did not improve from 0.39595
Epoch 4: early stopping
Test accuracy: 0.8232278227806091


In [10]:
import numpy as np
import pandas as pd
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, TimeDistributed, Activation, Flatten, RepeatVector, Permute, Multiply, Lambda, Input, Reshape, GlobalAveragePooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from keras import backend as K
from keras_self_attention import SeqSelfAttention

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Oversample the minority class and undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the bidirectional LSTM model with attention layer
inputs = Input(shape=(145,))
embedding_layer = Embedding(input_dim=10000, output_dim=128)(inputs)
lstm_layer1 = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)
attention = SeqSelfAttention(attention_activation='relu')(lstm_layer1)
dropout_layer1 = Dropout(0.5)(attention)
lstm_layer2 = Bidirectional(LSTM(16, return_sequences=True))(dropout_layer1)
dropout_layer2 = Dropout(0.5)(lstm_layer2)
lstm_layer3 = Bidirectional(LSTM(8, return_sequences=True))(dropout_layer2)
dropout_layer3 = Dropout(0.5)(lstm_layer3)
pooling_layer = GlobalAveragePooling1D()(dropout_layer3)
output_layer = Dense(1, activation='sigmoid')(pooling_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy = model.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


  f"The initializer {self.__class__.__name__} is unseeded "


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.39317, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss did not improve from 0.39317
Epoch 3/20
Epoch 3: val_loss did not improve from 0.39317
Epoch 4/20
Epoch 4: val_loss did not improve from 0.39317
Epoch 4: early stopping
Test accuracy: 0.8266292214393616


In [11]:
model.evaluate(x_test6,y_test6)



[0.6494514346122742, 0.6945503950119019]

In [40]:
import pickle
filename = 'biLstm+attention_with_tuning_2.pkl'
pickle.dump(model, open(filename, 'wb'))


Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......bidirectional
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......bidirectional_1
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......bidirectional_2
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
......

In [44]:
import numpy as np
import pandas as pd
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, TimeDistributed, Activation, Flatten, RepeatVector, Permute, Multiply, Lambda, Input, Reshape, GlobalAveragePooling1D, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from keras import backend as K
from keras_self_attention import SeqSelfAttention

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Oversample the minority class and undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the bidirectional LSTM model with attention layer
inputs = Input(shape=(145,))
embedding_layer = Embedding(input_dim=10000, output_dim=128)(inputs)
lstm_layer1 = Bidirectional(LSTM(32, return_sequences=True))(embedding_layer)
bn_layer1 = BatchNormalization()(lstm_layer1)
attention = SeqSelfAttention(attention_activation='relu')(bn_layer1)
dropout_layer1 = Dropout(0.5)(attention)
lstm_layer2 = Bidirectional(LSTM(16, return_sequences=True))(dropout_layer1)
bn_layer2 = BatchNormalization()(lstm_layer2)
dropout_layer2 = Dropout(0.5)(bn_layer2)
lstm_layer3 = Bidirectional(LSTM(8, return_sequences=True))(dropout_layer2)
bn_layer3 = BatchNormalization()(lstm_layer3)
dropout_layer3 = Dropout(0.5)(bn_layer3)
pooling_layer = GlobalAveragePooling1D()(dropout_layer3)
output_layer = Dense(1, activation='sigmoid')(pooling_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy= model.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', accuracy)


  f"The initializer {self.__class__.__name__} is unseeded "


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.40993, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss did not improve from 0.40993
Epoch 3/20
Epoch 3: val_loss did not improve from 0.40993
Epoch 4/20
Epoch 4: val_loss did not improve from 0.40993
Epoch 4: early stopping
Test accuracy: 0.8237208127975464


In [46]:
import numpy as np
import pandas as pd
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, TimeDistributed, Activation, Flatten, RepeatVector, Permute, Multiply, Lambda, Input, Reshape, GlobalAveragePooling1D, concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from keras import backend as K
from keras_self_attention import SeqSelfAttention

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Oversample the minority class and undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the Siamese BiLSTM model with attention layer
embedding_layer = Embedding(input_dim=10000, output_dim=128)
lstm_layer1 = Bidirectional(LSTM(32, return_sequences=True))
attention = SeqSelfAttention(attention_activation='relu')
dropout_layer1 = Dropout(0.5)
lstm_layer2 = Bidirectional(LSTM(16, return_sequences=True))
dropout_layer2 = Dropout(0.5)
lstm_layer3 = Bidirectional(LSTM(8, return_sequences=True))
dropout_layer3 = Dropout(0.5)
pooling_layer = GlobalAveragePooling1D()

input1 = Input(shape=(145,))
input2 = Input(shape=(145,))

encoded1 = dropout_layer1(attention(lstm_layer1(embedding_layer(input1))))
encoded2 = dropout_layer1(attention(lstm_layer1(embedding_layer(input2))))

merged_layer = concatenate([encoded1, encoded2], axis=-1)

merged_layer = lstm_layer2(merged_layer)
merged_layer = dropout_layer2(merged_layer)

merged_layer = lstm_layer3(merged_layer)
merged_layer = dropout_layer3(merged_layer)

merged_layer = pooling_layer(merged_layer)

output_layer = Dense(1, activation='sigmoid')(merged_layer)

model = Model(inputs=[input1, input2], outputs=output_layer)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model.fit([X_train, X_train], y_train, batch_size=32, epochs=20, validation_data=([X_test,X_test], y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy= model.evaluate([X_test,X_test], y_test, batch_size=32)
print('Test accuracy:', accuracy)


  f"The initializer {self.__class__.__name__} is unseeded "


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.39798, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss did not improve from 0.39798
Epoch 3/20
Epoch 3: val_loss did not improve from 0.39798
Epoch 4/20
Epoch 4: val_loss did not improve from 0.39798
Epoch 4: early stopping
Test accuracy: 0.8279109001159668


In [49]:
model.evaluate([x_test6,x_test6],y_test6)



[0.6487360000610352, 0.7174803614616394]

In [50]:
import pickle
filename = 'siamese_biLstm+attention_without_tuning.pkl'
pickle.dump(model, open(filename, 'wb'))


Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......bidirectional
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......bidirectional_1
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......bidirectional_2
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
......

In [51]:
import numpy as np
import pandas as pd
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, TimeDistributed, Activation, Flatten, RepeatVector, Permute, Multiply, Lambda, Input, Reshape, GlobalAveragePooling1D, concatenate, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from keras import backend as K
from keras_self_attention import SeqSelfAttention

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Oversample the minority class and undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the Siamese BiLSTM model with attention layer and BatchNormalization
embedding_layer = Embedding(input_dim=10000, output_dim=128)
lstm_layer1 = Bidirectional(LSTM(64, return_sequences=True))
bn_layer1 = BatchNormalization()
attention = SeqSelfAttention(attention_activation='relu')
dropout_layer1 = Dropout(0.5)
lstm_layer2 = Bidirectional(LSTM(32, return_sequences=True))
bn_layer2 = BatchNormalization()
dropout_layer2 = Dropout(0.5)
lstm_layer3 = Bidirectional(LSTM(16, return_sequences=True))
bn_layer3 = BatchNormalization()
dropout_layer3 = Dropout(0.5)
pooling_layer = GlobalAveragePooling1D()

input1 = Input(shape=(145,))
input2 = Input(shape=(145,))

encoded1 = dropout_layer1(attention(bn_layer1(lstm_layer1(embedding_layer(input1)))))
encoded2 = dropout_layer1(attention(bn_layer1(lstm_layer1(embedding_layer(input2)))))

merged_layer = concatenate([encoded1, encoded2], axis=-1)

merged_layer = dropout_layer2(bn_layer2(lstm_layer2(merged_layer)))

merged_layer = dropout_layer3(bn_layer3(lstm_layer3(merged_layer)))

merged_layer = pooling_layer(merged_layer)

output_layer = Dense(1, activation='sigmoid')(merged_layer)

model = Model(inputs=[input1, input2], outputs=output_layer)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model.fit([X_train, X_train], y_train, batch_size=32, epochs=20, validation_data=([X_test,X_test], y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy= model.evaluate([X_test,X_test], y_test, batch_size=32)
print('Test accuracy:', accuracy)


  f"The initializer {self.__class__.__name__} is unseeded "


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.39313, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss did not improve from 0.39313
Epoch 3/20
Epoch 3: val_loss did not improve from 0.39313
Epoch 4/20
Epoch 4: val_loss did not improve from 0.39313
Epoch 4: early stopping
Test accuracy: 0.8269249796867371


In [10]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import *
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, TimeDistributed, Activation, Flatten, RepeatVector, Permute, Multiply, Lambda, Input, Reshape, GlobalAveragePooling1D, concatenate, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from keras import backend as K
from keras_self_attention import SeqSelfAttention

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Oversample the minority class and undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the Siamese BiLSTM model with attention layer
embedding_layer = Embedding(input_dim=10000, output_dim=128)
batch_norm_layer = BatchNormalization()
lstm_layer1 = Bidirectional(LSTM(32, return_sequences=True))
attention = SeqSelfAttention(attention_activation='relu')
dropout_layer1 = Dropout(0.5)
lstm_layer2 = Bidirectional(LSTM(16, return_sequences=True))
dropout_layer2 = Dropout(0.5)
lstm_layer3 = Bidirectional(LSTM(8, return_sequences=True))
dropout_layer3 = Dropout(0.5)
pooling_layer = GlobalAveragePooling1D()

input1 = Input(shape=(145,))
input2 = Input(shape=(145,))

encoded1 = lstm_layer1(batch_norm_layer(embedding_layer(input1)))
encoded2 = lstm_layer1(batch_norm_layer(embedding_layer(input2)))

encoded1 = dropout_layer1(attention(encoded1))
encoded2 = dropout_layer1(attention(encoded2))

merged_layer = concatenate([encoded1, encoded2], axis=-1)

merged_layer = lstm_layer2(merged_layer)
merged_layer = dropout_layer2(merged_layer)

merged_layer = lstm_layer3(merged_layer)
merged_layer = dropout_layer3(merged_layer)

merged_layer = pooling_layer(merged_layer)

output_layer = Dense(1, activation='sigmoid')(merged_layer)

model = Model(inputs=[input1, input2], outputs=output_layer)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)
# Train the model
history = model.fit([X_train, X_train], y_train, batch_size=32, epochs=20, validation_data=([X_test,X_test], y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy= model.evaluate([X_test,X_test], y_test, batch_size=32)
print('Test accuracy:', accuracy)



  f"The initializer {self.__class__.__name__} is unseeded "


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.39124, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss did not improve from 0.39124
Epoch 3/20
Epoch 3: val_loss did not improve from 0.39124
Epoch 4/20
Epoch 4: val_loss did not improve from 0.39124
Epoch 4: early stopping
Test accuracy: 0.8290939331054688


In [2]:
import numpy as np
import pandas as pd
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, TimeDistributed, Activation, Flatten, RepeatVector, Permute, Multiply, Lambda, Input, Reshape, GlobalAveragePooling1D, concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from keras import backend as K
from keras_self_attention import SeqSelfAttention

# Load the preprocessed data
data1 = pd.read_csv('/kaggle/input/unbalanced/trainu.csv')
data1.dropna(inplace=True)
X = data1['comment_text_processed']
y = data1['Target']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=145)

# Oversample the minority class and undersample the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class_weights_dict = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))

# Build the Siamese BiLSTM model with attention layer
embedding_layer = Embedding(input_dim=10000, output_dim=128)
lstm_layer1 = Bidirectional(LSTM(64, return_sequences=True))
attention = SeqSelfAttention(attention_activation='relu')
dropout_layer1 = Dropout(0.5)
lstm_layer2 = Bidirectional(LSTM(32, return_sequences=True))
dropout_layer2 = Dropout(0.5)
lstm_layer3 = Bidirectional(LSTM(16, return_sequences=True))
dropout_layer3 = Dropout(0.5)
pooling_layer = GlobalAveragePooling1D()

input1 = Input(shape=(145,))
input2 = Input(shape=(145,))

encoded1 = dropout_layer1(attention(lstm_layer1(embedding_layer(input1))))
encoded2 = dropout_layer1(attention(lstm_layer1(embedding_layer(input2))))

merged_layer = concatenate([encoded1, encoded2], axis=-1)

merged_layer = lstm_layer2(merged_layer)
merged_layer = dropout_layer2(merged_layer)

merged_layer = lstm_layer3(merged_layer)
merged_layer = dropout_layer3(merged_layer)

merged_layer = pooling_layer(merged_layer)

output_layer = Dense(1, activation='sigmoid')(merged_layer)

model = Model(inputs=[input1, input2], outputs=output_layer)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks for early stopping and saving the best model
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# Train the model
history = model.fit([X_train, X_train], y_train, batch_size=32, epochs=20, validation_data=([X_test,X_test], y_test), class_weight=class_weights_dict, callbacks=[early_stop, model_checkpoint])

# Load the best model
model.load_weights('best_model.h5')

# Evaluate the model on the testing set
loss, accuracy= model.evaluate([X_test,X_test], y_test, batch_size=32)
print('Test accuracy:', accuracy)


  f"The initializer {self.__class__.__name__} is unseeded "


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.39574, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss did not improve from 0.39574
Epoch 3/20
Epoch 3: val_loss did not improve from 0.39574
Epoch 4/20
Epoch 4: val_loss did not improve from 0.39574
Epoch 4: early stopping
Test accuracy: 0.8230799436569214


In [13]:
model.evaluate([x_test3,x_test3],y_test3)



[0.6081710457801819, 0.7306397557258606]

In [None]:
model.evaluate([x_test3,x_test3],y_test3)



In [27]:
y_pred=model.predict([x_test3,x_test3])



In [30]:
test4.head()

Unnamed: 0,comment_text,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,other,gender,religion,race,disability,Target,processed_comment_text,comment_text_processed
0,So between the 2 civil lawyers going for the j...,0,0,0,0,0,0,1,0,0,0,0,0,"['civil', 'lawyer', 'going', 'job', 'one', 'st...",civil lawyer going job one stellar reputation ...
1,Hope they have bullet proof glass and bomb bar...,0,0,0,0,0,0,1,0,0,0,0,0,"['hope', 'bullet', 'proof', 'glass', 'bomb', '...",hope bullet proof glass bomb barrier well armed
2,"""...They realize the inter-connectedness betwe...",0,0,0,0,0,0,1,0,0,0,0,0,"['they', 'realize', 'interconnectedness', 'nat...",they realize interconnectedness nation world n...
3,"I'm a Raider fan, but I agree with Finley. Th...",0,0,0,0,0,0,1,0,0,0,0,0,"['raider', 'fan', 'agree', 'finley', 'these', ...",raider fan agree finley these player sit anthe...
4,I voted for Trump and it was not for any reaso...,0,0,0,0,0,0,1,0,0,0,0,0,"['voted', 'trump', 'reason', 'article', 'faceb...",voted trump reason article facebook what mains...


In [31]:
# Define the columns for which to make predictions
cols_to_predict = [col for col in test4.columns if col not in ['comment_text', 'processed_comment_text', 'comment_text_processed','severe_toxicity','obscene','sexual_explicit','identity_attack','insult','threat','other','gender','religion','race','disability']]

# Add predicted values to the test dataset
for i, col in enumerate(cols_to_predict):
    test4[col + '_pred'] = y_pred[:, i]

# Export the test dataset with predicted values to a CSV file
test4.to_csv('test_with_predictions_decimals_siamesebilstm_single.csv', index=False)

In [32]:
pred=pd.read_csv('/kaggle/working/test_with_predictions_decimals_siamesebilstm_single.csv')

In [33]:
pred.head()

Unnamed: 0,comment_text,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,other,gender,religion,race,disability,Target,processed_comment_text,comment_text_processed,Target_pred
0,So between the 2 civil lawyers going for the j...,0,0,0,0,0,0,1,0,0,0,0,0,"['civil', 'lawyer', 'going', 'job', 'one', 'st...",civil lawyer going job one stellar reputation ...,0.228387
1,Hope they have bullet proof glass and bomb bar...,0,0,0,0,0,0,1,0,0,0,0,0,"['hope', 'bullet', 'proof', 'glass', 'bomb', '...",hope bullet proof glass bomb barrier well armed,0.500302
2,"""...They realize the inter-connectedness betwe...",0,0,0,0,0,0,1,0,0,0,0,0,"['they', 'realize', 'interconnectedness', 'nat...",they realize interconnectedness nation world n...,0.046323
3,"I'm a Raider fan, but I agree with Finley. Th...",0,0,0,0,0,0,1,0,0,0,0,0,"['raider', 'fan', 'agree', 'finley', 'these', ...",raider fan agree finley these player sit anthe...,0.157499
4,I voted for Trump and it was not for any reaso...,0,0,0,0,0,0,1,0,0,0,0,0,"['voted', 'trump', 'reason', 'article', 'faceb...",voted trump reason article facebook what mains...,0.302171


In [28]:
# Calculate F1 score
from sklearn.metrics import f1_score
f1 = f1_score(y_test3, y_pred.round(), average='macro')
print('F1 score:', f1)

F1 score: 0.5263960627286325


In [14]:
import pickle
filename = 'siamese_biLstm+attention_with_tuning1.pkl'
pickle.dump(model, open(filename, 'wb'))


Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......bidirectional
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......bidirectional_1
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......bidirectional_2
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
......