In [22]:
import pandas as pd
from NlpStudyFunctions import CleanDataframe
from sklearn.model_selection import train_test_split

train_df_raw = pd.read_csv(r'../data/raw/train.csv')

#clean the df
train_df = CleanDataframe(train_df_raw,'text')

#split train-test set
x_train, x_test, y_train, y_test = train_test_split(train_df['text'], train_df['target'], test_size=0.2, random_state=42)


In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Convert lists to NumPy arrays (if not already done)
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

# Number of words to keep based on word frequency.
num_words = 10000

# Using the Tokenizer class from Keras
tokenizer = Tokenizer(num_words=num_words, lower=False)
tokenizer.fit_on_texts(x_train)

# Convert x_train and x_test to lists of strings if they are not already in that format
x_train_list = x_train.tolist() if isinstance(x_train, np.ndarray) else x_train
x_test_list = x_test.tolist() if isinstance(x_test, np.ndarray) else x_test

# Convert texts to sequences of integers
x_train_sequences = tokenizer.texts_to_sequences(x_train_list)
x_test_sequences = tokenizer.texts_to_sequences(x_test_list)

# Calculate the maximum sequence length
max_sequence_length = max(len(x) for x in x_train_sequences)

# Pad sequences to the same length
x_train_padded = pad_sequences(x_train_sequences, maxlen=max_sequence_length)
x_test_padded = pad_sequences(x_test_sequences, maxlen=max_sequence_length)

In [3]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, Conv1D, MaxPooling1D, Dense, Dropout
from keras.optimizers import Adam

# Assuming num_words and max_sequence_length are defined
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=500, input_length=max_sequence_length, trainable=True))

# Add Conv1D layer
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Use Bidirectional LSTM
model.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model.add(Bidirectional(LSTM(256, dropout=0.2, recurrent_dropout=0.2)))

# Add more Dense layers with regularization
model.add(Dense(1024, activation='relu', kernel_regularizer='l2'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu', kernel_regularizer='l2'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))

# Final output layer
model.add(Dense(2, activation='softmax'))

# Compile model with a different optimizer and learning rate
optimizer = Adam(learning_rate=1e-4)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Fit the model
model.fit(x_train_padded, y_train, validation_data=(x_test_padded, y_test), epochs=10, batch_size=32)


Epoch 1/10




[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 165ms/step - accuracy: 0.5750 - loss: 9.6114 - val_accuracy: 0.5739 - val_loss: 5.1439
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 164ms/step - accuracy: 0.6107 - loss: 4.2679 - val_accuracy: 0.7794 - val_loss: 2.2183
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 159ms/step - accuracy: 0.8448 - loss: 1.7851 - val_accuracy: 0.8056 - val_loss: 1.1063
Epoch 4/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 154ms/step - accuracy: 0.9092 - loss: 0.7729 - val_accuracy: 0.7873 - val_loss: 0.7166
Epoch 5/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 146ms/step - accuracy: 0.9383 - loss: 0.3857 - val_accuracy: 0.7925 - val_loss: 0.6256
Epoch 6/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 173ms/step - accuracy: 0.9517 - loss: 0.2256 - val_accuracy: 0.7643 - val_loss: 0.7546
Epoch 7/10
[1m191/19

<keras.src.callbacks.history.History at 0x1f1c7844080>

In [16]:
loss, accuracy = model.evaluate(x_test_padded, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.7460 - loss: 1.1323
Test Loss: 1.0533
Test Accuracy: 0.7603


In [17]:
#save model
model.save(r'../models/DL_MODEL.keras')

In [14]:
import keras

test_df_raw = pd.read_csv(r'../data/raw/test.csv')

# Clean the df
text_test_df = CleanDataframe(test_df_raw,'text')

# Convert lists to NumPy arrays (if not already done)
text_test_df = np.array(text_test_df)

# Convert back to list
text_test_df = text_test_df.tolist() if isinstance(text_test_df, np.ndarray) else text_test_df

# Convert numbers to strings
text_test_df = [str(item) for item in text_test_df]

# Now, proceed with tokenization
tokenizer.fit_on_texts(text_test_df)

# Fit tokenizer on text
tokenizer.fit_on_texts(text_test_df)

test_df_sequences = tokenizer.texts_to_sequences(text_test_df)

# Calculate the maximum sequence length
max_sequence_length = max(len(x) for x in test_df_sequences)

# Pad sequences to the same length
test_df_padded = pad_sequences(test_df_sequences, maxlen=max_sequence_length)

model = keras.models.load_model(r'../models/DL_MODEL.keras')

predictions = model.predict(test_df_padded)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 60ms/step


In [7]:
predictions = pd.DataFrame(predictions)

# Assuming `predictions` is already a DataFrame
predictions.columns = ['column_0', 'column_1']

# Update `column_0` based on the comparison
for index, row in predictions.iterrows():
    if row['column_0'] < row['column_1']:
        predictions.at[index, 'column_0'] = 1
    else:
        predictions.at[index, 'column_0'] = 0

# Convert `column_0` to integer type
predictions['column_0'] = predictions['column_0'].astype(int)

# Create the binary_predictions variable
binary_predictions = predictions['column_0']

# Create the submission DataFrame
submission = pd.DataFrame({'id': test_df_raw['id'], 'target': binary_predictions})

         id  target
0         0       0
1         2       0
2         3       1
3         9       0
4        11       1
...     ...     ...
3258  10861       0
3259  10865       0
3260  10868       1
3261  10874       0
3262  10875       0

[3263 rows x 2 columns]


In [10]:
submission.to_csv(r'../data/final/submission.csv',index=False)

submission = pd.read_csv(r'../data/final/submission.csv')
submission.describe()

Unnamed: 0,id,target
count,3263.0,3263.0
mean,5427.152927,0.403923
std,3146.427221,0.490758
min,0.0,0.0
25%,2683.0,0.0
50%,5500.0,0.0
75%,8176.0,1.0
max,10875.0,1.0


In [11]:
import kaggle

# Replace 'submission.csv' with the path to your submission file
submission_file = r'../data/final/submission.csv'

# Replace 'Message' with your submission message
submission_message = 'DL model try'

# Call the submit function from kaggle package
#kaggle.api.competition_submit(submission_file, submission_message, competition='nlp-getting-started')