In [37]:
import pandas as pd
from NlpStudyFunctions import CleanDataframe
from sklearn.model_selection import train_test_split

train_df = pd.read_csv(r'../data/raw/train.csv')

#clean the df
#train_df = CleanDataframe(train_df_raw,'text')

#split train-test set
x_train, x_test, y_train, y_test = train_test_split(train_df['text'], train_df['target'], test_size=0.2, random_state=42)

In [38]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Convert lists to NumPy arrays (if not already done)
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

# Number of words to keep based on word frequency.
num_words = 10000

# Using the Tokenizer class from Keras
tokenizer = Tokenizer(num_words=num_words, lower=False)
tokenizer.fit_on_texts(x_train)

# Convert x_train and x_test to lists of strings if they are not already in that format
x_train_list = x_train.tolist() if isinstance(x_train, np.ndarray) else x_train
x_test_list = x_test.tolist() if isinstance(x_test, np.ndarray) else x_test

# Convert texts to sequences of integers
x_train_sequences = tokenizer.texts_to_sequences(x_train_list)
x_test_sequences = tokenizer.texts_to_sequences(x_test_list)

# Calculate the maximum sequence length
max_sequence_length = max(len(x) for x in x_train_sequences)

# Pad sequences to the same length
x_train_padded = pad_sequences(x_train_sequences, maxlen=max_sequence_length)
x_test_padded = pad_sequences(x_test_sequences, maxlen=max_sequence_length)

In [39]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, Conv1D, MaxPooling1D, Dense, Dropout
from keras.optimizers import Adam

# Assuming num_words and max_sequence_length are defined
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=200, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=True, dropout=0.2))
model.add(LSTM(512, dropout=0.2))
model.add(Dense(1024, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(2, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(x_train_padded, y_train, validation_data=(x_test_padded, y_test), epochs=10, batch_size=32)

Epoch 1/10




[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 104ms/step - accuracy: 0.6670 - loss: 0.6075 - val_accuracy: 0.7932 - val_loss: 0.4717
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 138ms/step - accuracy: 0.8722 - loss: 0.3205 - val_accuracy: 0.7873 - val_loss: 0.4901
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 123ms/step - accuracy: 0.9201 - loss: 0.2221 - val_accuracy: 0.7623 - val_loss: 0.5066
Epoch 4/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 108ms/step - accuracy: 0.9578 - loss: 0.1333 - val_accuracy: 0.7584 - val_loss: 0.6253
Epoch 5/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 109ms/step - accuracy: 0.9717 - loss: 0.0828 - val_accuracy: 0.7649 - val_loss: 0.9784
Epoch 6/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 110ms/step - accuracy: 0.9739 - loss: 0.0609 - val_accuracy: 0.7584 - val_loss: 1.1586
Epoch 7/10
[1m191/19

<keras.src.callbacks.history.History at 0x20d26899370>

In [40]:
loss, accuracy = model.evaluate(x_test_padded, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.7355 - loss: 1.4337
Test Loss: 1.3415
Test Accuracy: 0.7380


In [41]:
#save model
model.save(r'../models/DL_MODEL.keras')

In [42]:
import keras

text_test_df = pd.read_csv(r'../data/raw/test.csv')

text_test_df = np.array(text_test_df['text'])

tokenizer.fit_on_texts(text_test_df)

text_test_df_list = text_test_df.tolist() if isinstance(text_test_df, np.ndarray) else text_test_df

test_df_sequences = tokenizer.texts_to_sequences(text_test_df_list)

# Pad sequences to the same length
test_df_padded = pad_sequences(test_df_sequences, maxlen=max_sequence_length)

model = keras.models.load_model(r'../models/DL_MODEL.keras')

pred = model.predict(test_df_padded)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step


In [43]:
predictions = pd.DataFrame(pred)

# Assuming `predictions` is already a DataFrame
predictions.columns = ['column_0', 'column_1']

# Update `column_0` based on the comparison
for index, row in predictions.iterrows():
    if row['column_0'] < row['column_1']:
        predictions.at[index, 'column_0'] = 1
    else:
        predictions.at[index, 'column_0'] = 0

# Convert `column_0` to integer type
predictions['column_0'] = predictions['column_0'].astype(int)

# Create the binary_predictions variable
binary_predictions = predictions['column_0']

# Create the submission DataFrame
submission = pd.DataFrame({'id': test_df_raw['id'], 'target': binary_predictions})

In [44]:
submission.to_csv(r'../data/final/submission.csv',index=False)

submission = pd.read_csv(r'../data/final/submission.csv')
submission.describe()

Unnamed: 0,id,target
count,3263.0,3263.0
mean,5427.152927,0.523751
std,3146.427221,0.499512
min,0.0,0.0
25%,2683.0,0.0
50%,5500.0,1.0
75%,8176.0,1.0
max,10875.0,1.0


In [45]:
submission["target"].value_counts()

target
1    1709
0    1554
Name: count, dtype: int64

In [46]:
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


In [47]:
import kaggle

# Replace 'submission.csv' with the path to your submission file
submission_file = r'../data/final/submission.csv'

# Replace 'Message' with your submission message
submission_message = 'DL model try'

# Call the submit function from kaggle package
kaggle.api.competition_submit(submission_file, submission_message, competition='nlp-getting-started')

100%|██████████| 25.4k/25.4k [00:00<00:00, 48.1kB/s]


Successfully submitted to Natural Language Processing with Disaster Tweets