In [3]:
import pandas as pd
from NlpStudyFunctions import CleanDataframe
from sklearn.model_selection import train_test_split

train_df_raw = pd.read_csv(r'../train.csv')

#clean the df
train_df = CleanDataframe(train_df_raw,'text')

#split train-test set
x_train, x_test, y_train, y_test = train_test_split(train_df['text'], train_df['target'], test_size=0.2, random_state=42)


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


# Convert lists to NumPy arrays (if not already done)
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

# Number of words to keep based on word frequency.
num_words = 10000

# Using the Tokenizer class from Keras
tokenizer = Tokenizer(num_words=num_words) 
tokenizer.fit_on_texts(x_train)

# Convert x_train and x_test to lists of strings if they are not already in that format
x_train_list = x_train.tolist() if isinstance(x_train, np.ndarray) else x_train
x_test_list = x_test.tolist() if isinstance(x_test, np.ndarray) else x_test

# Convert texts to sequences of integers
x_train_sequences = tokenizer.texts_to_sequences(x_train_list)
x_test_sequences = tokenizer.texts_to_sequences(x_test_list)


# Calculate the maximum sequence length
max_sequence_length = max(len(x) for x in x_train_sequences)

# Pad sequences to the same length
x_train_padded = pad_sequences(x_train_sequences, maxlen=max_sequence_length)
x_test_padded = pad_sequences(x_test_sequences, maxlen=max_sequence_length)

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy

#building deep learning model
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=500, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=True, dropout=0.2))
model.add(LSTM(512, dropout=0.2))
model.add(Dense(1024, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(2, activation='softmax'))


# Explicitly build the model
model.build(input_shape=(None, max_sequence_length))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [6]:
model.fit(x_train_padded, y_train, validation_data=(x_test_padded, y_test), epochs=10, batch_size=32)


Epoch 1/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 242ms/step - accuracy: 0.6756 - loss: 0.5921 - val_accuracy: 0.7859 - val_loss: 0.4739
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 231ms/step - accuracy: 0.8781 - loss: 0.3204 - val_accuracy: 0.8050 - val_loss: 0.4598
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 231ms/step - accuracy: 0.9242 - loss: 0.2037 - val_accuracy: 0.7781 - val_loss: 0.5441
Epoch 4/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 230ms/step - accuracy: 0.9549 - loss: 0.1274 - val_accuracy: 0.7649 - val_loss: 0.7068
Epoch 5/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 230ms/step - accuracy: 0.9672 - loss: 0.0835 - val_accuracy: 0.7459 - val_loss: 0.8877
Epoch 6/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 239ms/step - accuracy: 0.9714 - loss: 0.0673 - val_accuracy: 0.7354 - val_loss: 0.9688
Epoch 7/10

<keras.src.callbacks.history.History at 0x176f1882480>

In [7]:
loss, accuracy = model.evaluate(x_test_padded, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 82ms/step - accuracy: 0.7450 - loss: 1.8533
Test Loss: 1.7992
Test Accuracy: 0.7590


In [15]:
test_df_raw = pd.read_csv(r'../test.csv')

# Clean the df
text_test_df = CleanDataframe(test_df_raw,'text')

# Convert lists to NumPy arrays (if not already done)
text_test_df = np.array(text_test_df)

um_words = 10000

# Convert back to list
text_test_df = text_test_df.tolist() if isinstance(text_test_df, np.ndarray) else text_test_df

# Convert numbers to strings
text_test_df = [str(item) for item in text_test_df]

# Now, proceed with tokenization
tokenizer.fit_on_texts(text_test_df)

# Fit tokenizer on text
tokenizer.fit_on_texts(text_test_df)

test_df_sequences = tokenizer.texts_to_sequences(text_test_df)

# Calculate the maximum sequence length
max_sequence_length = max(len(x) for x in test_df_sequences)

# Pad sequences to the same length
test_df_padded = pad_sequences(test_df_sequences, maxlen=max_sequence_length)


predictions = model.predict(test_df_padded)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 152ms/step


In [32]:
import pandas as pd
import numpy as np

# Assuming predictions is a NumPy array containing the predictions
# Also assuming threshold is defined earlier in your code

binary_predictions = [1 if pred[0] > threshold else 0 for pred in predictions]

# Create a DataFrame with 'id' column from test_df_raw and 'binary_predictions'
submission = pd.DataFrame({'id': test_df_raw['id'], 'target': binary_predictions})

# Now, you have a DataFrame named submission with 'id' and 'prediction' columns


In [37]:
submission.to_csv(r'../submission.csv',index=False)

In [39]:
import kaggle

# Replace 'submission.csv' with the path to your submission file
submission_file = r'../submission.csv'

# Replace 'Message' with your submission message
submission_message = 'DL model try'

# Call the submit function from kaggle package
kaggle.api.competition_submit(submission_file, submission_message, competition='nlp-getting-started')

100%|██████████| 25.4k/25.4k [00:00<00:00, 50.5kB/s]


Successfully submitted to Natural Language Processing with Disaster Tweets