In [1]:
import numpy as np # linear algebra
import pandas as pd 
import re

In [2]:
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [3]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
print("Non Disaster Tweet Count : ",train_df[train_df['target'] == 0]['text'].shape[0])
print("Disaster Tweet Count : ",train_df[train_df['target'] == 1]['text'].shape[0])

Non Disaster Tweet Count :  4342
Disaster Tweet Count :  3271


In [5]:
train_df.groupby('target').count()['text'].reset_index()

Unnamed: 0,target,text
0,0,4342
1,1,3271


In [6]:
train_df['word_count'] = train_df['text'].apply(lambda x: len(str(x).split()))

In [7]:
train_df.loc[:,['text','word_count']].head()

Unnamed: 0,text,word_count
0,Our Deeds are the Reason of this #earthquake M...,13
1,Forest fire near La Ronge Sask. Canada,7
2,All residents asked to 'shelter in place' are ...,22
3,"13,000 people receive #wildfires evacuation or...",8
4,Just got sent this photo from Ruby #Alaska as ...,16


In [8]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

In [9]:
train_df['modified_text'] = train_df['text'].apply(clean_text)

In [10]:
test_df['modified_text'] = test_df['text'].apply(clean_text)

In [11]:
max(train_df['word_count'])

31

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Embedding, Dense
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

2025-10-23 13:06:58.799264: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761224819.065045      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761224819.143423      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [13]:
max_words = 10000
max_len = 50
EMBEDDING_DIM = 100

In [14]:
tokenizer = Tokenizer(
    num_words = max_words
)

tokenizer.fit_on_texts(train_df['modified_text'])

In [15]:
X_train = tokenizer.texts_to_sequences(train_df['modified_text'])
X_test = tokenizer.texts_to_sequences(test_df['modified_text'])

In [16]:
X_train = pad_sequences(
    X_train,
    maxlen = max_len,
    padding = 'post'
)

X_test = pad_sequences(
    X_test,
    maxlen = max_len,
    padding = 'post'
)


In [17]:
Y_train = train_df['target']

In [18]:
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Vocabulary size: {len(tokenizer.word_index)}")

X_train shape: (7613, 50)
X_test shape: (3263, 50)
Vocabulary size: 13527


In [19]:
model = Sequential([
    Embedding(input_dim = max_words, output_dim = EMBEDDING_DIM),
    LSTM(64, dropout = 0.2),
    Dense(32, activation = 'relu'),
    Dropout(0.5),
    Dense(1, activation = 'sigmoid')
])

I0000 00:00:1761224834.030703      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1761224834.031402      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [20]:
model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy','f1_score']
)

In [21]:
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=42
)

In [22]:
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1
)

In [23]:
history = model.fit(
    X_train_split, y_train_split,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64,
    callbacks=[early_stopping],
    verbose=1
)

Epoch 1/10


I0000 00:00:1761224840.046536      62 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - accuracy: 0.5599 - f1_score: 0.6065 - loss: 0.6884 - val_accuracy: 0.5739 - val_f1_score: 0.5976 - val_loss: 0.6823
Epoch 2/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5677 - f1_score: 0.6032 - loss: 0.6863 - val_accuracy: 0.5739 - val_f1_score: 0.5976 - val_loss: 0.6841
Epoch 3/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5750 - f1_score: 0.5965 - loss: 0.6827 - val_accuracy: 0.5739 - val_f1_score: 0.5976 - val_loss: 0.6829
Epoch 4/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5660 - f1_score: 0.6053 - loss: 0.6854 - val_accuracy: 0.5739 - val_f1_score: 0.5976 - val_loss: 0.6820
Epoch 5/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5684 - f1_score: 0.6028 - loss: 0.6855 - val_accuracy: 0.5739 - val_f1_score: 0.5976 - val_loss: 0.6834


In [24]:
pred = model.predict(X_test)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [25]:
submission = pd.DataFrame({
    'id': test_df['id'],
    'target': pred.ravel()
})

submission.to_csv('submission_lstm.csv', index=False)