In [2]:
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip
!unzip glove.twitter.27B.zip

In [3]:
import numpy as np # linear algebra
import pandas as pd 
import re

In [4]:
# Read the competition's train and test CSV files into DataFrames.
train_df, test_df = map(
    pd.read_csv,
    (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    ),
)

In [5]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# Number of tweets per class (target 0 = non-disaster, target 1 = disaster).
print("Non Disaster Tweet Count : ", len(train_df.loc[train_df['target'] == 0, 'text']))
print("Disaster Tweet Count : ", len(train_df.loc[train_df['target'] == 1, 'text']))

Non Disaster Tweet Count :  4342
Disaster Tweet Count :  3271


In [7]:
# Same per-class counts as a small table: select the column first, then count.
train_df.groupby('target')['text'].count().reset_index()

Unnamed: 0,target,text
0,0,4342
1,1,3271


In [8]:
# Whitespace-separated token count of each raw tweet (str() guards non-string cells).
train_df['word_count'] = [len(str(tweet).split()) for tweet in train_df['text']]

In [9]:
# Show the raw text next to its token count for the first rows.
train_df[['text', 'word_count']].head()

Unnamed: 0,text,word_count
0,Our Deeds are the Reason of this #earthquake M...,13
1,Forest fire near La Ronge Sask. Canada,7
2,All residents asked to 'shelter in place' are ...,22
3,"13,000 people receive #wildfires evacuation or...",8
4,Just got sent this photo from Ruby #Alaska as ...,16


In [10]:
def clean_text(text):
    """Normalise a tweet to lowercase ASCII letters separated by single spaces.

    Steps, in order: strip URLs, drop every character that is neither an
    ASCII letter nor whitespace (digits, punctuation, '@', '#', ...),
    lowercase, and collapse whitespace runs.

    Args:
        text: Raw tweet. ``None`` and float NaN (a missing cell in a pandas
            column) are treated as an empty tweet; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned string, or ``''`` when nothing survives the filtering.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    # Remove URLs. 'https...' is already matched by the 'http' alternative,
    # so a separate https branch is unnecessary.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove special characters and numbers in a single pass. A preliminary
    # filter that keeps digits, '@' and '#' would be redundant because this
    # stricter filter deletes them anyway.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Lowercase, then collapse runs of whitespace and trim the ends.
    return ' '.join(text.lower().split())

In [11]:
# Cleaned version of every training tweet, kept alongside the raw text.
train_df['modified_text'] = train_df['text'].map(clean_text)

In [12]:
# Apply the identical cleaning to the test tweets.
test_df['modified_text'] = test_df['text'].map(clean_text)

In [13]:
# Longest tweet in whitespace tokens; int() keeps the displayed value a plain Python int.
int(train_df['word_count'].max())

31

In [14]:
# Class balance: fraction of training rows per target value.
train_df['target'].value_counts(normalize=True)

target
0    0.57034
1    0.42966
Name: proportion, dtype: float64

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Embedding, Dense, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

2025-10-27 13:50:36.975316: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761573037.357940      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761573037.489872      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [16]:
# Summary statistics of the per-tweet token counts.
train_df['word_count'].describe()

count    7613.000000
mean       14.903586
std         5.732604
min         1.000000
25%        11.000000
50%        15.000000
75%        19.000000
max        31.000000
Name: word_count, dtype: float64

In [17]:
# Hyper-parameters shared by the tokenizer, the padding step and the embedding layer.
max_words = 20_000    # tokenizer num_words and the embedding's input_dim
max_len = 50          # every sequence is padded to this many token ids
EMBEDDING_DIM = 100   # embedding width; the 100d GloVe file is loaded

In [18]:
# Fit the word -> index vocabulary on the cleaned training tweets only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_df['modified_text'])

In [19]:
# Encode the cleaned tweets of both frames as lists of token ids.
X_train, X_test = (
    tokenizer.texts_to_sequences(frame['modified_text'])
    for frame in (train_df, test_df)
)

In [20]:
# Bring every sequence to max_len, adding the padding after the tokens ('post').
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len, padding='post')
    for sequences in (X_train, X_test)
)


In [21]:
# Parse the GloVe text file into a {token: float32 vector} lookup.
embeddings_index = {}

with open("/kaggle/working/glove.twitter.27B.100d.txt", encoding='utf8') as f:
    for line in f:
        # Split from the right on single spaces so exactly EMBEDDING_DIM
        # fields are taken as the vector and whatever precedes them is the
        # token. A bare line.split() splits on any Unicode whitespace, so a
        # token that is itself such a character would disappear and the
        # first coefficient would be stored as the word.
        values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        if len(values) != EMBEDDING_DIM + 1:
            # Not "<token> + EMBEDDING_DIM numbers": skip the malformed line.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f"Loaded {len(embeddings_index)} word vectors.")

Loaded 1193514 word vectors.


In [22]:
# Row i of the matrix receives the GloVe vector of the token whose index is i.
# Tokens without a pre-trained vector keep their all-zero row.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((max_words, EMBEDDING_DIM))

for token, row in word_index.items():
    if row >= max_words:
        continue  # index falls outside the matrix
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        continue  # no GloVe entry for this token
    embedding_matrix[row] = pretrained


In [23]:
# Training labels: the 'target' column of the training frame.
Y_train = train_df['target']

In [24]:
# Sanity check: padded matrix shapes and the size of the fitted vocabulary.
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"Vocabulary size: {len(tokenizer.word_index)}",
    sep="\n",
)

X_train shape: (7613, 50)
X_test shape: (3263, 50)
Vocabulary size: 16894


In [25]:
# Binary classifier: embedding -> bidirectional LSTM -> dense head.
model = Sequential()

# Embedding starts from embedding_matrix and stays trainable.
model.add(
    Embedding(
        input_dim=max_words,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=True,
    )
)
# 64 LSTM units per direction, with dropout on the inputs and on the recurrent state.
model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
# Single sigmoid unit as the output layer.
model.add(Dense(1, activation='sigmoid'))

I0000 00:00:1761573082.858678      37 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1761573082.859416      37 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [26]:
# Adam optimiser with binary cross-entropy loss; accuracy is tracked during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [27]:
# Hold out 20% of the training rows for validation; the fixed seed makes the split repeatable.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train,
    Y_train,
    random_state=42,
    test_size=0.2,
)

In [28]:
# Stop once val_loss has failed to improve for 3 epochs and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True, verbose=1
)

In [29]:
# Train for at most 10 epochs; early stopping watches the held-out validation split.
history = model.fit(
    x=X_train_split,
    y=y_train_split,
    batch_size=64,
    epochs=10,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 217ms/step - accuracy: 0.6436 - loss: 0.6186 - val_accuracy: 0.8148 - val_loss: 0.4328
Epoch 2/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 200ms/step - accuracy: 0.8083 - loss: 0.4512 - val_accuracy: 0.8201 - val_loss: 0.4202
Epoch 3/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 201ms/step - accuracy: 0.8374 - loss: 0.4006 - val_accuracy: 0.8306 - val_loss: 0.4042
Epoch 4/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 202ms/step - accuracy: 0.8664 - loss: 0.3435 - val_accuracy: 0.8024 - val_loss: 0.4455
Epoch 5/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 201ms/step - accuracy: 0.8803 - loss: 0.3080 - val_accuracy: 0.8339 - val_loss: 0.4356
Epoch 6/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 202ms/step - accuracy: 0.9062 - loss: 0.2469 - val_accuracy: 0.8102 - val_loss: 0.4860
Epoch 6: early stoppin

In [30]:
# Run the model on the padded test sequences; predd holds its sigmoid outputs.
predd = model.predict(X_test)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 49ms/step


In [31]:
# Size of the tokenizer's word_index.
print(len(tokenizer.word_index))


16894


In [32]:
# Flatten the predictions, then label a tweet 1 when its score exceeds 0.5.
pred = (predd.ravel() > 0.5).astype(int)

In [33]:
# Pair each test id with its predicted label and write the submission file.
submission = test_df[['id']].assign(target=pred.ravel())

submission.to_csv('submission.csv', index=False)