In [9]:
# Import necessary modules
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split

# Upload dataset files in Colab
from google.colab import files
uploaded = files.upload()

# Load datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Text preprocessing function
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'https?://\S+', '', text)   # Remove URLs
    text = re.sub(r'@\w+', '', text)            # Remove mentions
    text = re.sub(r'[^a-z\s]', '', text)        # Remove special chars and digits
    text = re.sub(r'\s+', ' ', text).strip()   # Remove extra spaces
    return text

# Clean train and test texts
train_df['clean_text'] = train_df['text'].map(clean_text)
test_df['clean_text'] = test_df['text'].map(clean_text)

# Tokenizer setup
MAX_VOCAB_SIZE = 10000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(train_df['clean_text'])

# Convert texts to sequences and pad
MAX_SEQUENCE_LENGTH = 100

train_seq = tokenizer.texts_to_sequences(train_df['clean_text'])
train_padded = pad_sequences(train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

test_seq = tokenizer.texts_to_sequences(test_df['clean_text'])
test_padded = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Prepare labels
y = train_df['target'].values

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(train_padded, y, test_size=0.2, random_state=42)

# Build LSTM deep learning model
EMBEDDING_DIM = 100
model = Sequential([
    Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=64)

# Predict on test set and create submission file
pred_prob = model.predict(test_padded)
pred_labels = (pred_prob > 0.5).astype(int).reshape(-1)

submission = pd.DataFrame({
    'id': test_df['id'],
    'target': pred_labels
})

submission.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")

# Download the submission file
files.download('submission.csv')

# Function to predict any single tweet text
def predict_single_tweet(tweet_text):
    cleaned = clean_text(tweet_text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    prob = model.predict(padded)[0][0]
    label = 1 if prob > 0.5 else 0
    return label, prob

# Example interactive testing:
example_tweet = "There is a wildfire near my house, please stay safe!"
label, probability = predict_single_tweet(example_tweet)
print(f"Tweet: {example_tweet}")
print(f"Predicted label: {label} (1=disaster, 0=not disaster), Confidence: {probability:.4f}")


Saving sample_submission.csv to sample_submission (2).csv
Saving test.csv to test (2).csv
Saving train.csv to train (2).csv




Epoch 1/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 337ms/step - accuracy: 0.6224 - loss: 0.6400 - val_accuracy: 0.7984 - val_loss: 0.4417
Epoch 2/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 333ms/step - accuracy: 0.8670 - loss: 0.3320 - val_accuracy: 0.7938 - val_loss: 0.4496
Epoch 3/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 327ms/step - accuracy: 0.9173 - loss: 0.2336 - val_accuracy: 0.7728 - val_loss: 0.5030
Epoch 4/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 332ms/step - accuracy: 0.9450 - loss: 0.1688 - val_accuracy: 0.7905 - val_loss: 0.6099
Epoch 5/5
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 331ms/step - accuracy: 0.9598 - loss: 0.1247 - val_accuracy: 0.7814 - val_loss: 0.7214
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 45ms/step
Submission file saved as submission.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Tweet: There is a wildfire near my house, please stay safe!
Predicted label: 1 (1=disaster, 0=not disaster), Confidence: 0.7902
