In [76]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import re

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split 

from gensim.models import Word2Vec

In [77]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
target = df_train['target']

In [78]:
df_train.drop(['id','keyword' , 'location'] , axis = 1)

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


In [79]:
def preprocess(text):
    text = text.lower() # Convert to lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text) # Remove URLs
    text = re.sub(r'<.*?>', '', text) # Remove HTML tags
    text = re.sub(r'\d+', '', text) # Remove digits
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = re.sub(r'\n', '', text) # Remove newline characters
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra spaces
    text = re.sub('\[.*?\]', '', text)
    return text

In [80]:
df_train['clean_text'] = df_train['text'].apply(preprocess)


In [81]:
df_train['processed_text'] = df_train['text'].apply(preprocess)

# 2. Tokenize the text
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(df_train['processed_text'])

total_words = len(tokenizer.word_index) + 1
max_len = 256

sequences = tokenizer.texts_to_sequences(df_train['processed_text'])
text_padded = pad_sequences(sequences, maxlen=max_len, padding='post')

In [82]:
X_train, X_test, y_train, y_test = train_test_split(text_padded, target, test_size=0.2, random_state=42)

In [83]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=total_words, output_dim=128, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')  # For binary classification; use 'softmax' for multi-class
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])



In [84]:
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 248ms/step - accuracy: 0.6468 - loss: 0.6058 - val_accuracy: 0.8017 - val_loss: 0.4391
Epoch 2/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 243ms/step - accuracy: 0.8966 - loss: 0.2720 - val_accuracy: 0.7853 - val_loss: 0.5056
Epoch 3/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 245ms/step - accuracy: 0.9598 - loss: 0.1307 - val_accuracy: 0.7787 - val_loss: 0.5611
Epoch 4/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 245ms/step - accuracy: 0.9737 - loss: 0.0907 - val_accuracy: 0.7735 - val_loss: 0.6263
Epoch 5/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 244ms/step - accuracy: 0.9797 - loss: 0.0617 - val_accuracy: 0.7682 - val_loss: 0.7286
Epoch 6/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 246ms/step - accuracy: 0.9774 - loss: 0.0507 - val_accuracy: 0.7827 - val_loss: 0.8129
Epoch 7/20

In [86]:
df_test['processed_text'] = df_test['text'].apply(preprocess)

test_sequences = tokenizer.texts_to_sequences(df_test['processed_text'])
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

predictions = model.predict(test_padded)

predicted_classes = (predictions > 0.5).astype(int)

df_test['target'] = predicted_classes

df_test[['id', 'target']].to_csv('submission.csv', index=False)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 76ms/step
