<a href="https://colab.research.google.com/github/sheemapatel/nlp--/blob/main/25_9_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Deep Learning Imports
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# --- 1. Dataset and Preprocessing Setup (Re-simulating from previous steps) ---

# Hyper-parameters shared by every deep-learning model defined further down.
MAX_WORDS = 1000     # upper bound on the tokenizer vocabulary
MAX_LEN = 20         # every sequence is padded/truncated to this length
EMBEDDING_DIM = 100  # width of one word vector (must match e.g. GloVe 100d)

# Stand-in for the cleaned, labelled tweets produced by the earlier steps.
# Each entry pairs the cleaned text with its label (1 = disaster, 0 = not),
# which keeps text and label visibly aligned.
_SIMULATED_TWEETS = [
    ('forest fire near la pray affected', 1),
    ('saw new batman movie disaster lol', 0),
    ('massive flood warning issued new york city stay safe', 1),
    ('phone soo slow today might well throw', 0),
    ('breaking news 70 magnitude earthquake rock japan damage widespread', 1),
    ('new album firee cant stop listening', 0),
    ('devastating tornado hit oklahoma emergency response needed now', 1),
    ('check cool website', 0),
    ('emergency crew working hard rescue people collapsed building', 1),
    ('loving weather today everyone', 0),
    ('help missing people after explosion', 1),
    ('bad day traffic jam is worst', 0),
    ('shelter needed urgent', 1),
    ('amazing sunset must see', 0),
]
data_sim = {
    'final_text': [text for text, _ in _SIMULATED_TWEETS],
    'target': [label for _, label in _SIMULATED_TWEETS],
}
df = pd.DataFrame(data_sim)

X = df['final_text']
y = df['target'].values

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class balance the same in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 2. Tokenization and Padding
# The tokenizer is fitted on the training texts only; "<unk>" is the
# placeholder token configured for out-of-vocabulary words.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<unk>")
tokenizer.fit_on_texts(X_train)

# Turn each split into lists of integer word ids.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Bring every sequence to exactly MAX_LEN ids, padding and truncating at the
# end of the sequence ('post').
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')
    for sequences in (X_train_seq, X_test_seq)
)


# --- 3. Pre-trained Word2Vec/GloVe Embedding Layer Preparation ---

# Load Pre-trained Embeddings (SIMULATION ONLY)
# In a real environment, you would load the entire GloVe/Word2Vec file here.
# For example:
# embeddings_index = {}
# with open('glove.6B.100d.txt', encoding='utf-8') as f:
#     for line in f:
#         values = line.split()
#         word = values[0]
#         coefs = np.asarray(values[1:], dtype='float32')
#         embeddings_index[word] = coefs

# SIMULATE a small embedding index: one random vector per listed word.
# A dedicated, fixed-seed generator is used so the stand-in vectors (and the
# embedding matrix built from them) are identical on every run; the unseeded
# global RNG made each run start from different "pre-trained" weights.
_SIMULATED_EMBEDDING_WORDS = (
    'fire', 'flood', 'earthquake', 'safe', 'lol', 'movie',
    'new', 'needed', 'rescue', 'explosion', 'shelter', 'bad',
)
_embedding_rng = np.random.default_rng(42)
embeddings_index = {
    word: _embedding_rng.random(EMBEDDING_DIM)
    for word in _SIMULATED_EMBEDDING_WORDS
}

word_index = tokenizer.word_index
# One extra row on top of the vocabulary, capped at MAX_WORDS.
# NOTE(review): presumably the extra row is id 0, which pad_sequences uses as
# its padding value -- confirm that word_index ids start at 1.
VOCAB_SIZE = min(MAX_WORDS, len(word_index) + 1)

# Create the embedding matrix: row i holds the vector of the word with id i.
# Words without a simulated/pre-trained vector keep an all-zero row.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= VOCAB_SIZE:
        # Id falls outside the matrix (vocabulary was capped); skip it.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


# --- 4. Deep Learning Model Architectures ---

def create_embedding_layer():
    """Return a frozen Embedding layer initialised from ``embedding_matrix``.

    The layer maps word ids in ``[0, VOCAB_SIZE)`` to ``EMBEDDING_DIM``-wide
    vectors. A fresh layer is created on every call so each model owns its
    own copy.

    The weights are supplied through ``embeddings_initializer=Constant(...)``
    instead of the legacy ``weights=[...]`` keyword, and the deprecated
    ``input_length`` argument is omitted: the sequence length is taken from
    the data the model is fitted on.

    Returns:
        An ``Embedding`` layer with ``trainable=False``.
    """
    # Function-scope import keeps this helper self-contained.
    from tensorflow.keras.initializers import Constant

    return Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=EMBEDDING_DIM,
        embeddings_initializer=Constant(embedding_matrix),
        trainable=False,  # Crucial: keep the pre-trained weights fixed
    )

def build_lstm_model():
    """Build and compile the single-direction LSTM binary classifier.

    Architecture: embedding layer -> LSTM(64) -> one sigmoid unit.
    Compiled with Adam, binary cross-entropy loss and accuracy as metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    lstm_classifier = Sequential([
        create_embedding_layer(),
        LSTM(64),
        Dense(1, activation='sigmoid'),
    ])
    lstm_classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return lstm_classifier

def build_cnn_model():
    """Build and compile the 1-D convolutional binary classifier.

    Architecture: embedding layer -> Conv1D(128 filters, kernel size 5, ReLU)
    -> global max pooling -> Dense(128, ReLU) -> Dropout(0.5) -> one sigmoid
    unit. Compiled with Adam, binary cross-entropy loss and accuracy as
    metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    cnn_classifier = Sequential([
        create_embedding_layer(),
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ])
    cnn_classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return cnn_classifier

def build_bi_lstm_model():
    """Build and compile the bidirectional LSTM binary classifier.

    Architecture: embedding layer -> Bidirectional(LSTM(64)) -> one sigmoid
    unit. Compiled with Adam, binary cross-entropy loss and accuracy as
    metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    bi_lstm_classifier = Sequential([
        create_embedding_layer(),
        Bidirectional(LSTM(64)),
        Dense(1, activation='sigmoid'),
    ])
    bi_lstm_classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return bi_lstm_classifier

# --- 5. Training and Evaluation ---

print("## Deep Learning Model Training and Evaluation 🧠")

# Seed Python, NumPy and TensorFlow before any model is created so weight
# initialisation, dropout and batch shuffling repeat from run to run. With a
# test set this small a single prediction flips the metrics, so unseeded runs
# are not comparable. (Some GPU ops may still be nondeterministic.)
from tensorflow.keras.utils import set_random_seed

set_random_seed(42)

models = {
    "LSTM": build_lstm_model(),
    "CNN": build_cnn_model(),
    "Bi-LSTM": build_bi_lstm_model()
}

dl_results = {}    # model name -> {'Accuracy', 'F1-Score', 'Predictions'}
history_dict = {}  # model name -> History object returned by fit()

for name, model in models.items():
    print(f"\n--- Training **{name}** Model ---")
    # Training with a few epochs due to small simulated dataset
    history = model.fit(
        X_train_padded, y_train,
        epochs=10,
        batch_size=8,
        validation_split=0.1,
        verbose=0
    )
    history_dict[name] = history

    # Evaluation: threshold the sigmoid outputs at 0.5 and flatten the
    # (n, 1) column to 1-D so the predictions have the same shape as y_test.
    y_pred_proba = model.predict(X_test_padded)
    y_pred = (y_pred_proba > 0.5).astype(int).ravel()

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0: report F1 = 0 instead of warning when a model predicts
    # no positives at all.
    f1 = f1_score(y_test, y_pred, zero_division=0)

    dl_results[name] = {'Accuracy': accuracy, 'F1-Score': f1, 'Predictions': y_pred}

    print(f"**{name} Performance on Test Set:**")
    print(f"Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}")

# --- 6. Comparison with Traditional ML ---

# Assuming Logistic Regression (LR) from previous step achieved these hypothetical scores.
# NOTE: these two numbers are placeholders, not measurements made in this
# notebook; replace them with the real LR results before drawing conclusions.
lr_accuracy = 0.70
lr_f1 = 0.65

print("\n## Performance Comparison 📈")

# One row per model: the LR baseline first, then every deep-learning model in
# the order it was trained.
comparison_data = [
    {'Model': 'Logistic Regression (ML)', 'Accuracy': lr_accuracy, 'F1-Score': lr_f1}
]
comparison_data.extend(
    {'Model': model_name, 'Accuracy': scores['Accuracy'], 'F1-Score': scores['F1-Score']}
    for model_name, scores in dl_results.items()
)

comparison_df = pd.DataFrame(comparison_data)

print("\n### Deep Learning vs. Traditional ML Metrics ###")
# DataFrame.to_markdown relies on the optional `tabulate` package.
print(comparison_df.round(4).to_markdown(index=False))


# --- 7. Error Analysis (Using the best DL model's predictions) ---

# Find the best DL model based on F1-Score (on a tie, the model trained
# first wins because max() keeps the first maximum it sees).
best_dl_model_name = max(dl_results, key=lambda k: dl_results[k]['F1-Score'])
best_dl_predictions = dl_results[best_dl_model_name]['Predictions']

print(f"\n## Error Analysis for Best DL Model: **{best_dl_model_name}** 🧐")

# Create a DataFrame of the test set results for easy analysis.
# The index of X_test is reset so the texts line up positionally with the
# label arrays, which carry no index of their own.
test_df = pd.DataFrame({
    'Tweet': X_test.reset_index(drop=True),
    'True Label': y_test,
    'Predicted Label': best_dl_predictions.flatten()
})

report_columns = ['Tweet', 'True Label', 'Predicted Label']
true_label = test_df['True Label']
predicted_label = test_df['Predicted Label']

# Misclassified Positive Tweets (False Negatives: True=1, Predicted=0)
fn_tweets = test_df[(true_label == 1) & (predicted_label == 0)].head(5)
print("\n### Misclassified Disaster Tweets (False Negatives) ###")
print(fn_tweets[report_columns].to_markdown(index=False))

# Misclassified Negative Tweets (False Positives: True=0, Predicted=1)
fp_tweets = test_df[(true_label == 0) & (predicted_label == 1)].head(5)
print("\n### Misclassified Irrelevant Tweets (False Positives) ###")
print(fp_tweets[report_columns].to_markdown(index=False))


# --- 8. Conclusion ---

print("\n## Conclusion on Model Performance ✅")

# Row of the comparison table with the highest F1-Score (first one on a tie).
best_row = comparison_df.loc[comparison_df['F1-Score'].idxmax()]
best_overall_score = best_row['F1-Score']
best_overall_model = best_row['Model']

print(f"\nBased on the F1-Score, the **{best_overall_model}** model achieved the highest performance ({best_overall_score:.4f}).")

# A winner is a deep-learning model exactly when it is one of the models
# trained above; checking membership in dl_results stays correct if models
# with other names are added, unlike matching on 'LSTM'/'CNN' substrings.
if best_overall_model in dl_results:
    print("In real-world disaster classification with a large dataset, Deep Learning models (LSTM/CNN) using **pre-trained Word2Vec/GloVe embeddings** typically provide significant improvements over traditional ML models.")
    print("They achieve this by:")
    print("* **Capturing Context:** Recurrent layers (LSTM/Bi-LSTM) understand word order and context in a sequence.")
    print("* **Semantic Understanding:** Pre-trained embeddings provide semantic meaning (e.g., 'quake' is close to 'earthquake'), which traditional TF-IDF often misses.")
    print("However, on small, heavily-cleaned datasets (like the one simulated), the improvement may not be drastic, as the simple feature set might not benefit fully from the complexity of a neural network.")
else:
    print("The high performance of the Traditional ML model (Logistic Regression) in this comparison suggests that for the small feature set provided, the linear separability achieved by TF-IDF is highly effective.")

## Deep Learning Model Training and Evaluation 🧠

--- Training **LSTM** Model ---




[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 189ms/step
**LSTM Performance on Test Set:**
Accuracy: 0.3333, F1-Score: 0.5000

--- Training **CNN** Model ---




[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 168ms/step
**CNN Performance on Test Set:**
Accuracy: 1.0000, F1-Score: 1.0000

--- Training **Bi-LSTM** Model ---




[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 353ms/step
**Bi-LSTM Performance on Test Set:**
Accuracy: 1.0000, F1-Score: 1.0000

## Performance Comparison 📈

### Deep Learning vs. Traditional ML Metrics ###
| Model                    |   Accuracy |   F1-Score |
|:-------------------------|-----------:|-----------:|
| Logistic Regression (ML) |     0.7    |       0.65 |
| LSTM                     |     0.3333 |       0.5  |
| CNN                      |     1      |       1    |
| Bi-LSTM                  |     1      |       1    |

## Error Analysis for Best DL Model: **CNN** 🧐

### Misclassified Disaster Tweets (False Negatives) ###
| Tweet   | True Label   | Predicted Label   |
|---------|--------------|-------------------|

### Misclassified Irrelevant Tweets (False Positives) ###
| Tweet   | True Label   | Predicted Label   |
|---------|--------------|-------------------|

## Conclusion on Model Performance ✅

Based o