<a href="https://colab.research.google.com/github/saikirankesoju/NLP/blob/main/NLP_12-09-2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Conv1D, MaxPooling1D, Flatten, LSTM, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords

nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

df = pd.read_csv("/content/train.csv")
print(df.head())

texts = df["text"].values
labels = df["target"].values

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in STOPWORDS]
    return " ".join(tokens)

df["clean_text"] = df["text"].apply(clean_text)

X_train, X_val, y_train, y_val = train_test_split(df["clean_text"], labels, test_size=0.2, random_state=42)

count_vec = CountVectorizer()
X_train_count = count_vec.fit_transform(X_train)
X_val_count = count_vec.transform(X_val)

tfidf_vec = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vec.fit_transform(X_train)
X_val_tfidf = tfidf_vec.transform(X_val)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, y_train)
y_pred_lr = lr.predict(X_val_tfidf)

svm = LinearSVC()
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_val_tfidf)

print("\n--- Logistic Regression (TF-IDF) ---")
print(classification_report(y_val, y_pred_lr))

print("\n--- SVM (TF-IDF) ---")
print(classification_report(y_val, y_pred_svm))

tokenizer = Tokenizer(num_words=10000, oov_token="")
tokenizer.fit_on_texts(df["clean_text"])
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

max_len = 30
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="post")
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding="post")

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

mlp = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    GlobalAveragePooling1D(),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])
mlp.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
mlp.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_val_pad, y_val))

y_pred_mlp = (mlp.predict(X_val_pad) > 0.5).astype("int32")

print("\n--- MLP (Embeddings) ---")
print(classification_report(y_val, y_pred_mlp))

cnn = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    Conv1D(128, 5, activation="relu"),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])
cnn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
cnn.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_val_pad, y_val))

y_pred_cnn = (cnn.predict(X_val_pad) > 0.5).astype("int32")

print("\n--- CNN (Embeddings) ---")
print(classification_report(y_val, y_pred_cnn))

lstm = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    LSTM(128, dropout=0.3, recurrent_dropout=0.3),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid")
])
lstm.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
lstm.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_val_pad, y_val))

y_pred_lstm = (lstm.predict(X_val_pad) > 0.5).astype("int32")

print("\n--- LSTM (Embeddings) ---")
print(classification_report(y_val, y_pred_lstm))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  

--- Logistic Regression (TF-IDF) ---
              precision    recall  f1-score   support

           0       0.79      0.89      0.84       874
           1       0.82      0.68      0.74       649

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523


--- SVM (TF-IDF) ---
              precision    recall  f1-score   support

           0       0.79



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.5695 - loss: 0.6790 - val_accuracy: 0.7229 - val_loss: 0.6233
Epoch 2/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.7142 - loss: 0.5789 - val_accuracy: 0.7420 - val_loss: 0.5088
Epoch 3/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.8322 - loss: 0.3942 - val_accuracy: 0.8011 - val_loss: 0.4466
Epoch 4/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8755 - loss: 0.3037 - val_accuracy: 0.7984 - val_loss: 0.4893
Epoch 5/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9050 - loss: 0.2446 - val_accuracy: 0.7919 - val_loss: 0.4968
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

--- MLP (Embeddings) ---
              precision    recall  f1-score   support

           0       0.79      0.87      0.83   



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step - accuracy: 0.6193 - loss: 0.6413 - val_accuracy: 0.7886 - val_loss: 0.4570
Epoch 2/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.8791 - loss: 0.3279 - val_accuracy: 0.7859 - val_loss: 0.4940
Epoch 3/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step - accuracy: 0.9399 - loss: 0.1665 - val_accuracy: 0.7531 - val_loss: 0.5857
Epoch 4/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.9660 - loss: 0.0940 - val_accuracy: 0.7603 - val_loss: 0.7736
Epoch 5/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - accuracy: 0.9791 - loss: 0.0542 - val_accuracy: 0.7551 - val_loss: 0.9325
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step

--- CNN (Embeddings) ---
              precision    recall  f1-score   support

           0       0.77      0.81      0.79   



[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 106ms/step - accuracy: 0.5830 - loss: 0.6715 - val_accuracy: 0.7768 - val_loss: 0.5095
Epoch 2/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 100ms/step - accuracy: 0.8083 - loss: 0.4681 - val_accuracy: 0.7925 - val_loss: 0.5017
Epoch 3/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 101ms/step - accuracy: 0.8507 - loss: 0.4033 - val_accuracy: 0.7886 - val_loss: 0.5422
Epoch 4/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 94ms/step - accuracy: 0.8908 - loss: 0.3245 - val_accuracy: 0.7663 - val_loss: 0.5945
Epoch 5/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 102ms/step - accuracy: 0.9155 - loss: 0.2686 - val_accuracy: 0.5739 - val_loss: 0.6342
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step

--- LSTM (Embeddings) ---
              precision    recall  f1-score   support

           0       0.57      1.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
