<a href="https://colab.research.google.com/github/sheemapatel/nlp--/blob/main/12_9_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Conv1D, MaxPooling1D, LSTM, Dropout, Flatten
from tensorflow.keras.utils import to_categorical

nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words from the NLTK corpus, kept in a set because clean_text
# tests every token for membership in it.
stop_words = {word for word in stopwords.words('english')}
# Noise patterns, compiled once and applied in this order:
# @mentions, #hashtags, URLs, then digit runs.
_STRIP_PATTERNS = (
    re.compile(r'@[\w]+'),
    re.compile(r'#\w+'),
    re.compile(r'http\S+|www\S+'),
    re.compile(r'\d+'),
)
# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text, stopword_set=None):
    """Normalise a raw tweet into a space-separated string of tokens.

    The text is lower-cased; @mentions, #hashtags, URLs, digits and ASCII
    punctuation are removed; the remainder is split on whitespace and any
    token found in the stop-word set is dropped.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example the
            NaN float pandas produces for an empty CSV cell, or ``None``)
            is treated as an empty tweet instead of raising.
        stopword_set: Optional collection of tokens to drop. When omitted,
            the module-level ``stop_words`` set is used.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when nothing
        survives cleaning.
    """
    if not isinstance(text, str):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    text = text.lower()
    for pattern in _STRIP_PATTERNS:
        text = pattern.sub('', text)
    text = text.translate(_PUNCT_TABLE)
    return " ".join(w for w in text.split() if w not in stopword_set)

# Load your data here
df = pd.read_csv('tweets.csv')  # Replace this with the uploaded filename

# Empty cells are read back as NaN floats; turn them into empty strings so
# the cleaning step always receives a str.
df['text_clean'] = df['text'].fillna('').astype(str).apply(clean_text)

X = df['text_clean']
y = df['target']

# Stratify on the label so the train and test splits keep the same class
# ratio; the recorded run shows the classes are imbalanced (1878 vs 396 in
# the test split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Two bag-of-words featurisers, each limited to 5000 features. Both are fit
# on the training split only and then applied to the test split.
tfidf = TfidfVectorizer(max_features=5000)
count = CountVectorizer(max_features=5000)

# TF-IDF weighted features
X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)

# Raw term-count features
X_train_count, X_test_count = count.fit_transform(X_train), count.transform(X_test)

# Classical baselines, both trained on the TF-IDF features.
# fit() returns the estimator itself, so construction and fitting are chained.
lr = LogisticRegression().fit(X_train_tfidf, y_train)   # Logistic Regression
y_pred_lr = lr.predict(X_test_tfidf)

svm = SVC().fit(X_train_tfidf, y_train)                 # SVM
y_pred_svm = svm.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the held-out split.
print("Logistic Regression:", classification_report(y_test, y_pred_lr), sep="\n")

print("SVM:", classification_report(y_test, y_pred_svm), sep="\n")
# Upper bound on the vocabulary used when converting text to integer ids.
MAX_WORDS = 10000
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)  # vocabulary is learned from the training split only

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Every sequence is padded (or truncated) to max_len ids; padding zeros are
# appended at the end.
max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# word_index lists every word seen during fitting, but texts_to_sequences
# only emits ids below num_words. Cap the embedding input size accordingly
# so the models do not allocate rows that can never be looked up.
vocab_size = min(MAX_WORDS, len(tokenizer.word_index) + 1)
# MLP over averaged word embeddings:
# embedding -> mean over time steps -> dense -> dropout -> sigmoid.
# `input_length` is no longer passed to Embedding: recent Keras releases
# deprecate it, and the sequence length is inferred from the data at fit time.
model_mlp = Sequential([
    Embedding(vocab_size, 100),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
model_mlp.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_mlp.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2)

# predict() returns an (n, 1) array of probabilities. Threshold at 0.5 and
# flatten to a 1-D array of 0/1 ints so the predictions have the same label
# type and shape as y_test.
y_pred_mlp = (model_mlp.predict(X_test_pad) > 0.5).astype(int).ravel()
print("MLP:")
print(classification_report(y_test, y_pred_mlp, zero_division=0))
# 1-D CNN over word embeddings:
# embedding -> conv (128 filters, width 5) -> max-pool -> dropout -> flatten
# -> dense -> sigmoid.
# `input_length` is no longer passed to Embedding: recent Keras releases
# deprecate it, and the sequence length is inferred from the data at fit time.
model_cnn = Sequential([
    Embedding(vocab_size, 100),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.5),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2)

# predict() returns an (n, 1) array of probabilities. Threshold at 0.5 and
# flatten to a 1-D array of 0/1 ints so the predictions have the same label
# type and shape as y_test.
y_pred_cnn = (model_cnn.predict(X_test_pad) > 0.5).astype(int).ravel()
print("CNN:")
print(classification_report(y_test, y_pred_cnn, zero_division=0))
# LSTM over word embeddings: embedding -> LSTM(64) -> sigmoid.
# The sequences are post-padded with zeros, so without a mask the LSTM's
# final state is computed after reading the trailing padding. mask_zero=True
# makes the LSTM skip the padded steps. (In the recorded run without the
# mask, val_accuracy stayed at 0.8121 for four of the five epochs.)
# `input_length` is no longer passed to Embedding: recent Keras releases
# deprecate it, and the sequence length is inferred from the data at fit time.
model_lstm = Sequential([
    Embedding(vocab_size, 100, mask_zero=True),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2)

# predict() returns an (n, 1) array of probabilities. Threshold at 0.5 and
# flatten to a 1-D array of 0/1 ints so the predictions have the same label
# type and shape as y_test.
y_pred_lstm = (model_lstm.predict(X_test_pad) > 0.5).astype(int).ravel()
print("LSTM:")
print(classification_report(y_test, y_pred_lstm, zero_division=0))
def summarize_model(name, y_true, y_pred, pos_label=1):
    """Build one summary row of metrics for a fitted classifier.

    Args:
        name: Display name stored under the "Model" key.
        y_true: Ground-truth labels.
        y_pred: Predicted labels. Any array-like is accepted; it is flattened
            to 1-D, and boolean predictions (e.g. ``probabilities > 0.5``)
            are converted to 0/1 integers.
        pos_label: Label of the class whose precision, recall and F1 are
            reported. Defaults to ``1``.

    Returns:
        A dict with the keys "Model", "Accuracy", "Precision", "Recall" and
        "F1-Score". The three per-class metrics are ``0.0`` when the positive
        class does not appear in the report at all.
    """
    # Thresholded network outputs arrive as an (n, 1) boolean array; give
    # them the same shape and label type as y_true before scoring.
    y_pred = np.asarray(y_pred).ravel()
    if y_pred.dtype == bool:
        y_pred = y_pred.astype(int)

    # zero_division=0 reports 0 (without warning) for a class that is never
    # predicted, matching the per-model reports printed elsewhere in the file.
    report = classification_report(
        y_true, y_pred, output_dict=True, zero_division=0
    )
    # Report keys are the string form of each label.
    positive = report.get(str(pos_label), {})
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": positive.get('precision', 0.0),
        "Recall": positive.get('recall', 0.0),
        "F1-Score": positive.get('f1-score', 0.0),
    }

# One summary row per model, in the order the models were trained, collected
# into a single comparison table.
results = [
    summarize_model(label, y_test, predictions)
    for label, predictions in (
        ("Logistic Regression (TF-IDF)", y_pred_lr),
        ("SVM (TF-IDF)", y_pred_svm),
        ("MLP (Embeddings)", y_pred_mlp),
        ("CNN (Embeddings)", y_pred_cnn),
        ("LSTM (Embeddings)", y_pred_lstm),
    )
]

results_df = pd.DataFrame(results)
print(results_df)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Logistic Regression:
              precision    recall  f1-score   support

           0       0.89      0.99      0.93      1878
           1       0.87      0.40      0.55       396

    accuracy                           0.89      2274
   macro avg       0.88      0.70      0.74      2274
weighted avg       0.88      0.89      0.87      2274

SVM:
              precision    recall  f1-score   support

           0       0.90      0.99      0.94      1878
           1       0.91      0.45      0.61       396

    accuracy                           0.90      2274
   macro avg       0.90      0.72      0.77      2274
weighted avg       0.90      0.90      0.88      2274

Epoch 1/5




[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 28ms/step - accuracy: 0.7960 - loss: 0.5167 - val_accuracy: 0.8121 - val_loss: 0.4786
Epoch 2/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 27ms/step - accuracy: 0.8101 - loss: 0.4938 - val_accuracy: 0.8121 - val_loss: 0.4761
Epoch 3/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 30ms/step - accuracy: 0.8152 - loss: 0.4784 - val_accuracy: 0.8121 - val_loss: 0.4673
Epoch 4/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 28ms/step - accuracy: 0.8092 - loss: 0.4716 - val_accuracy: 0.8132 - val_loss: 0.4288
Epoch 5/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 26ms/step - accuracy: 0.8252 - loss: 0.4156 - val_accuracy: 0.8253 - val_loss: 0.4012
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
MLP:
              precision    recall  f1-score   support

           0       0.84      1.00      0.91      1878
          



[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 68ms/step - accuracy: 0.8000 - loss: 0.4705 - val_accuracy: 0.8665 - val_loss: 0.3548
Epoch 2/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 68ms/step - accuracy: 0.9231 - loss: 0.2384 - val_accuracy: 0.8786 - val_loss: 0.3592
Epoch 3/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 69ms/step - accuracy: 0.9688 - loss: 0.1483 - val_accuracy: 0.8852 - val_loss: 0.4526
Epoch 4/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 68ms/step - accuracy: 0.9839 - loss: 0.1142 - val_accuracy: 0.8874 - val_loss: 0.5984
Epoch 5/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 68ms/step - accuracy: 0.9862 - loss: 0.0988 - val_accuracy: 0.8698 - val_loss: 0.5560
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step
CNN:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1878
       



[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 150ms/step - accuracy: 0.7985 - loss: 0.5069 - val_accuracy: 0.8121 - val_loss: 0.4833
Epoch 2/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 155ms/step - accuracy: 0.8111 - loss: 0.4861 - val_accuracy: 0.8121 - val_loss: 0.4842
Epoch 3/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 147ms/step - accuracy: 0.8126 - loss: 0.4843 - val_accuracy: 0.8121 - val_loss: 0.4864
Epoch 4/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 151ms/step - accuracy: 0.8052 - loss: 0.4947 - val_accuracy: 0.8121 - val_loss: 0.4843
Epoch 5/5
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 155ms/step - accuracy: 0.8133 - loss: 0.4820 - val_accuracy: 0.8121 - val_loss: 0.4833
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step
LSTM:
              precision    recall  f1-score   support

           0       0.83      1.00      0.90      1878
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
