In [38]:
import pandas as pd
import numpy as np

In [39]:
df = pd.read_csv('news.csv')

In [40]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [42]:
df.shape

(6335, 4)

In [43]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,0
text,0
label,0


In [44]:
df['label'].unique()

array(['FAKE', 'REAL'], dtype=object)

In [45]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
REAL,3171
FAKE,3164


### Data Preprocessing

In [46]:
# prompt: apply label encoding on sentiment colunm

from sklearn.preprocessing import LabelEncoder
import pickle

le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])
df['label'].value_counts()



with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)



In [47]:
df['title'] = df['title'].str.lower()
df['text'] = df['text'].str.lower()

In [48]:

from sklearn.model_selection import train_test_split

X = df[['title','text']]
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [49]:

from sklearn.preprocessing import LabelEncoder
import re
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Removing special characters, URLs, HTML tags, and extra spaces
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    text = re.sub(r'(http|https|ftp)://[a-zA-Z0-9./]+', '', text)
    text = BeautifulSoup(text, 'lxml').get_text()
    text = " ".join(text.split())
    return text

# Apply preprocessing to both 'text' and 'title' columns
for col in ['text', 'title']:
    X_train[col] = X_train[col].apply(preprocess_text)
    X_test[col] = X_test[col].apply(preprocess_text)

# Remove stopwords from both 'text' and 'title' columns
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])

for col in ['text', 'title']:
    X_train[col] = X_train[col].apply(remove_stopwords)
    X_test[col] = X_test[col].apply(remove_stopwords)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


X_train shape: (5068, 2)
y_train shape: (5068,)
X_test shape: (1267, 2)
y_test shape: (1267,)


In [50]:
# prompt: do tokenization and apply lemmitization

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle
from sklearn.model_selection import train_test_split
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')


# Tokenization and Lemmatization
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmas)

for col in ['text', 'title']:
    X_train[col] = X_train[col].apply(tokenize_and_lemmatize)
    X_test[col] = X_test[col].apply(tokenize_and_lemmatize)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


X_train shape: (5068, 2)
y_train shape: (5068,)
X_test shape: (1267, 2)
y_test shape: (1267,)


In [51]:
# prompt: apply tfidf according to i=our file

from sklearn.feature_extraction.text import TfidfVectorizer

# Combine 'title' and 'text' columns for TF-IDF
X_train['combined'] = X_train['title'] + ' ' + X_train['text']
X_test['combined'] = X_test['title'] + ' ' + X_test['text']

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000) # Adjust max_features as needed

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['combined'])

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(X_test['combined'])

print(f"X_train_tfidf shape: {X_train_tfidf.shape}")
print(f"X_test_tfidf shape: {X_test_tfidf.shape}")


# Save vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)


X_train_tfidf shape: (5068, 5000)
X_test_tfidf shape: (1267, 5000)


In [52]:
# prompt: install keras

!pip install keras




In [53]:
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# models = {
#     'Naive Bayes': MultinomialNB(),
#     'Random Forest': RandomForestClassifier(),
# }

# for model_name, model in models.items():
#     model.fit(X_train_tfidf, y_train)
#     y_train_pred = model.predict(X_train_tfidf)
#     y_test_pred = model.predict(X_test_tfidf)

#     print(model_name)
#     print('Training Set Performance')
#     print('accuracy_score {:.4f}'.format(accuracy_score(y_train, y_train_pred)))
#     print('f1_score {:.4f}'.format(f1_score(y_train, y_train_pred, average='weighted')))
#     print('precision_score {:.4f}'.format(precision_score(y_train, y_train_pred, average='weighted')))
#     print('recall_score {:.4f}'.format(recall_score(y_train, y_train_pred, average='weighted')))
#     print('roc_auc_score {:.4f}'.format(roc_auc_score(y_train, y_train_pred, average='weighted')))
#     print('--------------------------')
#     print('Test Set Performance')
#     print('accuracy_score {:.4f}'.format(accuracy_score(y_test, y_test_pred)))
#     print('f1_score {:.4f}'.format(f1_score(y_test, y_test_pred, average='weighted')))
#     print('precision_score {:.4f}'.format(precision_score(y_test, y_test_pred, average='weighted')))
#     print('recall_score {:.4f}'.format(recall_score(y_test, y_test_pred, average='weighted')))
#     print('roc_auc_score {:.4f}'.format(roc_auc_score(y_test, y_test_pred, average='weighted')))
#     print('=' * 35)
#     print('\n')

#     filename = f'{model_name.lower().replace(" ", "_")}_model.pkl'
#     with open(filename, 'wb') as f:
#         pickle.dump(model, f)

# # what is LSTM
# # LSTM Model
# # Tokenize the text data
# tokenizer = Tokenizer(num_words=5000) # Adjust num_words as needed
# tokenizer.fit_on_texts(X_train['combined'])

# X_train_seq = tokenizer.texts_to_sequences(X_train['combined'])
# X_test_seq = tokenizer.texts_to_sequences(X_test['combined'])

# max_length = 200  # Adjust max_length as needed
# X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
# X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

# # Define the LSTM model
# model = Sequential()
# model.add(Embedding(5000, 128, input_length=max_length))
# model.add(LSTM(64))
# model.add(Dense(1, activation='sigmoid')) # Assuming binary classification

# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.fit(X_train_pad, y_train, epochs=5, batch_size=64) # adjust epochs and batch_size

# y_train_pred_lstm = (model.predict(X_train_pad) > 0.5).astype(int)
# y_test_pred_lstm = (model.predict(X_test_pad) > 0.5).astype(int)


# # Evaluate LSTM model
# print('LSTM')
# # ... (similar evaluation code for LSTM using y_train_pred_lstm and y_test_pred_lstm)
# # save the tokenizer
# with open('tokenizer.pkl', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# model.save('lstm_model.h5')


In [54]:
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize variables to track the best model and its performance
best_model = None
best_f1_score = 0
best_model_name = ""

# Traditional models (Naive Bayes, Random Forest)
models = {
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(),
}

# Loop through the models for training and evaluation
for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_train_pred = model.predict(X_train_tfidf)
    y_test_pred = model.predict(X_test_tfidf)

    print(model_name)
    print('Training Set Performance')
    print('accuracy_score {:.4f}'.format(accuracy_score(y_train, y_train_pred)))
    print('f1_score {:.4f}'.format(f1_score(y_train, y_train_pred, average='weighted')))
    print('precision_score {:.4f}'.format(precision_score(y_train, y_train_pred, average='weighted')))
    print('recall_score {:.4f}'.format(recall_score(y_train, y_train_pred, average='weighted')))
    print('roc_auc_score {:.4f}'.format(roc_auc_score(y_train, y_train_pred, average='weighted')))
    print('--------------------------')
    print('Test Set Performance')
    print('accuracy_score {:.4f}'.format(accuracy_score(y_test, y_test_pred)))
    print('f1_score {:.4f}'.format(f1_score(y_test, y_test_pred, average='weighted')))
    print('precision_score {:.4f}'.format(precision_score(y_test, y_test_pred, average='weighted')))
    print('recall_score {:.4f}'.format(recall_score(y_test, y_test_pred, average='weighted')))
    print('roc_auc_score {:.4f}'.format(roc_auc_score(y_test, y_test_pred, average='weighted')))
    print('=' * 35)
    print('\n')

    # Check if this model has the best F1 score so far
    model_f1_score = f1_score(y_test, y_test_pred, average='weighted')
    if model_f1_score > best_f1_score:
        best_f1_score = model_f1_score
        best_model = model
        best_model_name = model_name

    # Save each model
    filename = f'{model_name.lower().replace(" ", "_")}_model.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(model, f)

# LSTM Model
# Tokenize the text data for LSTM model
tokenizer = Tokenizer(num_words=5000) # Adjust num_words as needed
tokenizer.fit_on_texts(X_train['combined'])

X_train_seq = tokenizer.texts_to_sequences(X_train['combined'])
X_test_seq = tokenizer.texts_to_sequences(X_test['combined'])

max_length = 200  # Adjust max_length as needed
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

# Define the LSTM model without 'input_length'
lstm_model = Sequential()
lstm_model.add(Embedding(5000, 128))  # Removed input_length
lstm_model.add(LSTM(64))
lstm_model.add(Dense(1, activation='sigmoid'))  # Assuming binary classification

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_pad, y_train, epochs=5, batch_size=64)  # adjust epochs and batch_size

# Predict using LSTM
y_train_pred_lstm = (lstm_model.predict(X_train_pad) > 0.5).astype(int)
y_test_pred_lstm = (lstm_model.predict(X_test_pad) > 0.5).astype(int)

# Evaluate LSTM model
print('LSTM')
print('Training Set Performance')
print('accuracy_score {:.4f}'.format(accuracy_score(y_train, y_train_pred_lstm)))
print('f1_score {:.4f}'.format(f1_score(y_train, y_train_pred_lstm, average='weighted')))
print('precision_score {:.4f}'.format(precision_score(y_train, y_train_pred_lstm, average='weighted')))
print('recall_score {:.4f}'.format(recall_score(y_train, y_train_pred_lstm, average='weighted')))
print('roc_auc_score {:.4f}'.format(roc_auc_score(y_train, y_train_pred_lstm, average='weighted')))
print('--------------------------')
print('Test Set Performance')
print('accuracy_score {:.4f}'.format(accuracy_score(y_test, y_test_pred_lstm)))
print('f1_score {:.4f}'.format(f1_score(y_test, y_test_pred_lstm, average='weighted')))
print('precision_score {:.4f}'.format(precision_score(y_test, y_test_pred_lstm, average='weighted')))
print('recall_score {:.4f}'.format(recall_score(y_test, y_test_pred_lstm, average='weighted')))
print('roc_auc_score {:.4f}'.format(roc_auc_score(y_test, y_test_pred_lstm, average='weighted')))
print('=' * 35)
print('\n')

# Save the tokenizer and LSTM model
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

lstm_model.save('lstm_model.h5')

# Update best model if LSTM has higher F1 score
lstm_f1_score = f1_score(y_test, y_test_pred_lstm, average='weighted')
if lstm_f1_score > best_f1_score:
    best_f1_score = lstm_f1_score
    best_model = lstm_model
    best_model_name = 'LSTM'

# Save the best model
if best_model_name != 'LSTM':
    with open(f'{best_model_name.lower().replace(" ", "_")}_best_model.pkl', 'wb') as f:
        pickle.dump(best_model, f)
else:
    best_model.save('best_lstm_model.h5')

print(f"Best model is {best_model_name} with F1 score: {best_f1_score}")


Naive Bayes
Training Set Performance
accuracy_score 0.9033
f1_score 0.9033
precision_score 0.9035
recall_score 0.9033
roc_auc_score 0.9033
--------------------------
Test Set Performance
accuracy_score 0.8895
f1_score 0.8895
precision_score 0.8897
recall_score 0.8895
roc_auc_score 0.8896


Random Forest
Training Set Performance
accuracy_score 1.0000
f1_score 1.0000
precision_score 1.0000
recall_score 1.0000
roc_auc_score 1.0000
--------------------------
Test Set Performance
accuracy_score 0.9219
f1_score 0.9219
precision_score 0.9219
recall_score 0.9219
roc_auc_score 0.9218


Epoch 1/5
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 143ms/step - accuracy: 0.6965 - loss: 0.5940
Epoch 2/5
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 144ms/step - accuracy: 0.9087 - loss: 0.2416
Epoch 3/5
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 144ms/step - accuracy: 0.9520 - loss: 0.1562
Epoch 4/5
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3



LSTM
Training Set Performance
accuracy_score 0.9935
f1_score 0.9935
precision_score 0.9935
recall_score 0.9935
roc_auc_score 0.9935
--------------------------
Test Set Performance
accuracy_score 0.8966
f1_score 0.8965
precision_score 0.8978
recall_score 0.8966
roc_auc_score 0.8964


Best model is Random Forest with F1 score: 0.9218587731034075


In [55]:
# import streamlit as st
# import pickle
# import re
# from bs4 import BeautifulSoup
# from nltk.corpus import stopwords
# import nltk

# # Download stopwords
# nltk.download('stopwords')

# # Load resources
# with open("logistic_regression_model.pkl", "rb") as model_file:
#     model = pickle.load(model_file)

# with open("tfidf_vectorizer.pkl", "rb") as vec_file:
#     vectorizer = pickle.load(vec_file)

# with open("label_encoder.pkl", "rb") as le_file:
#     label_encoder = pickle.load(le_file)

# # Preprocessing function
# def preprocess(text):
#     text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
#     text = re.sub(r'(http|https|ftp)://[a-zA-Z0-9./]+', '', text)
#     text = BeautifulSoup(text, 'lxml').get_text()
#     text = " ".join(text.split())
#     text = " ".join([word for word in text.lower().split() if word not in stopwords.words('english')])
#     return text

# # App UI config
# st.set_page_config(page_title="🎬 Movie Sentiment Analyzer", layout="centered")

# st.markdown(
#     """
#     <style>
#     .main {
#         background-color: #f9f9f9;
#         font-family: 'Segoe UI', sans-serif;
#     }
#     .title {
#         color: #1f77b4;
#         text-align: center;
#     }
#     .footer {
#         text-align: center;
#         font-size: 12px;
#         color: #888;
#         margin-top: 50px;
#     }
#     </style>
#     """,
#     unsafe_allow_html=True
# )

# # Title
# st.markdown("<h1 class='title'>🎥 Movie Review Sentiment Analyzer</h1>", unsafe_allow_html=True)
# st.write("Write a review for your favorite movie and check if the sentiment is **Positive** or **Negative**!")

# # Movie selection
# movies = [
#     "Inception", "Titanic", "Interstellar", "The Godfather", "The Dark Knight",
#     "Forrest Gump", "The Shawshank Redemption", "Fight Club", "Avengers: Endgame", "Joker"
# ]
# selected_movie = st.selectbox("🎬 Select a Movie", movies)

# # Review input
# user_review = st.text_area(f"📝 Write your review for *{selected_movie}*", height=200)

# # Predict button
# if st.button("🔍 Analyze Sentiment"):
#     if user_review.strip() == "":
#         st.warning("🚨 Please enter a review before analyzing.")
#     else:
#         cleaned_review = preprocess(user_review)
#         vectorized_review = vectorizer.transform([cleaned_review])
#         prediction_encoded = model.predict(vectorized_review)[0]
#         prediction_label = label_encoder.inverse_transform([prediction_encoded])[0] if hasattr(label_encoder, "inverse_transform") else prediction_encoded

#         if prediction_label == 'pos':
#             st.success("✅ Positive Sentiment! You seem to have liked the movie. 🎉")
#         else:
#             st.error("❌ Negative Sentiment! You didn’t enjoy the movie much. 😢")

# # Footer
# st.markdown("<div class='footer'>Made with ❤️ using Streamlit</div>", unsafe_allow_html=True)


In [56]:
# This code is a complete machine learning workflow that compares traditional ML models (`Naive Bayes`, `Random Forest`) with a deep learning model (`LSTM`) for **text classification** (probably sentiment analysis or similar task). Here's a **clear step-by-step explanation**:

# ---

# ## 🔶 1. **Importing Required Libraries**

# ```python
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# ```

# * **scikit-learn**: For traditional ML models and evaluation metrics.
# * **Keras**: For defining and training the LSTM deep learning model.
# * **Tokenizer & Pad Sequences**: Converts raw text into numerical format for LSTM.

# ---

# ## 🔶 2. **Traditional Models: Naive Bayes & Random Forest**

# ```python
# models = {
#     'Naive Bayes': MultinomialNB(),
#     'Random Forest': RandomForestClassifier(),
# }
# ```

# Defines two models to be trained on TF-IDF vectorized text data.

# ---

# ### 🔹 Loop Over Each Model

# ```python
# for model_name, model in models.items():
#     model.fit(X_train_tfidf, y_train)
# ```

# * Fits each model on **TF-IDF transformed training data**.
# * Predicts on both training and test sets.

# ### 🔹 Evaluate Each Model

# ```python
#     print('accuracy_score', ...)
#     print('f1_score', ...)
# ```

# * Evaluates using **accuracy, precision, recall, F1-score, ROC AUC** (weighted to handle class imbalance).
# * Results are printed separately for **training** and **test** sets.

# ### 🔹 Save Each Model

# ```python
#     filename = f'{model_name.lower().replace(" ", "_")}_model.pkl'
#     with open(filename, 'wb') as f:
#         pickle.dump(model, f)
# ```

# * Saves each trained model as a `.pkl` file for later use or deployment.

# ---

# ## 🔶 3. **LSTM Deep Learning Model**

# ### 🔹 Tokenize Text Data

# ```python
# tokenizer = Tokenizer(num_words=5000)
# tokenizer.fit_on_texts(X_train['combined'])
# ```

# * Converts words into integers.
# * `X_train['combined']` is a column where text is already preprocessed and combined (e.g., `title + review`).

# ### 🔹 Convert Texts to Sequences and Pad

# ```python
# X_train_seq = tokenizer.texts_to_sequences(X_train['combined'])
# X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
# ```

# * Converts tokenized texts to equal-length padded sequences for LSTM input.

# ---

# ### 🔹 Define & Train LSTM Model

# ```python
# model = Sequential()
# model.add(Embedding(5000, 128, input_length=max_length))
# model.add(LSTM(64))
# model.add(Dense(1, activation='sigmoid'))
# ```

# * `Embedding`: Converts token IDs into dense vectors.
# * `LSTM`: Learns sequential patterns.
# * `Dense`: Final layer for **binary classification** using sigmoid.

# ### 🔹 Compile and Fit

# ```python
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.fit(X_train_pad, y_train, epochs=5, batch_size=64)
# ```

# * Trains the model for 5 epochs.

# ---

# ### 🔹 Predict and Evaluate

# ```python
# y_train_pred_lstm = (model.predict(X_train_pad) > 0.5).astype(int)
# ```

# * Predictions are probabilities → convert to class `0` or `1`.
# * You can then evaluate using the same metrics (not shown here).

# ---

# ### 🔹 Save LSTM Artifacts

# ```python
# with open('tokenizer.pkl', 'wb') as handle:
#     pickle.dump(tokenizer, handle)

# model.save('lstm_model.h5')
# ```

# * Saves the tokenizer and the LSTM model (`.h5`) for deployment in Streamlit or Flask.

# ---

# ## ✅ Summary

# | Component                      | Purpose                                    |
# | ------------------------------ | ------------------------------------------ |
# | `Naive Bayes`, `Random Forest` | Compare baseline ML classifiers            |
# | `Tokenizer`, `LSTM`            | Build an advanced deep learning classifier |
# | `TF-IDF` + `.pkl`              | Used with traditional models               |
# | `Tokenizer` + `.h5`            | Used with LSTM                             |
# | Evaluation Metrics             | To compare and track model performance     |
# | Pickling                       | Makes models deployable                    |

# ---

# Would you like me to also add the missing **evaluation block for the LSTM part** or prepare this for **Streamlit deployment**?
