<a href="https://colab.research.google.com/github/sreehari31580/Nlp-tutorial/blob/main/TDF_IDF_and_Bag_of_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# 1. Import Required Libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [12]:

# 2. Load Dataset
# Read the IMDB reviews CSV into a DataFrame.
csv_path = "/content/IMDB Dataset.csv"  # Change path if needed
text_data = pd.read_csv(csv_path)

# Check the data: print the first five rows as a quick sanity check
print(text_data.head(n=5))

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [13]:

# 3. Extract Features and Labels
# X holds the raw review text; y holds the sentiment encoded as 1 (positive) / 0 (negative).
label_to_id = {'positive': 1, 'negative': 0}

X = text_data['review']
y = text_data['sentiment'].map(label_to_id)  # Convert to binary

# Series.map leaves NaN for any value that is not a key of the dict (missing
# values, different casing, stray whitespace, ...). Fail here with a clear
# message instead of letting NaN labels reach the classifier.
unmapped_mask = y.isna()
if unmapped_mask.any():
    unexpected = text_data.loc[unmapped_mask, 'sentiment'].unique().tolist()
    raise ValueError(
        f"Unexpected sentiment labels (expected {sorted(label_to_id)}): {unexpected}"
    )

# Every label is mapped at this point, so an integer dtype is safe.
y = y.astype('int64')


In [14]:

# 4. Train/Test Split (for both BoW and TF-IDF)
# Hold out 20% of the reviews for testing; the fixed random_state makes the
# split repeatable, and both vectorizers reuse these same raw splits.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train_raw, X_test_raw, y_train, y_test = split_parts


In [15]:

# 5. Bag-of-Words Vectorization
# Count-based features: English stop words removed, vocabulary capped at 5000 terms.
bow_vectorizer = CountVectorizer(
    max_features=5000,
    stop_words='english',
)

# Learn the vocabulary from the training reviews only ...
X_train_bow = bow_vectorizer.fit_transform(X_train_raw)
# ... and encode the test reviews with that same fitted vocabulary.
X_test_bow = bow_vectorizer.transform(X_test_raw)

In [16]:

# 6. TF-IDF Vectorization
# TF-IDF weighted features: English stop words removed, vocabulary capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# Fit vocabulary and IDF weights on the training reviews only ...
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_raw)
# ... and apply the fitted vectorizer unchanged to the test reviews.
X_test_tfidf = tfidf_vectorizer.transform(X_test_raw)

In [17]:
# 7. Train & Evaluate Model on Bag-of-Words
# Fit logistic regression on the count features (fit() returns the estimator itself).
clf_bow = LogisticRegression(max_iter=1000).fit(X_train_bow, y_train)

# Predict on the held-out reviews and compute the metrics.
y_pred_bow = clf_bow.predict(X_test_bow)
accuracy_bow = accuracy_score(y_true=y_test, y_pred=y_pred_bow)
report_bow = classification_report(y_true=y_test, y_pred=y_pred_bow)

# Report overall accuracy followed by per-class precision/recall/F1.
print("\nBag-of-Words Accuracy:", accuracy_bow)
print("BoW Classification Report:\n", report_bow)



Bag-of-Words Accuracy: 0.87
BoW Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.86      0.87      4961
           1       0.87      0.88      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [18]:
# 8. Train & Evaluate Model on TF-IDF
# Fit logistic regression on the TF-IDF features (fit() returns the estimator itself).
clf_tfidf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict on the held-out reviews and compute the metrics.
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_true=y_test, y_pred=y_pred_tfidf)
report_tfidf = classification_report(y_true=y_test, y_pred=y_pred_tfidf)

# Report overall accuracy followed by per-class precision/recall/F1.
print("\nTF-IDF Accuracy:", accuracy_tfidf)
print("TF-IDF Classification Report:\n", report_tfidf)


TF-IDF Accuracy: 0.8889
TF-IDF Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.87      0.89      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [19]:
# 9. Performance Comparison
# Show both test accuracies together, formatted to four decimal places.
summary_lines = [
    "\n--- Comparison Summary ---",
    f"Bag-of-Words Accuracy : {accuracy_bow:.4f}",
    f"TF-IDF Accuracy       : {accuracy_tfidf:.4f}",
]
print("\n".join(summary_lines))


--- Comparison Summary ---
Bag-of-Words Accuracy : 0.8700
TF-IDF Accuracy       : 0.8889
