In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

# --- Preprocessing Function (from Day 8 - ensure NLTK data is downloaded) ---
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    return " ".join(lemmatized_tokens)

# 1. Sample Dataset (simple sentiment analysis example)
# Features (X): text messages
# Labels (y): 'positive' or 'negative'
data = {
    'text': [
        "This movie is fantastic and I loved it!",
        "What a terrible film, absolutely hated it.",
        "The food was good, but the service was slow.",
        "Excellent experience, highly recommend.",
        "I regret wasting my time on this boring show.",
        "Happy with the purchase, great product.",
        "Worst customer service ever, very disappointed.",
        "A surprisingly good story, well written."
    ],
    'sentiment': [
        'positive', 'negative', 'negative', 'positive',
        'negative', 'positive', 'negative', 'positive'
    ]
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Apply preprocessing to the text column
df['processed_text'] = df['text'].apply(preprocess_text)
print("\nDataFrame with Preprocessed Text:")
print(df)

# Define X (features) and y (labels)
X = df['processed_text']
y = df['sentiment']

# 2. Text Representation using TF-IDF (from Day 9)
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)
print(f"\nShape of TF-IDF matrix (num_samples, num_features): {X_tfidf.shape}")

# 3. Train-Test Split
# test_size=0.3 means 30% of data for testing, random_state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42, stratify=y)

print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
print(f"Training labels distribution:\n{y_train.value_counts()}")
print(f"Test labels distribution:\n{y_test.value_counts()}")

Original DataFrame:
                                              text sentiment
0          This movie is fantastic and I loved it!  positive
1       What a terrible film, absolutely hated it.  negative
2     The food was good, but the service was slow.  negative
3          Excellent experience, highly recommend.  positive
4    I regret wasting my time on this boring show.  negative
5          Happy with the purchase, great product.  positive
6  Worst customer service ever, very disappointed.  negative
7         A surprisingly good story, well written.  positive

DataFrame with Preprocessed Text:
                                              text sentiment  \
0          This movie is fantastic and I loved it!  positive   
1       What a terrible film, absolutely hated it.  negative   
2     The food was good, but the service was slow.  negative   
3          Excellent experience, highly recommend.  positive   
4    I regret wasting my time on this boring show.  negative   
5          H

In [3]:
print("\n--- Training and Evaluating Logistic Regression ---")

# 1. Initialize the Logistic Regression Model
model_lr = LogisticRegression(random_state=42, solver='liblinear') # 'liblinear' is good for small datasets

# 2. Train the model on the training data
model_lr.fit(X_train, y_train)
print("Logistic Regression Model Trained.")

# 3. Make predictions on the test data
y_pred_lr = model_lr.predict(X_test)
print(f"\nSample Predictions (first 3): {y_pred_lr[:3]}")
print(f"Actual Labels (first 3): {list(y_test)[:3]}") # Convert y_test to list for easy viewing

# 4. Evaluate the model's performance
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"\nAccuracy of Logistic Regression: {accuracy_lr:.2f}")

# Classification Report provides precision, recall, and f1-score for each class
# Precision: Out of all predicted positives, how many were actually positive?
# Recall: Out of all actual positives, how many did the model correctly identify?
# F1-Score: Harmonic mean of precision and recall (good balance metric)
# Support: Number of actual occurrences of the class in the specified dataset
print("\nClassification Report for Logistic Regression:")
print(classification_report(y_test, y_pred_lr))


--- Training and Evaluating Logistic Regression ---
Logistic Regression Model Trained.

Sample Predictions (first 3): ['positive' 'positive' 'positive']
Actual Labels (first 3): ['negative', 'negative', 'positive']

Accuracy of Logistic Regression: 0.33

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         2
    positive       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
from sklearn.svm import SVC

print("\n--- Training and Evaluating Support Vector Machine ---")

# 1. Initialize the Support Vector Machine Model
# 'kernel' can be 'linear', 'rbf' (radial basis function), etc. 'linear' is often good for text data.
model_svm = SVC(kernel='linear', random_state=42)

# 2. Train the model on the training data
model_svm.fit(X_train, y_train)
print("SVM Model Trained.")

# 3. Make predictions on the test data
y_pred_svm = model_svm.predict(X_test)
print(f"\nSample Predictions (first 3): {y_pred_svm[:3]}")
print(f"Actual Labels (first 3): {list(y_test)[:3]}")

# 4. Evaluate the model's performance
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"\nAccuracy of SVM: {accuracy_svm:.2f}")

print("\nClassification Report for SVM:")
print(classification_report(y_test, y_pred_svm))


--- Training and Evaluating Support Vector Machine ---
SVM Model Trained.

Sample Predictions (first 3): ['positive' 'positive' 'positive']
Actual Labels (first 3): ['negative', 'negative', 'positive']

Accuracy of SVM: 0.33

Classification Report for SVM:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         2
    positive       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
print("\n--- Predicting on New Input Text ---")

new_text = "This product is amazing, I am extremely happy!"
new_text_negative = "What a terrible day, everything went wrong."

# 1. Preprocess the new text (using the same preprocess_text function)
processed_new_text = preprocess_text(new_text)
processed_new_text_negative = preprocess_text(new_text_negative)
print(f"\nOriginal New Text (Positive): '{new_text}'")
print(f"Preprocessed New Text (Positive): '{processed_new_text}'")

print(f"\nOriginal New Text (Negative): '{new_text_negative}'")
print(f"Preprocessed New Text (Negative): '{processed_new_text_negative}'")

# 2. Transform the preprocessed new text using the *same* fitted TF-IDF Vectorizer
# IMPORTANT: Do not call .fit_transform() again, just .transform()
# because the vocabulary and IDF weights were learned from the training data.
new_text_tfidf = tfidf_vectorizer.transform([processed_new_text])
new_text_negative_tfidf = tfidf_vectorizer.transform([processed_new_text_negative])

print(f"\nShape of TF-IDF for new text: {new_text_tfidf.shape}")

# 3. Make predictions with Logistic Regression
prediction_lr_positive = model_lr.predict(new_text_tfidf)
print(f"Logistic Regression Prediction for positive text: {prediction_lr_positive[0]}")

prediction_lr_negative = model_lr.predict(new_text_negative_tfidf)
print(f"Logistic Regression Prediction for negative text: {prediction_lr_negative[0]}")

# 4. Make predictions with SVM
prediction_svm_positive = model_svm.predict(new_text_tfidf)
print(f"SVM Prediction for positive text: {prediction_svm_positive[0]}")

prediction_svm_negative = model_svm.predict(new_text_negative_tfidf)
print(f"SVM Prediction for negative text: {prediction_svm_negative[0]}")


--- Predicting on New Input Text ---

Original New Text (Positive): 'This product is amazing, I am extremely happy!'
Preprocessed New Text (Positive): 'product amazing extremely happy'

Original New Text (Negative): 'What a terrible day, everything went wrong.'
Preprocessed New Text (Negative): 'terrible day everything went wrong'

Shape of TF-IDF for new text: (1, 32)
Logistic Regression Prediction for positive text: positive
Logistic Regression Prediction for negative text: positive
SVM Prediction for positive text: positive
SVM Prediction for negative text: positive
