# Supervised Learning

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset path used below lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression     # ← Correct import!
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
import warnings
warnings.filterwarnings('ignore')

# --- Settings shared by every candidate model --------------------------------
SEED = 42                              # one seed for the split, SVD and the saga solver
MAX_FEATURES = 15000                   # vocabulary cap for both vectorizers
NGRAM_RANGE = (1, 2)                   # unigrams + bigrams
SVD_COMPONENTS = 500                   # TruncatedSVD output dimensionality
PARAM_GRID = {'lr__C': [0.1, 1, 10]}   # regularisation strengths searched per model

# Load your data
file_path = '/content/drive/My Drive/IST 332 NLP Final Project/urgent_care_reviews_CA_fully_preprocessed.csv'
df = pd.read_csv(file_path)

# Create labels: 4–5 stars = Positive (1), 1–3 stars = Negative (0)
# NOTE(review): a missing review_rating compares False against 4 and is therefore
# labelled 0 — confirm the preprocessed file contains no missing ratings.
df['sentiment'] = (df['review_rating'] >= 4).astype(int)

X = df['review_cleaned_text'].fillna('')
y = df['sentiment']

print(f"Total reviews: {len(df)}")
print(f"Positive reviews (4-5 stars): {y.mean():.2%}")
print(f"Negative reviews (1-3 stars): {(1-y.mean()):.2%}")

# Train/test split (stratified so both splits keep the class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)


def _build_pipeline(vectorizer_step, vectorizer_cls, use_svd):
    """Assemble one candidate pipeline: vectorizer -> (optional SVD) -> logistic regression.

    Parameters
    ----------
    vectorizer_step : str
        Name given to the vectorizer step ('tfidf' or 'bow').
    vectorizer_cls : type
        TfidfVectorizer or CountVectorizer.
    use_svd : bool
        When True, insert a TruncatedSVD step and use LogisticRegression's
        default solver; when False, fit on the sparse features with 'saga'.

    Returns
    -------
    Pipeline
        An unfitted scikit-learn Pipeline whose classifier step is named 'lr'.
    """
    steps = [
        (vectorizer_step, vectorizer_cls(max_features=MAX_FEATURES, ngram_range=NGRAM_RANGE)),
    ]
    if use_svd:
        steps.append(('svd', TruncatedSVD(n_components=SVD_COMPONENTS, random_state=SEED)))
        steps.append(('lr', LogisticRegression(max_iter=1000)))
    else:
        # saga is stochastic; seeding it keeps the reported scores reproducible.
        steps.append(('lr', LogisticRegression(max_iter=1000, solver='saga', random_state=SEED)))
    return Pipeline(steps)


# We'll run only 4 strong models → finishes fast but still meets all requirements
# (results key, progress label, vectorizer step name, vectorizer class, use SVD?)
MODEL_SPECS = [
    ('TF-IDF + LR',       'TF-IDF + Logistic Regression', 'tfidf', TfidfVectorizer, False),
    ('TF-IDF + SVD + LR', 'TF-IDF + SVD + LR',            'tfidf', TfidfVectorizer, True),
    ('BoW + LR',          'BoW + Logistic Regression',    'bow',   CountVectorizer, False),
    ('BoW + SVD + LR',    'BoW + SVD + LR',               'bow',   CountVectorizer, True),
]

pipelines = {}   # name -> unfitted Pipeline handed to the grid search
grids = {}       # name -> fitted GridSearchCV
cv_scores = {}   # name -> best mean cross-validated weighted F1 (training data only)
results = {}     # name -> weighted F1 on the held-out test set

for position, (name, label, vectorizer_step, vectorizer_cls, use_svd) in enumerate(MODEL_SPECS, start=1):
    # The first progress line is preceded by a blank line, as in the original output.
    lead = "\n" if position == 1 else ""
    print(f"{lead}{position}/{len(MODEL_SPECS)} Running {label}...")

    pipelines[name] = _build_pipeline(vectorizer_step, vectorizer_cls, use_svd)
    grid = GridSearchCV(pipelines[name], PARAM_GRID, cv=3, scoring='f1_weighted', n_jobs=-1)
    grid.fit(X_train, y_train)

    grids[name] = grid
    cv_scores[name] = grid.best_score_
    results[name] = f1_score(y_test, grid.predict(X_test), average='weighted')

# Keep the numbered names that later cells rely on (e.g. grid1.best_estimator_).
pipe1, pipe2, pipe3, pipe4 = (pipelines[spec[0]] for spec in MODEL_SPECS)
grid1, grid2, grid3, grid4 = (grids[spec[0]] for spec in MODEL_SPECS)
score1 = results['TF-IDF + LR']

# === FINAL RESULTS ===
print("\n" + "="*65)
print("FINAL PERFORMANCE COMPARISON (Weighted F1-Score on Test Set)")
print("="*65)
for name, score in results.items():
    print(f"{name:28} → {score:.4f}")

# Pick the winner by cross-validated score, not by test score: choosing the model
# with the highest test F1 and then reporting that same number would make the
# "unseen data" estimate optimistically biased.
best_name = max(cv_scores, key=cv_scores.get)
best_grid = grids[best_name]

print("\n" + "="*65)
print(f"BEST MODEL: {best_name}")
print(f"Weighted F1-Score: {results[best_name]:.4f}")
print(f"Mean CV Weighted F1 (selection criterion): {cv_scores[best_name]:.4f}")
print(f"Best parameters: {best_grid.best_params_}")
print("="*65)

print("\nDetailed Classification Report (Test Set):")
print(classification_report(y_test, best_grid.predict(X_test)))

print("\nRECOMMENDATION:")
print(f"After systematically comparing Bag-of-Words and TF-IDF representations,\n"
      f"with and without Truncated SVD dimensionality reduction,\n"
      f"the best model is '{best_name}' with a weighted F1-score of {results[best_name]:.4f}.\n"
      f"This model demonstrates superior generalization performance on unseen data\n"
      f"and is strongly recommended as the final sentiment classifier for California urgent care reviews.")

Mounted at /content/drive
Total reviews: 11658
Positive reviews (4-5 stars): 61.73%
Negative reviews (1-3 stars): 38.27%

1/4 Running TF-IDF + Logistic Regression...
2/4 Running TF-IDF + SVD + LR...
3/4 Running BoW + Logistic Regression...
4/4 Running BoW + SVD + LR...

FINAL PERFORMANCE COMPARISON (Weighted F1-Score on Test Set)
TF-IDF + LR                  → 0.9510
TF-IDF + SVD + LR            → 0.9451
BoW + LR                     → 0.9436
BoW + SVD + LR               → 0.9355

BEST MODEL: TF-IDF + LR
Weighted F1-Score: 0.9510
Best parameters: {'lr__C': 10}

Detailed Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       893
           1       0.95      0.97      0.96      1439

    accuracy                           0.95      2332
   macro avg       0.95      0.95      0.95      2332
weighted avg       0.95      0.95      0.95      2332


RECOMMENDATION:
After systematically comparing Bag-of-Words an

Extract the logistic-regression coefficients for the top 10 words of each topic

In [None]:
# Work with the refit TF-IDF + Logistic Regression pipeline from the first grid search.
best_tfidf_lr = grid1.best_estimator_

# Pull the two fitted steps out of the pipeline by name.
fitted_steps = best_tfidf_lr.named_steps
tfidf, lr = fitted_steps['tfidf'], fitted_steps['lr']

# Vocabulary terms in feature-index order.
feature_names = np.array(tfidf.get_feature_names_out())
# Binary LR: take the single row of coef_, giving one weight per feature.
coefs = lr.coef_[0]

In [None]:
import pandas as pd

def _get_vectorizer(tfidf_or_pipeline):
    """
    Return the underlying TfidfVectorizer whether we pass
    the vectorizer directly or a Pipeline containing it.
    """
    # Case 1: it's already a TfidfVectorizer
    if hasattr(tfidf_or_pipeline, "vocabulary_"):
        return tfidf_or_pipeline

    # Case 2: it's a Pipeline with named_steps
    if hasattr(tfidf_or_pipeline, "named_steps"):
        for name, step in tfidf_or_pipeline.named_steps.items():
            if hasattr(step, "vocabulary_"):
                return step

    raise ValueError("Could not find a TfidfVectorizer with a 'vocabulary_' attribute.")


def get_word_coef(word, tfidf_or_pipeline, coefs):
    """
    Look up the coefficient associated with `word`.

    The word's index is read from the vectorizer's `vocabulary_` and used to
    index into `coefs`. Returns None when the word is not in the vocabulary.
    """
    vocabulary = _get_vectorizer(tfidf_or_pipeline).vocabulary_

    # Out-of-vocabulary words have no coefficient to report.
    if word not in vocabulary:
        return None

    return coefs[vocabulary[word]]


def build_topic_tfidf_table(topic_lists, tfidf_or_pipeline, coefs):
    """
    Build a long-format table of per-topic word coefficients.

    Parameters
    ----------
    topic_lists : dict
        Maps a topic name to a list of words, e.g.
        {"Topic 1": [...], "Topic 2": [...], "Topic 3": [...]}.
    tfidf_or_pipeline :
        A fitted vectorizer or a pipeline containing one; passed through to
        get_word_coef for each word.
    coefs :
        Coefficient sequence indexed by vocabulary position.

    Returns
    -------
    pandas.DataFrame
        One row per (topic, word) pair, in input order, with columns
        Topic, Word, TFIDF_Coefficient. Words get_word_coef cannot find
        carry a missing coefficient. The three columns are always present,
        even when `topic_lists` is empty or every word list is empty.
    """
    columns = ["Topic", "Word", "TFIDF_Coefficient"]

    records = [
        {
            "Topic": topic_name,
            "Word": word,
            "TFIDF_Coefficient": get_word_coef(word, tfidf_or_pipeline, coefs),
        }
        for topic_name, word_list in topic_lists.items()
        for word in word_list
    ]

    # Passing `columns` fixes the schema so an empty result still has headers
    # (otherwise a later to_csv would write a file with no header row).
    return pd.DataFrame(records, columns=columns)


In [None]:
# Ten words per topic whose coefficients we want to inspect.
# NOTE(review): presumably the top words from an earlier topic-modelling step — confirm the source.
topic_words = (
    ("Topic 1", [
        "care", "doctor", "experience", "time", "great",
        "recommend", "medical", "take", "professional", "patient",
    ]),
    ("Topic 2", [
        "doctor", "go", "tell", "get", "say",
        "call", "would", "patient", "ask", "give",
    ]),
    ("Topic 3", [
        "wait", "time", "go", "get", "appointment",
        "see", "urgent_care", "doctor", "take", "hour",
    ]),
)
topic_lists = dict(topic_words)

# One row per (topic, word) with the coefficient looked up in the fitted pipeline.
summary_table = build_topic_tfidf_table(
    topic_lists,
    best_tfidf_lr,
    coefs,
)
summary_table


Unnamed: 0,Topic,Word,TFIDF_Coefficient
0,Topic 1,care,0.94831
1,Topic 1,doctor,3.29615
2,Topic 1,experience,1.240304
3,Topic 1,time,0.562026
4,Topic 1,great,11.622709
5,Topic 1,recommend,6.082824
6,Topic 1,medical,0.410912
7,Topic 1,take,0.646112
8,Topic 1,professional,6.506175
9,Topic 1,patient,-3.881471


In [None]:
import pandas as pd

# Wrap the summary table in a DataFrame for export.
# Note: this rebinds the notebook-level name `df`, which previously held the loaded reviews.
df = pd.DataFrame(data=summary_table)

In [None]:
# Write the word-coefficient table to Drive, leaving out the index column.
top_word_coef_path = '/content/drive/MyDrive/IST 332 NLP Final Project/top_word_coef.csv'
df.to_csv(top_word_coef_path, index=False)