In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# CHANGED: Import LinearSVC instead of SVC for stability and efficiency
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import os
import sys
import warnings

# Silence the noisy-but-harmless warning categories for the rest of the run
# (the user-warning filter is what is relied on to hide convergence warnings).
for _ignored_category in (UserWarning, FutureWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)


# --- MIND-small input files (edit these paths if the data lives elsewhere) ---
NEWS_FILE = 'news.tsv'            # article metadata, tab-separated, no header row
BEHAVIORS_FILE = 'behaviors.tsv'  # impression logs, tab-separated, no header row

# Column names applied to NEWS_FILE when it is loaded.
NEWS_COLUMNS = [
    'NewsID',
    'Category',
    'SubCategory',
    'Title',
    'Abstract',
    'URL',
    'TitleEntities',
    'AbstractEntities',
]


# ===============================================
# A) Preprocessing (to be run once to generate features)
# ===============================================

# Fail fast if either input file is missing. The exit status is non-zero so a
# calling shell or pipeline can tell this run failed (a bare sys.exit() would
# report success).
if not (os.path.exists(NEWS_FILE) and os.path.exists(BEHAVIORS_FILE)):
    print("Error: Required files 'news.tsv' and 'behaviors.tsv' not found.")
    print("Please ensure they are in the same directory as the script.")
    sys.exit(1)

try:
    # 1. Load the full news table and the first N_ROWS impression logs.
    N_ROWS = 5000
    df_news = pd.read_csv(NEWS_FILE, sep='\t', names=NEWS_COLUMNS, encoding='utf-8', index_col=False)
    df_behaviors = pd.read_csv(BEHAVIORS_FILE, sep='\t', names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'], encoding='utf-8', index_col=False, nrows=N_ROWS)

    # 2. Create Interaction Dataset (UserID, NewsID, Label)
    # Each Impressions cell is a whitespace-separated list of "<NewsID>-<label>" items.
    # Rows with a missing Impressions cell are skipped instead of aborting the run.
    interaction_data = []
    logged_impressions = df_behaviors[['UserID', 'Impressions']].dropna(subset=['Impressions'])
    for user_id, impression_log in zip(logged_impressions['UserID'], logged_impressions['Impressions']):
        for item in str(impression_log).split():
            try:
                # Split on the LAST hyphen so only the trailing label is peeled off.
                news_id, label = item.rsplit('-', 1)
                interaction_data.append({'UserID': user_id, 'NewsID': news_id, 'Label': int(label)})
            except ValueError:
                # Malformed item (no hyphen or non-integer label): ignore it.
                continue

    df_interactions = pd.DataFrame(interaction_data)
    if df_interactions.empty:
        print("Error: No valid interactions could be extracted. Exiting.")
        sys.exit(1)

    # 3. Content and Feature Preparation
    # Text used for TF-IDF is title + abstract; only news that appear in the
    # sampled interactions are kept.
    df_news['content'] = df_news['Title'].fillna('') + ' ' + df_news['Abstract'].fillna('')
    valid_news_ids = df_interactions['NewsID'].unique()
    df_news_filtered = df_news[df_news['NewsID'].isin(valid_news_ids)].copy()

    # 4. TF-IDF Features (at most 100 terms, English stop words removed)
    tfidf = TfidfVectorizer(stop_words='english', max_features=100)
    tfidf_matrix = tfidf.fit_transform(df_news_filtered['content']).toarray()
    tfidf_df = pd.DataFrame(tfidf_matrix, columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])])
    # Reset to a 0..n-1 index so the positional TF-IDF rows line up in the concat below.
    df_news_filtered.reset_index(drop=True, inplace=True)
    df_news_features = pd.concat([df_news_filtered[['NewsID', 'Category']], tfidf_df], axis=1)

    # 5. Categorical Features (One-Hot Encoding)
    df_category_ohe = pd.get_dummies(df_news_features['Category'], prefix='cat')
    df_news_features = pd.concat([df_news_features.drop('Category', axis=1), df_category_ohe], axis=1)

    # 6. Final Merge: one row per (user, news, label) interaction with the news features attached.
    # The inner join drops interactions whose NewsID has no row in the news table.
    df_final = pd.merge(df_interactions, df_news_features, on='NewsID', how='inner')
    PROCESSED_FILE = 'processed_recommendation_data.csv'
    df_final.to_csv(PROCESSED_FILE, index=False)

except Exception as e:
    # Top-level boundary: report the failure and stop with a non-zero status.
    print(f"An error occurred during preprocessing: {e}")
    sys.exit(1)

print("✅ Preprocessing and Feature Generation Complete.")

# ===============================================
# B) LinearSVC Model Training and Prediction
# ===============================================

# Reload the feature table written by the preprocessing step, together with the
# raw news table (its Title/Category columns are used when showing results).
df_final = pd.read_csv(PROCESSED_FILE)
df_news_all = pd.read_csv(NEWS_FILE, sep='\t', names=NEWS_COLUMNS, encoding='utf-8', index_col=False)

# Every column that is not an identifier or the target is a model feature.
non_feature_cols = {'UserID', 'NewsID', 'Label'}
feature_cols = [col for col in df_final.columns if col not in non_feature_cols]
X = df_final[feature_cols]
y = df_final['Label']

# Hold out 20% of the rows for evaluation, keeping the label ratio the same
# in both parts (stratified split, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Linear Support Vector Classifier:
#   dual=False              -> preferred when n_samples > n_features (often the case here)
#   class_weight='balanced' -> addresses the class imbalance
svm_model = LinearSVC(
    dual=False,
    class_weight='balanced',
    max_iter=10000,
    random_state=42,
)
print("⏳ Training Linear Support Vector Machine (LinearSVC) Model...")
svm_model.fit(X_train, y_train)
print("✅ Model Training Complete.")


# --- Evaluation on the held-out rows ---
y_pred = svm_model.predict(X_test)
precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (precision_score, recall_score, f1_score)
)

rule = "=" * 50
print(
    "\n" + rule,
    "--- Model Evaluation (Click Prediction) ---",
    f"Precision Score: {precision:.4f}",
    f"Recall Score:    {recall:.4f}",
    f"F1 Score:        {f1:.4f}",
    rule,
    sep="\n",
)


# --- Recommendation Function ---
# Candidate pool for recommendations: NewsID plus its feature columns, with the
# per-interaction columns removed and identical rows collapsed.
df_news_features_cand = df_final.drop(columns=['UserID', 'Label']).drop_duplicates()
def recommend_news(model, user_id, df_interactions, df_news_features_cand, df_news_all, n=10):
    """Return the top-``n`` unseen news items for ``user_id``, ranked by model score.

    Args:
        model: Fitted estimator exposing ``decision_function`` (LinearSVC has
            no ``predict_proba``, so the signed margin is used as the score;
            a larger value means more confidence in the click class).
        user_id: Identifier looked up in ``df_interactions['UserID']``.
        df_interactions: Frame with at least ``UserID`` and ``NewsID`` columns.
        df_news_features_cand: One row per candidate, a ``NewsID`` column plus
            the feature columns the model is scored on. Not modified.
        df_news_all: Frame with ``NewsID``, ``Title`` and ``Category`` columns,
            used only to label the results.
        n: Maximum number of recommendations to return.

    Returns:
        DataFrame with columns ``NewsID``, ``score``, ``Title``, ``Category``,
        best score first; an empty frame with those columns when nothing
        unseen is left to recommend.
    """
    # Score every candidate on a private copy so the caller's frame is untouched.
    feature_matrix = df_news_features_cand.drop(columns='NewsID')
    scored = df_news_features_cand.copy()
    scored['score'] = model.decision_function(feature_matrix)

    # Everything this user already appears with is excluded from the ranking.
    is_this_user = df_interactions['UserID'] == user_id
    already_seen = df_interactions.loc[is_this_user, 'NewsID'].unique()
    unseen = scored[~scored['NewsID'].isin(already_seen)]

    ranked = unseen.sort_values(by='score', ascending=False).head(n)
    if ranked.empty:
        return pd.DataFrame(columns=['NewsID', 'score', 'Title', 'Category'])

    # Attach human-readable metadata; a left join keeps the ranking order.
    display_info = df_news_all[['NewsID', 'Title', 'Category']]
    return ranked[['NewsID', 'score']].merge(display_info, on='NewsID', how='left')


# ===============================================
# C) Interactive Recommendation UI (Console-based)
# ===============================================

def run_recommendation_ui(model, df_interactions, df_news_features_cand, df_news_all):
    """Run one interactive console session that prints recommendations for a user.

    Prompts for a UserID (or ``random``) and a positive recommendation count,
    calls ``recommend_news`` and prints the ranked titles as a table.

    Args:
        model: Fitted estimator forwarded to ``recommend_news``.
        df_interactions: Frame with a ``UserID`` column (and whatever
            ``recommend_news`` reads from it); defines the valid user IDs.
        df_news_features_cand: Candidate feature frame forwarded to
            ``recommend_news``.
        df_news_all: News metadata frame forwarded to ``recommend_news``.

    Returns:
        None. All results are written to standard output.
    """
    unique_user_ids = df_interactions['UserID'].unique()
    banner = "=" * 50

    print("\n" + banner)
    print("📰 Interactive News Recommendation System (LinearSVC) 📰")
    print(banner)
    print(f"Available Users: {len(unique_user_ids)}")
    # map(str, ...) keeps the preview working even if the IDs are not strings.
    print(f"Example User IDs: {', '.join(map(str, unique_user_ids[:5]))}...")

    if len(unique_user_ids) == 0:
        # With no users, no typed ID can be valid and 'random' cannot sample.
        print("No users available; cannot generate recommendations.")
        print("\n" + banner)
        return

    # Hash-based lookup for validating typed IDs.
    known_user_ids = set(unique_user_ids)

    # 1. Get User ID
    while True:
        user_input = input("\nEnter a UserID (e.g., U12345) or type 'random' for a random user: ").strip()

        if user_input.lower() == 'random':
            # Sampled from interaction rows, so users with more rows are more likely.
            user_id = df_interactions['UserID'].sample(1).iloc[0]
            print(f"Selected random UserID: {user_id}")
            break
        elif user_input in known_user_ids:
            user_id = user_input
            break
        else:
            print("Invalid UserID. Please try again or use 'random'.")

    # 2. Get Number of Recommendations
    while True:
        try:
            n_recs = int(input("How many recommendations do you want (e.g., 5)? "))
            if n_recs > 0:
                break
            else:
                print("Please enter a positive number.")
        except ValueError:
            print("Invalid input. Please enter a whole number.")

    # 3. Generate Recommendations
    print(f"\n⏳ Generating {n_recs} recommendations for User {user_id}...")
    recommendation_df = recommend_news(model, user_id, df_interactions, df_news_features_cand, df_news_all, n=n_recs)

    # 4. Display Results
    print("\n" + banner)
    print(f"⭐ Top {len(recommendation_df)} Recommendations for User {user_id} ⭐")
    print(banner)

    if recommendation_df.empty:
        print("Could not generate any unique recommendations for this user.")
    else:
        # Re-index from 1 so the index column reads as the rank.
        recommendation_df.index = np.arange(1, len(recommendation_df) + 1)
        output_df = recommendation_df[['Title', 'Category', 'score']].copy()
        # The value is a decision-function score, not a probability.
        output_df.columns = ['Title', 'Category', 'Recommendation Score']

        try:
            table = output_df.to_markdown(numalign="left", stralign="left", floatfmt=".4f")
        except ImportError:
            # to_markdown relies on the optional 'tabulate' package; fall back to
            # a plain-text table rather than failing after all prompts are answered.
            table = output_df.to_string(float_format=lambda value: f"{value:.4f}", justify="left")
        print(table)

    print("\n" + banner)

# Start the console session with the trained LinearSVC model.
# df_final is passed as the interaction table: it carries the UserID and
# NewsID columns the UI and the recommender look up.
run_recommendation_ui(
    model=svm_model,
    df_interactions=df_final,
    df_news_features_cand=df_news_features_cand,
    df_news_all=df_news_all,
)

# --- Optional Cleanup ---
# os.remove(PROCESSED_FILE)

✅ Preprocessing and Feature Generation Complete.
⏳ Training Linear Support Vector Machine (LinearSVC) Model...
✅ Model Training Complete.

--- Model Evaluation (Click Prediction) ---
Precision Score: 0.0275
Recall Score:    0.4528
F1 Score:        0.0518

📰 Interactive News Recommendation System (LinearSVC) 📰
Available Users: 2623
Example User IDs: U73700, U8125, U8355, U89744, U10045...

Enter a UserID (e.g., U12345) or type 'random' for a random user: U8355
How many recommendations do you want (e.g., 5)? 5

⏳ Generating 5 recommendations for User U8355...

⭐ Top 5 Recommendations for User U8355 ⭐
|    | Title                                                    | Category   | Recommendation Score   |
|:---|:---------------------------------------------------------|:-----------|:-----------------------|
| 1  | Who Wore It Better? 10 Names Shared by Automakers        | autos      | 1.4416                 |
| 2  | Amazon Is Having Black Friday-Level Deals Right Now      | lifestyle  | 1.3