In [None]:
# %% [markdown]
# # Part 1: BBC News Classification Using Matrix Factorization
#
# **Files:**
# - bbc_news_train.csv (1490 records: ArticleId, Article, Category)
# - bbc_news_test.csv (736 records: ArticleId, Text)
# - bbc_news_sample_solution.csv (sample submission format)
#
# **Objective:**
# - Build a system to classify unseen news articles.
# - Use matrix factorization (NMF) as an unsupervised step to extract latent topics from text and then classify.
# - Compare with a supervised approach (direct TF-IDF).
# - Experiment with hyperparameters and data efficiency.
# - Generate a submission file with predictions.

# %% [code]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [None]:
# %% [markdown]
# ## 1. Data Loading and Train-Validation Split
#
# Load the training data from **bbc_news_train.csv**.
# The file contains: ArticleId, Article, Category.
# For internal evaluation, we split into training and validation sets.

# %% [code]
# Read the labelled BBC articles.
# NOTE(review): this path ('.data/...') differs from the one used for the test set later
# in the notebook ('week4_classification/data/...') — confirm both resolve from the same
# working directory.
bbc_train = pd.read_csv('.data/bbc_news_train.csv')
print("Training data shape:", bbc_train.shape)
print(bbc_train.head())

# Hold out 20% of the rows for validation; stratifying on Category keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(
    bbc_train,
    test_size=0.2,
    random_state=42,
    stratify=bbc_train['Category'],
)
for label, split in (("Train set shape:", train_df), ("Validation set shape:", val_df)):
    print(label, split.shape)

# %% [markdown]
# ## 2. Exploratory Data Analysis (EDA)
#
# Visualize the category distribution in the training set.

# %% [code]
# Bar chart of article counts per category in the training split, ordered from the
# most to the least frequent category.
category_order = train_df['Category'].value_counts().index

fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='Category', data=train_df, order=category_order, ax=ax)
ax.set_title("Category Distribution in Training Data")
ax.set_xlabel("Category")
ax.set_ylabel("Count")
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt labels so category names do not overlap
plt.show()


In [None]:
# %% [markdown]
# ## 3. Feature Extraction with TF-IDF
#
# **Note:** Fit the vectorizer on the training data (to avoid data leakage) and transform the validation set.
# The text column in training data is "Article".

# %% [code]
# The vocabulary and IDF weights are learned from the training split only; the fitted
# vectorizer is then reused as-is on the validation split.
train_texts = train_df['Article']
val_texts = val_df['Article']

tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',  # use scikit-learn's built-in English stop-word list
    max_features=1000,     # cap the vocabulary at 1000 terms
)
X_train = tfidf_vectorizer.fit_transform(train_texts)
X_val = tfidf_vectorizer.transform(val_texts)

for label, features in (("TF-IDF Train shape:", X_train), ("TF-IDF Validation shape:", X_val)):
    print(label, features.shape)


In [None]:
# %% [markdown]
# ## 4. Unsupervised Approach: NMF + Logistic Regression
#
# For various numbers of latent topics (n_components), we:
# - Fit NMF on the training TF-IDF features.
# - Transform both training and validation data.
# - Train Logistic Regression on the NMF features.
# - Record training and validation accuracies.

# %% [code]
# Sweep the number of latent topics and score an NMF -> Logistic Regression pipeline
# on the training and validation splits for each setting.
n_components_list = [5, 10, 15, 20]
y_train = train_df['Category']
y_val = val_df['Category']
nmf_results = []

for n in n_components_list:
    # The factorization is fitted on the training features only; validation documents
    # are projected onto the topics learned from training.
    nmf = NMF(n_components=n, random_state=42, init='nndsvda', max_iter=300)
    W_train = nmf.fit_transform(X_train)
    W_val = nmf.transform(X_val)

    # Classify documents from their topic weights.
    clf = LogisticRegression(max_iter=500).fit(W_train, y_train)

    nmf_results.append({
        'n_components': n,
        'Train Accuracy': accuracy_score(y_train, clf.predict(W_train)),
        'Validation Accuracy': accuracy_score(y_val, clf.predict(W_val)),
    })

# One row per n_components setting.
nmf_results_df = pd.DataFrame(nmf_results)
print("NMF Hyperparameter Results:")
print(nmf_results_df)


In [None]:
# Training vs. validation accuracy as a function of the number of NMF components.
fig, ax = plt.subplots(figsize=(8, 5))
for column in ('Train Accuracy', 'Validation Accuracy'):
    # The results column name doubles as the legend label.
    ax.plot(nmf_results_df['n_components'], nmf_results_df[column], marker='o', label=column)
ax.set_xlabel('Number of NMF Components')
ax.set_ylabel('Accuracy')
ax.set_title('NMF Hyperparameter Tuning')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# %% [markdown]
# ### Detailed Classification Report using NMF Features (n_components = 10)
#
# We choose n_components = 10 and display full classification reports.

# %% [code]
# Refit the NMF -> Logistic Regression pipeline with 10 topics and print per-class
# precision / recall / F1 for both splits.
nmf_final = NMF(n_components=10, random_state=42, init='nndsvda', max_iter=300)
W_train_final = nmf_final.fit_transform(X_train)
W_val_final = nmf_final.transform(X_val)

clf_nmf_final = LogisticRegression(max_iter=500).fit(W_train_final, train_df['Category'])

train_pred_final = clf_nmf_final.predict(W_train_final)
val_pred_final = clf_nmf_final.predict(W_val_final)

for heading, y_true, y_pred in (
    ("Classification Report (NMF features) - Train:", train_df['Category'], train_pred_final),
    ("Classification Report (NMF features) - Validation:", val_df['Category'], val_pred_final),
):
    print(heading)
    print(classification_report(y_true, y_pred))


In [None]:
# %% [markdown]
# ## 5. Supervised Approach: TF-IDF + Logistic Regression
#
# Here we train a classifier directly on the TF-IDF features.

# %% [code]
# No topic extraction here: the classifier is fitted directly on the TF-IDF matrix.
clf_tfidf = LogisticRegression(max_iter=500).fit(X_train, train_df['Category'])

train_pred_tfidf = clf_tfidf.predict(X_train)
val_pred_tfidf = clf_tfidf.predict(X_val)

# Per-class precision / recall / F1 for each split.
for heading, y_true, y_pred in (
    ("Classification Report (TF-IDF) - Train:", train_df['Category'], train_pred_tfidf),
    ("Classification Report (TF-IDF) - Validation:", val_df['Category'], val_pred_tfidf),
):
    print(heading)
    print(classification_report(y_true, y_pred))


In [None]:
# %% [markdown]
# ## 6. Data Efficiency Analysis
#
# We evaluate performance when training on only a fraction of the training data (10%, 20%, 50%, 100%).
# Both the unsupervised (NMF+LR) and supervised (TF-IDF+LR) approaches are compared.

# %% [code]
# For each training-set fraction, rebuild both pipelines from scratch on the subset and
# score them on the (fixed) validation split.
fractions = [0.1, 0.2, 0.5, 1.0]
val_labels = val_df['Category']
efficiency_results = []

for frac in fractions:
    # Reproducible random subset of the training split (plain random sample, not stratified).
    sub_train = train_df.sample(frac=frac, random_state=42)
    sub_labels = sub_train['Category']

    # The vectorizer is re-fitted on the subset, so the vocabulary reflects only the
    # documents available at this fraction.
    tfidf_sub = TfidfVectorizer(stop_words='english', max_features=1000)
    X_sub = tfidf_sub.fit_transform(sub_train['Article'])
    X_val_sub = tfidf_sub.transform(val_df['Article'])

    # Supervised approach: classifier directly on TF-IDF features.
    clf_tfidf_sub = LogisticRegression(max_iter=500).fit(X_sub, sub_labels)
    val_acc_tfidf = accuracy_score(val_labels, clf_tfidf_sub.predict(X_val_sub))

    # Unsupervised approach: 10 NMF topics, then a classifier on the topic weights.
    nmf_sub = NMF(n_components=10, random_state=42, init='nndsvda', max_iter=300)
    W_sub = nmf_sub.fit_transform(X_sub)
    W_val_sub = nmf_sub.transform(X_val_sub)

    clf_nmf_sub = LogisticRegression(max_iter=500).fit(W_sub, sub_labels)
    val_acc_nmf = accuracy_score(val_labels, clf_nmf_sub.predict(W_val_sub))

    efficiency_results.append({
        'Fraction': frac,
        'Unsupervised (NMF) Acc': val_acc_nmf,
        'Supervised (TF-IDF) Acc': val_acc_tfidf,
    })

# One row per fraction, one accuracy column per approach.
efficiency_df = pd.DataFrame(efficiency_results)
print("Data Efficiency Analysis:")
print(efficiency_df)

# Validation accuracy of both approaches against the fraction of training data used.
fig, ax = plt.subplots(figsize=(8, 5))
for column, label in (
    ('Unsupervised (NMF) Acc', 'Unsupervised (NMF)'),
    ('Supervised (TF-IDF) Acc', 'Supervised (TF-IDF)'),
):
    ax.plot(efficiency_df['Fraction'], efficiency_df[column], marker='o', label=label)
ax.set_xlabel('Fraction of Training Data')
ax.set_ylabel('Validation Accuracy')
ax.set_title('Data Efficiency Analysis')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# %% [markdown]
# ## 7. Final Test Set Prediction and Submission File
#
# Now load **bbc_news_test.csv** (columns: ArticleId and Text), predict the Category using the chosen model, and create a submission file.

# %% [code]
# Read the unlabelled test articles.
# NOTE(review): the training file is read from '.data/...' earlier in the notebook while
# this path starts with 'week4_classification/data/...' — confirm both resolve from the
# same working directory.
bbc_test = pd.read_csv(filepath_or_buffer='week4_classification/data/bbc_news_test.csv')
print("Test data shape:", bbc_test.shape)
print(bbc_test.head())


In [None]:
# Final predictions come from the supervised TF-IDF classifier: the test articles are
# vectorized with the already-fitted vectorizer (no re-fitting) and then classified.
X_test_final = tfidf_vectorizer.transform(bbc_test['Text'])
test_predictions = clf_tfidf.predict(X_test_final)

# Two-column frame: the article id alongside its predicted category.
submission = bbc_test[['ArticleId']].assign(Category=test_predictions)
print("Submission preview:")
print(submission.head())


In [None]:
# Write the predictions to CSV; index=False keeps the DataFrame's row index out of the file.
submission.to_csv(path_or_buf='bbc_news_submission.csv', index=False)
print("Submission file 'bbc_news_submission.csv' created.")


In [None]:
# %% [markdown]
# # Part 2: Movie Ratings Prediction with sklearn’s NMF
#
# **Files:**
# - **train.csv:** Contains movie ratings training data with columns: uID, mID, rating
# - **test.csv:** Contains movie ratings test data with columns: uID, mID, rating
# - **movies.csv:** Contains movie details (mID, title, year, etc.)
# - **users.csv:** Contains user information (uID, gender, age, occupation, zip)
#
# **Objective:**
# - Build a recommender system using matrix factorization (NMF) to predict ratings.
# - Create a user-item matrix from the training data.
# - Use sklearn’s NMF to predict missing ratings for the test set pairs.
# - Compute RMSE on the test ratings.
# - Compare against a simple baseline (global mean).
# - Discuss limitations and potential improvements.

# %% [markdown]
# ## 1. Load the Movie Ratings Data and Build User-Item Matrix

# %% [code]
# Read the rating tables and the movie / user metadata (adjust paths as necessary).
ratings_train = pd.read_csv('./movie_ratings/train.csv')   # Columns: uID, mID, rating
ratings_test  = pd.read_csv('./movie_ratings/test.csv')    # Columns: uID, mID, rating
movies = pd.read_csv('./movie_ratings/movies.csv')         # Contains mID, title, year, etc.
users  = pd.read_csv('./movie_ratings/users.csv')          # Contains uID, gender, age, occupation, zip

for label, table in (
    ("Ratings Train shape:", ratings_train),
    ("Ratings Test shape:", ratings_test),
    ("Movies shape:", movies),
    ("Users shape:", users),
):
    print(label, table.shape)

# Reshape the long-format training ratings into a wide table: one row per uID, one
# column per mID, holding the rating; pairs with no rating become NaN.
user_item_train = ratings_train.pivot(
    index='uID',
    columns='mID',
    values='rating',
)
print("User-Item Matrix shape:", user_item_train.shape)

# %% [markdown]
# ## 2. Prepare the Training Matrix for NMF
#
# Fill missing ratings with 0. (Note: 0 is not an actual rating but is used for factorization.)

# %% [code]
# Replace every missing (unrated) cell with 0 so the matrix has no NaN entries.
train_matrix = user_item_train.fillna(value=0)

# %% [markdown]
# ## 3. Apply NMF and Predict Ratings on the Test Set
#
# We experiment with different numbers of latent factors (n_components).
# For each setting, we factorize the training matrix, reconstruct it, and then predict the rating for each (uID, mID) pair in the test set whose user and movie both appear in the training matrix (other pairs are skipped).
# Finally, we compute the RMSE.

# %% [code]
from sklearn.metrics import mean_squared_error

n_components_list_ratings = [10, 20, 30, 40]
nmf_results_ratings = []

# Map every test (uID, mID) pair to its row / column position in the training matrix
# once, up front: the positions do not depend on n_components, so there is no need to
# repeat a per-row lookup inside the sweep. get_indexer returns -1 for labels that are
# absent from the index.
user_pos = train_matrix.index.get_indexer(ratings_test['uID'])
movie_pos = train_matrix.columns.get_indexer(ratings_test['mID'])
covered = (user_pos >= 0) & (movie_pos >= 0)

# Pairs whose user or movie never appears in the training matrix cannot be looked up
# in the reconstruction and are excluded from the RMSE. Report the counts so the
# exclusion is visible rather than silent.
print(f"Scoring {int(covered.sum())} of {len(ratings_test)} test ratings "
      f"({int((~covered).sum())} skipped: user or movie not in training matrix)")

test_rows = user_pos[covered]
test_cols = movie_pos[covered]
test_actual = ratings_test.loc[covered, 'rating'].to_numpy()

for n in n_components_list_ratings:
    # Factorize the zero-filled user x movie matrix: train_matrix ~ W @ H.
    nmf_model = NMF(n_components=n, init='nndsvda', random_state=42, max_iter=300)
    W = nmf_model.fit_transform(train_matrix)
    H = nmf_model.components_

    # Dense reconstruction; predictions are read off at the test positions.
    reconstructed = np.dot(W, H)
    test_pred = reconstructed[test_rows, test_cols]

    rmse = np.sqrt(mean_squared_error(test_actual, test_pred))
    nmf_results_ratings.append({'n_components': n, 'RMSE': rmse})
    print(f"n_components = {n} -> RMSE: {rmse:.4f}")

# One row per n_components setting.
nmf_results_ratings_df = pd.DataFrame(nmf_results_ratings)
print("\nNMF RMSE Results for Movie Ratings:")
print(nmf_results_ratings_df)

# Test RMSE as a function of the number of latent factors.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(
    nmf_results_ratings_df['n_components'],
    nmf_results_ratings_df['RMSE'],
    marker='o',
)
ax.set_xlabel('Number of NMF Components')
ax.set_ylabel('RMSE')
ax.set_title('NMF Hyperparameter Tuning for Movie Ratings Prediction')
ax.grid(True)
plt.show()


In [None]:
# %% [markdown]
# ## 4. Baseline Predictor: Global Mean Rating
#
# We compute the global mean rating from the training data (ignoring zeros) and use it as a prediction for all test entries.
# Then compute RMSE for this baseline.

# %% [code]
# Global mean of the observed training ratings. Unrated cells were filled with 0 when
# train_matrix was built, so they are masked out (-> NaN) and dropped by stack() before
# averaging.
nonzero_ratings = train_matrix[train_matrix != 0].stack()
global_mean = nonzero_ratings.mean()
print("Global Mean Rating:", global_mean)

# The baseline predicts the same constant for every test row, so the two lists can be
# built directly instead of iterating over the test frame row by row.
baseline_actual = ratings_test['rating'].tolist()
baseline_pred = [global_mean] * len(baseline_actual)

baseline_rmse = np.sqrt(mean_squared_error(baseline_actual, baseline_pred))
print(f"Baseline Predictor RMSE: {baseline_rmse:.4f}")


In [None]:
# %% [markdown]
# ## 5. Discussion for Part 2
#
# **Observations:**
# - The RMSE of the NMF model varies with the number of latent factors. In some settings, the improvement over the simple global mean baseline is small.
#
# **Limitations of sklearn’s NMF:**
# - **Sensitivity to Initialization:** Despite using the NNDSVDa initialization (`init='nndsvda'`), the model may still converge to a suboptimal local minimum.
# - **Handling Missing Data:** Filling missing ratings with 0 is not ideal since 0 can be misinterpreted as an actual rating, potentially biasing the factorization.
# - **Sparsity Modeling:** The factorization does not explicitly model the sparse nature of the ratings matrix.
#
# **Potential Improvements:**
# - Use multiple random restarts or more advanced initialization methods to improve convergence.
# - Apply imputation techniques or use methods that inherently model missing data (e.g., probabilistic matrix factorization).
# - Explore hybrid approaches that combine matrix factorization with baseline or similarity-based predictors.
#
# These improvements could lead to better performance in practice.

# %% [markdown]
# # End of Notebook
