In [2]:
# Import necessary libraries
import wandb
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, 
    mean_squared_error
)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
import matplotlib.pyplot as plt

# Read the train/test CSVs (adjust the paths if the dataset lives elsewhere)
df_train = pd.read_csv('./dataset/train.csv')
df_test = pd.read_csv('./dataset/test.csv')


# Open the Weights & Biases run that the wandb.log calls below report to
wandb.init(name="DecisionTree_Classification", project="OBL-MACHINE-LEARNING-2024")

# Data Cleaning Function
def clean_data(df):
    """Return a cleaned copy of ``df`` without modifying the caller's frame.

    Cleaning steps:
      * missing 'Year' values are replaced by the column median,
      * missing 'Publisher' values are replaced by the string 'Unknown',
      * the 'Summary' column is removed when present.

    Args:
        df: DataFrame containing at least 'Year' and 'Publisher' columns.

    Returns:
        A new DataFrame with the steps above applied.

    Raises:
        KeyError: if 'Year' or 'Publisher' is missing from ``df``.
    """
    # Work on a copy: assigning columns on the argument itself would silently
    # rewrite the DataFrame the caller passed in.
    cleaned = df.copy()
    cleaned['Year'] = cleaned['Year'].fillna(cleaned['Year'].median())
    cleaned['Publisher'] = cleaned['Publisher'].fillna('Unknown')
    # errors='ignore' makes the drop a no-op when 'Summary' is absent.
    return cleaned.drop(columns=['Summary'], errors='ignore')

# Data Preprocessing Function
def preprocess_data(df, is_train=True):
    """Clean ``df`` and convert it into a numeric feature table.

    Steps: run ``clean_data``, convert the count/sales columns (values such
    as ``'3.9K'``) to integers, label-encode the categorical columns, and
    drop the 'id' and 'Rating' columns from the features.

    NOTE: the label encoders are fit on the frame passed to this call, so the
    integer codes produced by two separate calls (e.g. one for train and one
    for test) are not comparable with each other.

    Args:
        df: raw DataFrame; it is not modified.
        is_train: when True, also return the 'Rating' column as the target.

    Returns:
        ``(X, y)`` when ``is_train`` is True, where ``y`` is the 'Rating'
        column or ``None`` if that column is absent; otherwise only ``X``.

    Raises:
        ValueError: if a value in a numeric column cannot be parsed
            (including NaN).
    """
    # Copy first so that neither clean_data nor the column assignments below
    # can write into the caller's DataFrame.
    df = clean_data(df.copy())

    # Convert columns with 'K' to numeric
    def convert_to_numeric(value):
        value_str = str(value)
        if 'K' in value_str:
            # round() instead of int(): the float product can land just below
            # the intended integer (1.005 * 1000 == 1004.9999999999999), and
            # truncation would then lose a unit.
            return round(float(value_str.replace('K', '')) * 1000)
        # Values without a 'K' suffix are truncated toward zero.
        return int(float(value_str))

    columns_to_convert = [
        'Europe', 'Japan', 'Rest of World', 'North America',
        'Global', 'Number of Reviews', 'Wishlist'
    ]
    for col in columns_to_convert:
        if col in df.columns:
            df[col] = df[col].apply(convert_to_numeric)

    # Label Encoding for categorical columns (a fresh encoder per column, so
    # no fitted state is carried over from one column to the next)
    categorical_columns = ['Game Title', 'Publisher', 'Platform', 'Genre']
    for col in categorical_columns:
        if col in df.columns:
            df[col] = LabelEncoder().fit_transform(df[col])

    # 'id' and 'Rating' are never features; errors='ignore' tolerates frames
    # (such as the test split) that lack one of them.
    features = df.drop(columns=['id', 'Rating'], errors='ignore')

    # Split features and target for training
    if is_train:
        y = df['Rating'] if 'Rating' in df.columns else None
        return features, y
    return features

# Find the Best Tree Depth
def find_best_depth(X, y, max_depth=20, cv=5, random_state=42):
    """Select the tree depth with the highest mean cross-validated accuracy.

    Every depth from 1 to ``max_depth`` (inclusive) is scored with
    ``cross_val_score`` using ``cv`` folds; the winner is printed and logged
    to wandb under ``best_depth`` / ``best_cv_accuracy``.

    Args:
        X: feature table.
        y: target labels.
        max_depth: largest depth to try.
        cv: cross-validation setting forwarded to ``cross_val_score``.
        random_state: seed for the decision tree.

    Returns:
        ``(best_depth, best_score)``; on ties the smallest depth wins because
        ``np.argmax`` returns the first maximum.
    """
    candidate_depths = range(1, max_depth + 1)

    def mean_cv_accuracy(depth):
        # Mean accuracy over the CV folds for a tree capped at `depth`.
        estimator = DecisionTreeClassifier(max_depth=depth, random_state=random_state)
        fold_scores = cross_val_score(estimator, X, y, cv=cv, scoring='accuracy')
        return np.mean(fold_scores)

    accuracy_by_depth = [mean_cv_accuracy(depth) for depth in candidate_depths]

    best_depth = candidate_depths[np.argmax(accuracy_by_depth)]
    best_score = max(accuracy_by_depth)
    print(f"Best Depth: {best_depth}, Best Accuracy: {best_score:.4f}")

    summary = {"best_depth": best_depth, "best_cv_accuracy": best_score}
    wandb.log(summary)
    return best_depth, best_score

# Train and Evaluate Model
def train_and_evaluate_model(X, y, best_depth, test_size=0.33, random_state=42):
    """Fit a decision tree on a train split and report hold-out metrics.

    The data is split with ``train_test_split``; accuracy and weighted
    F1/precision/recall on the held-out part are printed and logged to wandb,
    and the fitted tree is drawn with matplotlib.

    Args:
        X: feature DataFrame (its column names label the plotted tree).
        y: target labels.
        best_depth: ``max_depth`` for the classifier.
        test_size: fraction of rows held out for evaluation.
        random_state: seed for both the split and the classifier.

    Returns:
        The classifier fitted on the train split only (not on all of ``X``).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = DecisionTreeClassifier(max_depth=best_depth, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    print(f"Metrics:\n  Accuracy: {accuracy:.4f}\n  F1 Score: {f1:.4f}\n"
          f"  Precision: {precision:.4f}\n  Recall: {recall:.4f}")

    # Log metrics to wandb
    wandb.log({
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    })

    # Plot Decision Tree
    plt.figure(figsize=(15, 10))
    # Pass the names as a plain list: some scikit-learn releases validate
    # feature_names as a list and reject a pandas Index.
    tree.plot_tree(
        model,
        filled=True,
        feature_names=list(X.columns),
        class_names=True,
        fontsize=10,
    )
    plt.title(f"Decision Tree (Depth = {best_depth})")
    plt.show()

    return model

# Workflow Function
def run_workflow(df_train, df_test):
    """Run preprocessing, depth search, training and submission export.

    Train and test rows are preprocessed together in a single call so that
    every categorical value receives the same integer code in both splits.
    As a consequence the 'Year' imputation median is computed over the rows
    of both splits, and only columns present in both frames are used as
    features.

    Args:
        df_train: training DataFrame; must contain 'Rating'.
        df_test: test DataFrame; must contain 'id'.

    Returns:
        The fitted model returned by ``train_and_evaluate_model``.

    Raises:
        KeyError: if ``df_train`` has no 'Rating' column or ``df_test`` has
            no 'id' column.

    Side effects:
        Writes 'submission_with_metrics.csv' to the working directory.
    """
    y_train = df_train['Rating']

    # preprocess_data fits its label encoders on whatever frame it receives,
    # so encoding train and test in separate calls would map the same
    # category to unrelated integers. Stack the two splits (tagged by key),
    # encode once, then split them back apart.
    combined = pd.concat(
        [
            df_train.drop(columns=['Rating']),
            df_test.drop(columns=['Rating'], errors='ignore'),
        ],
        keys=['train', 'test'],
        join='inner',  # keep only the columns both splits share
    )
    features = preprocess_data(combined, is_train=False)
    X_train = features.loc['train']  # keeps df_train's original row index
    X_test = features.loc['test']

    # Find best depth
    best_depth, _ = find_best_depth(X_train, y_train)

    # Train and evaluate model
    best_model = train_and_evaluate_model(X_train, y_train, best_depth)

    # Predict on test data
    predictions = best_model.predict(X_test)

    # Create submission
    submission = pd.DataFrame({
        'id': df_test['id'],
        'Prediction': predictions
    })
    submission.to_csv('submission_with_metrics.csv', index=False)
    print("Submission saved to 'submission_with_metrics.csv'.")

    return best_model

# Load Data (Assumes df_train and df_test are pre-loaded as DataFrames)
# df_train = pd.read_csv('train.csv')
# df_test = pd.read_csv('test.csv')

# Run Workflow
# model = run_workflow(df_train, df_test)


VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))