In [None]:
#### ML Model creation for Classification tasks ###
# Works with LogisticRegression, Randomforest and XGBoost
# Sriram Parthasarathy
# LICENSES : MIT

'''
Classification assigns data points to predefined categories (discrete labels) based on input features.
When to use: Use when the output is categorical (e.g., yes/no, spam/not spam, disease type).
Examples:

- Email spam detection (spam vs. legitimate).

- Medical diagnosis (benign vs. malignant tumor).

- Sentiment analysis (positive/negative/neutral reviews).

Additional Reading:
Please refer to my articles on Medium for more details:
The Shopping Cart Abandonment Problem: How Machine Learning Can Help!
https://medium.com/managing-digital-products/the-shopping-cart-abandonment-problem-how-machine-learning-can-help-eb690f1dc4f6?source=your_stories_page--------------------------------------------

How to Measure & Optimise Your Predictive Model for Prime Time?
https://medium.com/managing-digital-products/how-to-measure-optimise-your-predictive-model-for-prime-time-3b9f6072f85c?source=your_stories_page--------------------------------------------

Increasing The Accuracy of Predictive Models with Stacked Ensemble Techniques: Healthcare Example
https://medium.com/managing-digital-products/increasing-the-accuracy-of-predictive-model-with-stacked-ensemble-techniques-a-healthcare-example-135d36b9a2b7?source=your_stories_page--------------------------------------------

AI Powered Automatic Classification: The Challenges in Managing Data in Clinical Trials
https://medium.com/managing-digital-products/ai-powered-automatic-classification-the-challenges-in-managing-data-in-clinical-trials-6639e7aa1a7d?source=your_stories_page--------------------------------------------

How Do You Measure If Your Customer Churn Predictive Model Is Good?
https://medium.com/data-science/how-do-you-measure-if-your-customer-churn-predictive-model-is-good-187a49a9eee3?source=your_stories_page--------------------------------------------


Practical Data Augmentation Techniques for Predictive Models
https://medium.com/hackernoon/practical-data-augmentation-techniques-for-predictive-models-b51599253c30?source=your_stories_page--------------------------------------------

Machine Learning for Product Managers: Defining the business problem
https://medium.com/managing-digital-products/machine-learning-for-product-managers-defining-the-business-problem-f0e968d09ee7?source=your_stories_page--------------------------------------------

'''

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# ----------------------------
# STEP 1: Create sample dataset
# ----------------------------
# For illustration this uses a small synthetic dataset, since a real customer dataset can't be shared.
# Replace this with your own data.

# Small, balanced toy dataset. The first five rows are the original sample;
# the remaining rows give every class several examples. With only a single
# 'Medium' row, any train/test split leaves that class out of either the
# training set or the test set, which makes the evaluation meaningless.
data = {
    'education': [
        'Bachelors', 'Masters', 'PhD', 'Bachelors', 'Masters',
        'Bachelors', 'Masters', 'PhD', 'Bachelors', 'Masters',
        'PhD', 'PhD',
    ],
    'years_experience': [1, 3, 5, 2, 7, 4, 4, 8, 1, 2, 3, 6],
    'salary': [
        30000.0, 50000.0, 70000.0, 35000.0, 90000.0,
        52000.0, 55000.0, 95000.0, 28000.0, 40000.0,
        60000.0, 85000.0,
    ],
    # Categorical target: four rows each of 'Low', 'Medium' and 'High'
    'performance_label': [
        'Low', 'Medium', 'High', 'Low', 'High',
        'Medium', 'Medium', 'High', 'Low', 'Low',
        'Medium', 'High',
    ],
}
df = pd.DataFrame(data)

# ----------------------------
# STEP 2: Define features and target
# ----------------------------
# The label column is the prediction target; every other column is a feature.
target_column = 'performance_label'
y = df[target_column]
X = df.drop(columns=[target_column])

# ----------------------------
# STEP 3: Preprocessing pipeline
# ----------------------------
categorical_cols = ['education']
numeric_cols = ['years_experience', 'salary']

# Preprocess each column group with its own transformer:
#   - categorical columns are one-hot encoded (categories unseen during
#     fitting are ignored at prediction time instead of raising an error)
#   - numeric columns are standardized
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
standard_scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_cols),
        ('num', standard_scaler, numeric_cols),
    ]
)

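# For RandomForest and XGBoost we can potentially skip the numeric scaler:
# tree-based models split on thresholds, so they are insensitive to feature scaling.

# Apply OneHotEncoder to categoricals and pass numeric columns through unchanged for RandomForest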
# preprocessor = ColumnTransformer([
#     ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
# ], remainder='passthrough')  # keep numeric columns as-is





# ----------------------------
# STEP 4: Build and train model
# ----------------------------
# Chain the preprocessing step and a logistic-regression classifier
# (allowed up to 1000 solver iterations) into a single estimator.
classifier = LogisticRegression(max_iter=1000)
pipeline = Pipeline(
    steps=[
        ('preprocess', preprocessor),
        ('classifier', classifier),
    ]
)



# RandomForestClassifier (requires: from sklearn.ensemble import RandomForestClassifier)
# pipeline = Pipeline([
#     ('preprocess', preprocessor),
#     ('classifier', RandomForestClassifier(random_state=42))
# ])

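# XGBClassifier (requires: from xgboost import XGBClassifier)
# NOTE: recent XGBoost releases expect integer-encoded class labels (e.g. via
# sklearn.preprocessing.LabelEncoder) and no longer use `use_label_encoder`;
# verify against your installed version.
# pipeline = Pipeline([
#     ('preprocess', preprocessor),
#     ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
# ])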


# Split dataset
# Hold out 20% of the rows for evaluation. Prefer a stratified split so the
# class proportions are preserved in both partitions. scikit-learn raises
# ValueError when the data is too small for that (e.g. a class with a single
# row); in that case fall back to a plain random split. A ValueError caused by
# anything else is raised again by the fallback call, so it is not hidden.
split_options = {'test_size': 0.2, 'random_state': 42}
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, **split_options
    )
except ValueError:
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit model (preprocessing and classifier are fitted on the training rows only)
pipeline.fit(X_train, y_train)

# Predict and evaluate on the held-out rows
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 reports 0.0 for a class with no predicted or no true samples
# instead of emitting an UndefinedMetricWarning for each such metric.
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)


# Accuracy
# What it means: % of correct predictions.
# Good when: Classes are balanced.


# Precision
# What it means: How many predicted positives are actually correct.
# Good when: False positives are costly.
# Example: In cancer detection, you don’t want to falsely say someone has cancer.


# Recall (Sensitivity)
# What it means: How many actual positives the model found.
# Good when: False negatives are costly.
# Example: You don’t want to miss a real cancer case.


# F1 Score
# What it means: Balance between Precision and Recall.
# Good when: You need a balance between FP and FN.


# ROC-AUC (Receiver Operating Characteristic – Area Under Curve)
# What it means: How well the model separates classes at different thresholds.
# Good when: You want to understand the model’s overall ability to discriminate.


Accuracy: 0.0

Classification Report:
               precision    recall  f1-score   support

         Low       0.00      0.00      0.00       0.0
      Medium       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
