# Module 01: Model Development

## Necessary Libraries and Modules

In [16]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report

## Data Import and K-fold Split

In [17]:
# Load data
data = pd.read_csv("data.csv")

# Prepare data
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target variable

# Initialize k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

## Classification Models

### RFC

In [5]:
# Initialize model
model = RandomForestClassifier(n_estimators=500, bootstrap=True, random_state=42)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Fold {fold + 1}:")
    print(f"Model Accuracy: {accuracy:.3f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("\n")


Fold 1:
Model Accuracy: 0.787
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.72      0.76        29
           1       0.77      0.84      0.81        32

    accuracy                           0.79        61
   macro avg       0.79      0.78      0.78        61
weighted avg       0.79      0.79      0.79        61



Fold 2:
Model Accuracy: 0.803
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.83      0.76        23
           1       0.88      0.79      0.83        38

    accuracy                           0.80        61
   macro avg       0.79      0.81      0.80        61
weighted avg       0.81      0.80      0.81        61



Fold 3:
Model Accuracy: 0.787
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.67      0.75        30
           1       0.74      0.90      0.81        31

    accuracy       

### XGBC

In [6]:
# Initialize model
model = XGBClassifier(
        objective='binary:logistic',
        max_depth=5,  # Increased depth to capture more complex patterns
        learning_rate=0.05,  # Lower learning rate for better performance
        n_estimators=200,  # Increased number of trees
        subsample=0.8,  # Introduced randomness to prevent overfitting
        colsample_bytree=0.8,
    tree_method='hist'
    )
# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Fold {fold + 1}:")
    print(f"Model Accuracy: {accuracy:.3f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("\n")

Fold 1:
Model Accuracy: 0.803
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.79      0.79        29
           1       0.81      0.81      0.81        32

    accuracy                           0.80        61
   macro avg       0.80      0.80      0.80        61
weighted avg       0.80      0.80      0.80        61



Fold 2:
Model Accuracy: 0.787
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.78      0.73        23
           1       0.86      0.79      0.82        38

    accuracy                           0.79        61
   macro avg       0.77      0.79      0.78        61
weighted avg       0.79      0.79      0.79        61



Fold 3:
Model Accuracy: 0.754
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.63      0.72        30
           1       0.71      0.87      0.78        31

    accuracy       

### DTC

In [10]:
# Initialize model
model = DecisionTreeClassifier(random_state=42)

# Store results
accuracies = []

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train and predict
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    fold_accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(fold_accuracy)
    
    # Print fold results
    print(f"Fold {fold + 1}:")
    print(f"Accuracy: {fold_accuracy:.3f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("-" * 50 + "\n")

# Print final summary
print(f"Average Accuracy: {sum(accuracies)/len(accuracies):.3f}")


Fold 1:
Accuracy: 0.705
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.79      0.72        29
           1       0.77      0.62      0.69        32

    accuracy                           0.70        61
   macro avg       0.71      0.71      0.70        61
weighted avg       0.72      0.70      0.70        61

--------------------------------------------------

Fold 2:
Accuracy: 0.754
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.78      0.71        23
           1       0.85      0.74      0.79        38

    accuracy                           0.75        61
   macro avg       0.75      0.76      0.75        61
weighted avg       0.77      0.75      0.76        61

--------------------------------------------------

Fold 3:
Accuracy: 0.738
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.70      0.72   

### KNNC

In [13]:
# Initialize model and scaler
model = KNeighborsClassifier(n_neighbors=5)
scaler = StandardScaler()

# Store results
accuracies = []

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Scale features
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train the model
    model.fit(X_train_scaled, y_train)
    
    # Predict and evaluate
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    
    print(f"Fold {fold + 1}:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("\n" + "="*50 + "\n")

# Calculate average accuracy
print(f"Average Accuracy Across {kf.get_n_splits()} Folds: {sum(accuracies)/len(accuracies):.4f}")


Fold 1:
Accuracy: 0.8525
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.83      0.84        29
           1       0.85      0.88      0.86        32

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.85      0.85      0.85        61



Fold 2:
Accuracy: 0.7213
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.65      0.64        23
           1       0.78      0.76      0.77        38

    accuracy                           0.72        61
   macro avg       0.70      0.71      0.71        61
weighted avg       0.72      0.72      0.72        61



Fold 3:
Accuracy: 0.7705
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.67      0.74        30
           1       0.73      0.87      0.79        31

    accuracy                      

### LRC

In [15]:
# Initialize logistic regression model
model = LogisticRegression(solver='liblinear', 
                          max_iter=1000, 
                          random_state=42)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Fold {fold + 1}:")
    print(f"Model Accuracy: {accuracy:.3f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("\n" + "="*50 + "\n")


Fold 1:
Model Accuracy: 0.885
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.86      0.88        29
           1       0.88      0.91      0.89        32

    accuracy                           0.89        61
   macro avg       0.89      0.88      0.88        61
weighted avg       0.89      0.89      0.89        61



Fold 2:
Model Accuracy: 0.770
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.78      0.72        23
           1       0.85      0.76      0.81        38

    accuracy                           0.77        61
   macro avg       0.76      0.77      0.76        61
weighted avg       0.78      0.77      0.77        61



Fold 3:
Model Accuracy: 0.770
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.63      0.73        30
           1       0.72      0.90      0.80        31

    accuracy       