imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_validate

In [2]:

def preprocess(file_path,target_column_index,has_header,delete_list,extraction_list):
    """Load a CSV, label-encode its text columns, shuffle the rows and split features from target.

    No train/test split is made here; the function returns the complete feature
    frame and the complete target series.

    Args:
        file_path: Path of the comma-separated file to read.
        target_column_index: Positional index of the target column.
        has_header: True if the first row holds column names, False if the file
            has no header row (columns are then labelled 0..n-1 by pandas).
        delete_list: Column labels to drop from the features. ``None`` or any
            iterable of labels is accepted.
        extraction_list: Further column labels to drop from the features.
            ``None`` or any iterable of labels is accepted.

    Returns:
        (x_ds, y_ds): the feature DataFrame without the target and without the
        listed columns, and the target Series, both in the same shuffled row order.

    Raises:
        KeyError: if a label in delete_list/extraction_list is not a feature column.
    """
    # header=0 -> first row is the header; header=None -> no header row
    df = pd.read_csv(file_path, header=0 if has_header else None, delimiter=",")
    print(f"Dataset shape: {df.shape}")
    if has_header:
        print(f"Column names: {list(df.columns)}")

    # Find all string/object columns automatically and replace each by integer codes
    string_columns = df.select_dtypes(include=['object']).columns
    print(string_columns)
    for column in string_columns:
        df[column] = LabelEncoder().fit_transform(df[column])

    # Fixed seed so every call on the same file yields the same row order
    df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Normalise the two drop lists so None, tuples and lists can be mixed freely
    unwanted = [
        label
        for labels in (delete_list, extraction_list)
        if labels is not None
        for label in labels
    ]

    # Remove the target first, then the unwanted feature columns
    features_with_extras = df_shuffled.drop(df.columns[target_column_index], axis=1)
    x_ds = features_with_extras.drop(columns=unwanted)
    y_ds = df_shuffled.iloc[:, target_column_index]  # Target column only

    print(f'shape of x: {x_ds.shape}')

    return x_ds, y_ds

In [3]:
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH

def implement(model, x_ds, y_ds, n_splits=10, scoring=None):
    """Cross-validate ``model`` on (x_ds, y_ds) with stratified folds.

    Relies on ``StratifiedKFold`` and ``cross_validate`` from the notebook's
    import cell.

    Args:
        model: Estimator handed to ``cross_validate``.
        x_ds: Feature matrix.
        y_ds: Target labels, used for stratification.
        n_splits: Number of stratified folds (default 10).
        scoring: Scorer names to evaluate. Defaults to balanced accuracy,
            accuracy, weighted precision and weighted recall.

    Returns:
        The result of ``cross_validate`` for the requested scorers.
    """
    if scoring is None:
        scoring = ['balanced_accuracy', 'accuracy', 'precision_weighted', 'recall_weighted']
    # Folds are not reshuffled here; preprocess already shuffles rows with a fixed seed
    skf = StratifiedKFold(n_splits=n_splits)
    return cross_validate(model, x_ds, y_ds, cv=skf, scoring=scoring)

def compute_model_differences(model_name, globals_dict, n_pairs=4):
    """Store the metric change between consecutive cross-validation results.

    Reads ``cv_results1`` .. ``cv_results<2*n_pairs>`` from ``globals_dict`` and
    compares them pairwise: (1,2), (3,4), (5,6), (7,8) for the default
    ``n_pairs=4``. For pair number ``i`` (starting at 0) and each metric it
    writes back into ``globals_dict``:

        <model_name>_<i>_bacc, <model_name>_<i>_precision, <model_name>_<i>_recall

    Each value is the mean test score of the second result minus the mean test
    score of the first, rounded to 3 decimals and formatted as a string:
    '+0.011', '-0.002' or '0'.

    Args:
        model_name: Prefix for the variable names that are written.
        globals_dict: Mapping holding the ``cv_results<k>`` entries; it is
            modified in place.
        n_pairs: Number of consecutive result pairs to compare (default 4).

    Raises:
        KeyError: if a required ``cv_results<k>`` entry or metric key is missing.
    """
    # short suffix used in the variable name -> scorer name used by cross_validate
    metrics = {
        "bacc": "balanced_accuracy",
        "precision": "precision_weighted",
        "recall": "recall_weighted"
    }

    for idx in range(n_pairs):
        cv_a = globals_dict[f"cv_results{2 * idx + 1}"]
        cv_b = globals_dict[f"cv_results{2 * idx + 2}"]

        for short, metric in metrics.items():
            score_key = f"test_{metric}"
            diff = round(cv_b[score_key].mean() - cv_a[score_key].mean(), 3)

            if diff == 0:
                # also catches -0.0, which would otherwise print as '-0.000'
                formatted = "0"
            else:
                sign = "+" if diff > 0 else ""  # negatives already carry their '-'
                formatted = f"{sign}{diff:.3f}"

            globals_dict[f"{model_name}_{idx}_{short}"] = formatted

def create_comparison_table(model_names, globals_dict, output_file="model_comparison.docx",
                            num_datasets=4,
                            metrics=(("precision", "Precision"), ("recall", "Recall"))):
    """
    Creates a Word table with models as columns and one row per dataset/metric.

    Cell values are looked up in ``globals_dict`` under the key
    ``<model>_<dataset index>_<metric suffix>``; a missing key is shown as "N/A".

    Args:
        model_names: List of model names (e.g., ['knn', 'svm', 'rf', 'dt', 'nb', 'lr'])
        globals_dict: Dictionary containing all the model metric variables
        output_file: Output Word document filename
        num_datasets: Number of datasets, numbered from 0 (default 4)
        metrics: (variable suffix, row label) pairs, one table row per dataset
            for each pair (default: precision and recall)
    """
    metrics = list(metrics)

    doc = Document()
    doc.add_heading('Model Performance Comparison', 0)

    # One row per dataset x metric, plus the header row; one extra column for row labels
    num_rows = num_datasets * len(metrics) + 1
    num_cols = len(model_names) + 1

    # Create table
    table = doc.add_table(rows=num_rows, cols=num_cols)
    table.style = 'Light Grid Accent 1'

    # Header row: model names in upper case, centred
    header_cells = table.rows[0].cells
    header_cells[0].text = 'Metric'
    for col_idx, model_name in enumerate(model_names, start=1):
        header_cells[col_idx].text = model_name.upper()
        header_cells[col_idx].paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Body rows, grouped by dataset
    row_idx = 1
    for dataset_idx in range(num_datasets):
        for suffix, label in metrics:
            cells = table.rows[row_idx].cells
            cells[0].text = f'Dataset {dataset_idx} - {label}'
            for col_idx, model_name in enumerate(model_names, start=1):
                value = globals_dict.get(f"{model_name}_{dataset_idx}_{suffix}", "N/A")
                cells[col_idx].text = str(value)
                cells[col_idx].paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER
            row_idx += 1

    # Save document
    doc.save(output_file)
    print(f"Table saved to {output_file}")



Select the dataset!

Runs `preprocess` for the different datasets.

In [4]:
# Beans, all feature columns kept
x_ds1, y_ds1 = preprocess(
    file_path="beans_kmeans.csv",
    target_column_index=16,
    has_header=True,
    delete_list=[],
    extraction_list=[],
)

Dataset shape: (13611, 17)
Column names: ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4', 'Class']
Index(['Class'], dtype='object')
shape of x: (13611, 16)


In [5]:
# Beans, with five feature columns removed through extraction_list
x_ds2, y_ds2 = preprocess(
    file_path="beans_kmeans.csv",
    target_column_index=16,
    has_header=True,
    delete_list=[],
    extraction_list=[
        "ShapeFactor2",
        "Compactness",
        "ShapeFactor3",
        "roundness",
        "MajorAxisLength",
    ],
)

Dataset shape: (13611, 17)
Column names: ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4', 'Class']
Index(['Class'], dtype='object')
shape of x: (13611, 11)


x_ds, y_ds = preprocess("diabetes_kmeans.csv", 8, True, [], [])

x_ds, y_ds = preprocess("diabetes_kmeans.csv", 8, True, [], [])

In [6]:
# Divorce, all feature columns kept
x_ds3, y_ds3 = preprocess(
    file_path="divorce.csv",
    target_column_index=54,
    has_header=True,
    delete_list=[],
    extraction_list=[],
)

Dataset shape: (170, 55)
Column names: ['Atr1', 'Atr2', 'Atr3', 'Atr4', 'Atr5', 'Atr6', 'Atr7', 'Atr8', 'Atr9', 'Atr10', 'Atr11', 'Atr12', 'Atr13', 'Atr14', 'Atr15', 'Atr16', 'Atr17', 'Atr18', 'Atr19', 'Atr20', 'Atr21', 'Atr22', 'Atr23', 'Atr24', 'Atr25', 'Atr26', 'Atr27', 'Atr28', 'Atr29', 'Atr30', 'Atr31', 'Atr32', 'Atr33', 'Atr34', 'Atr35', 'Atr36', 'Atr37', 'Atr38', 'Atr39', 'Atr40', 'Atr41', 'Atr42', 'Atr43', 'Atr44', 'Atr45', 'Atr46', 'Atr47', 'Atr48', 'Atr49', 'Atr50', 'Atr51', 'Atr52', 'Atr53', 'Atr54', 'Class']
Index([], dtype='object')
shape of x: (170, 54)


In [7]:
# Divorce, with fifteen feature columns removed through extraction_list
x_ds4, y_ds4 = preprocess(
    file_path="divorce.csv",
    target_column_index=54,
    has_header=True,
    delete_list=[],
    extraction_list=[
        'Atr29', 'Atr8', 'Atr5', 'Atr22', 'Atr38',
        'Atr40', 'Atr23', 'Atr52', 'Atr54', 'Atr32',
        'Atr33', 'Atr34', 'Atr35', 'Atr36', 'Atr37',
    ],
)

Dataset shape: (170, 55)
Column names: ['Atr1', 'Atr2', 'Atr3', 'Atr4', 'Atr5', 'Atr6', 'Atr7', 'Atr8', 'Atr9', 'Atr10', 'Atr11', 'Atr12', 'Atr13', 'Atr14', 'Atr15', 'Atr16', 'Atr17', 'Atr18', 'Atr19', 'Atr20', 'Atr21', 'Atr22', 'Atr23', 'Atr24', 'Atr25', 'Atr26', 'Atr27', 'Atr28', 'Atr29', 'Atr30', 'Atr31', 'Atr32', 'Atr33', 'Atr34', 'Atr35', 'Atr36', 'Atr37', 'Atr38', 'Atr39', 'Atr40', 'Atr41', 'Atr42', 'Atr43', 'Atr44', 'Atr45', 'Atr46', 'Atr47', 'Atr48', 'Atr49', 'Atr50', 'Atr51', 'Atr52', 'Atr53', 'Atr54', 'Class']
Index([], dtype='object')
shape of x: (170, 39)


In [8]:
# Parkinsons, 'name' column deleted, all other feature columns kept
x_ds5, y_ds5 = preprocess(
    file_path="parkinsons_kmeans.csv",
    target_column_index=17,
    has_header=True,
    delete_list=['name'],
    extraction_list=[],
)

Dataset shape: (195, 24)
Column names: ['name', 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'status', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']
Index(['name'], dtype='object')
shape of x: (195, 22)


In [9]:
# Parkinsons, 'name' deleted and six feature columns removed through extraction_list
x_ds6, y_ds6 = preprocess(
    file_path="parkinsons_kmeans.csv",
    target_column_index=17,
    has_header=True,
    delete_list=['name'],
    extraction_list=[
        'spread1',
        'HNR',
        'PPE',
        'MDVP:APQ',
        'NHR',
        'MDVP:Jitter(Abs)',
    ],
)

Dataset shape: (195, 24)
Column names: ['name', 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'status', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']
Index(['name'], dtype='object')
shape of x: (195, 16)


In [10]:
# Rice, all feature columns kept
x_ds7, y_ds7 = preprocess(
    file_path="rice_binned_kmeans.csv",
    target_column_index=7,
    has_header=True,
    delete_list=[],
    extraction_list=[],
)

Dataset shape: (3810, 8)
Column names: ['Area', 'Perimeter', 'Major_Axis_Length', 'Minor_Axis_Length', 'Eccentricity', 'Convex_Area', 'Extent', 'Class']
Index(['Class'], dtype='object')
shape of x: (3810, 7)


In [11]:
# Rice, with two feature columns removed through extraction_list
x_ds8, y_ds8 = preprocess(
    file_path="rice_binned_kmeans.csv",
    target_column_index=7,
    has_header=True,
    delete_list=[],
    extraction_list=['Minor_Axis_Length', 'Extent'],
)

Dataset shape: (3810, 8)
Column names: ['Area', 'Perimeter', 'Major_Axis_Length', 'Minor_Axis_Length', 'Eccentricity', 'Convex_Area', 'Extent', 'Class']
Index(['Class'], dtype='object')
shape of x: (3810, 5)


x_ds, y_ds = preprocess("wdbc_binned_kmeans.csv", 1, True, ['ID'], [])

x_ds, y_ds = preprocess("wdbc_binned_kmeans.csv", 1, True, ['ID'], [])

Run the `implement` function for the different models.

In [12]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# Evaluate a fresh default KNN on every dataset. The results are unpacked into
# cv_results1..cv_results8 because compute_model_differences looks them up by
# name and compares them pairwise: (1,2), (3,4), (5,6), (7,8).
(
    cv_results1, cv_results2, cv_results3, cv_results4,
    cv_results5, cv_results6, cv_results7, cv_results8,
) = [
    implement(KNeighborsClassifier(), features, labels)
    for features, labels in (
        (x_ds1, y_ds1), (x_ds2, y_ds2), (x_ds3, y_ds3), (x_ds4, y_ds4),
        (x_ds5, y_ds5), (x_ds6, y_ds6), (x_ds7, y_ds7), (x_ds8, y_ds8),
    )
]

# Stores knn_<pair>_bacc / _precision / _recall as signed strings, the same
# representation used for every other model. No manual re-assignment is done
# afterwards, since that would replace the strings with raw floats.
compute_model_differences("knn", globals())

In [13]:
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

# A fresh, identically configured classifier is evaluated on every dataset; the
# results are unpacked into the cv_results1..cv_results8 names that
# compute_model_differences reads.
(
    cv_results1, cv_results2, cv_results3, cv_results4,
    cv_results5, cv_results6, cv_results7, cv_results8,
) = [
    implement(AdaBoostClassifier(n_estimators=100, random_state=42), features, labels)
    for features, labels in (
        (x_ds1, y_ds1), (x_ds2, y_ds2), (x_ds3, y_ds3), (x_ds4, y_ds4),
        (x_ds5, y_ds5), (x_ds6, y_ds6), (x_ds7, y_ds7), (x_ds8, y_ds8),
    )
]

compute_model_differences("ada", globals())

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [14]:
# SVM
from sklearn.svm import SVC 

# A fresh, identically configured classifier is evaluated on every dataset; the
# results are unpacked into the cv_results1..cv_results8 names that
# compute_model_differences reads.
(
    cv_results1, cv_results2, cv_results3, cv_results4,
    cv_results5, cv_results6, cv_results7, cv_results8,
) = [
    implement(SVC(max_iter=-1, random_state=42), features, labels)
    for features, labels in (
        (x_ds1, y_ds1), (x_ds2, y_ds2), (x_ds3, y_ds3), (x_ds4, y_ds4),
        (x_ds5, y_ds5), (x_ds6, y_ds6), (x_ds7, y_ds7), (x_ds8, y_ds8),
    )
]

compute_model_differences("svm", globals())

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [15]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB 

# A fresh default classifier is evaluated on every dataset; the results are
# unpacked into the cv_results1..cv_results8 names that
# compute_model_differences reads.
(
    cv_results1, cv_results2, cv_results3, cv_results4,
    cv_results5, cv_results6, cv_results7, cv_results8,
) = [
    implement(GaussianNB(), features, labels)
    for features, labels in (
        (x_ds1, y_ds1), (x_ds2, y_ds2), (x_ds3, y_ds3), (x_ds4, y_ds4),
        (x_ds5, y_ds5), (x_ds6, y_ds6), (x_ds7, y_ds7), (x_ds8, y_ds8),
    )
]

compute_model_differences("gnb", globals())

In [16]:
# Random Forests
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed (same seed as the AdaBoost and SVM cells) so that the
# full-vs-reduced differences are reproducible across runs instead of varying
# with the forest's random seed.
# Results are unpacked into the cv_results1..cv_results8 names that
# compute_model_differences reads.
(
    cv_results1, cv_results2, cv_results3, cv_results4,
    cv_results5, cv_results6, cv_results7, cv_results8,
) = [
    implement(RandomForestClassifier(random_state=42), features, labels)
    for features, labels in (
        (x_ds1, y_ds1), (x_ds2, y_ds2), (x_ds3, y_ds3), (x_ds4, y_ds4),
        (x_ds5, y_ds5), (x_ds6, y_ds6), (x_ds7, y_ds7), (x_ds8, y_ds8),
    )
]

compute_model_differences("rf", globals())

In [17]:
# Decision Trees
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed (same seed as the AdaBoost and SVM cells) so that the
# full-vs-reduced differences are reproducible across runs.
# Results are unpacked into the cv_results1..cv_results8 names that
# compute_model_differences reads.
(
    cv_results1, cv_results2, cv_results3, cv_results4,
    cv_results5, cv_results6, cv_results7, cv_results8,
) = [
    implement(DecisionTreeClassifier(random_state=42), features, labels)
    for features, labels in (
        (x_ds1, y_ds1), (x_ds2, y_ds2), (x_ds3, y_ds3), (x_ds4, y_ds4),
        (x_ds5, y_ds5), (x_ds6, y_ds6), (x_ds7, y_ds7), (x_ds8, y_ds8),
    )
]

compute_model_differences("dt", globals())

Comparison results, saved as a Word (.docx) table for further use in a LaTeX table

In [18]:
# Prefixes under which the model cells above stored their differences
model_names = ['knn', 'ada', 'svm', 'gnb', 'rf', 'dt']

# compute_model_differences is deliberately NOT called again here. At this point
# cv_results1..cv_results8 hold only the results of the last model cell that ran,
# so recomputing would overwrite every model's values with that one model's
# differences.

# Create the Word table
create_comparison_table(model_names, globals(), "model_comparison.docx")

Table saved to model_comparison.docx
