# **Predictive Modeling for Attorney Involvement in Claims**

## **1. Data Loading and Preprocessing**

#### **Importing Necessary Libraries**

In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from IPython.core.display import display, HTML
import missingno as msno
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder  
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler  
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


  from IPython.core.display import display, HTML


In [99]:
# Function to Style Tables using Pandas

# HTML Styling for Notebook Headers
def display_header(text):
    """Render ``text`` in the cell output as a grey, left-aligned ``<h3>`` heading."""
    heading_markup = f'<h3 style="color:#808080; text-align:left;">{text}</h3>'
    display(HTML(heading_markup))

def display_subheader(text):
    """Render ``text`` in the cell output as a grey, left-aligned ``<h4>`` heading."""
    subheading_markup = f'<h4 style="color:#808080; text-align:left;">{text}</h4>'
    display(HTML(subheading_markup))

# Global function for styling tables 
def style_table(df):
    """Return ``df.style`` with the notebook's shared table look applied.

    Every cell is left-aligned with a thin black border; header cells (``th``)
    get a dark blue background with bold white text.
    """
    cell_properties = {
        'text-align': 'left',
        'border': '1px solid black',
    }
    header_rule = {
        'selector': 'th',
        'props': [('background-color', '#2a3f5f'), ('color', 'white'), ('font-weight', 'bold')],
    }
    styler = df.style.set_properties(**cell_properties)
    return styler.set_table_styles([header_rule])


#### **Loading Dataset**

In [None]:
# Read the raw CSV into `df`; the path is relative, so it is resolved
# against the notebook's current working directory.
file_path = r"Sample_Dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
display_subheader("Dataset uploaded successfully.")


#### **Dataset Structure**

In [101]:
# # Display Dataset Overview (First 5 Rows)
# display_header("Dataset Overview: ")
# display(style_table(df.head()))  # Display first 5 rows

# # Display Dataset Information in Styled Table
# display_header("Dataset Information: ")
# info_df = pd.DataFrame({
#     "Columns": df.columns,  # Show only column names
#     "Non-Null Count": df.notnull().sum(),
#     "Data Type": df.dtypes.astype(str)

# }).reset_index(drop=True)  # Remove the index
# display(style_table(info_df))


In [102]:
# Report the dataset dimensions as a small grey heading
row_count, column_count = df.shape
display(HTML(f"<h5 style='color: Gray;'> Dataset Shape : {row_count} rows, {column_count} columns</h5>"))


#### **Identifying Missing and Duplicate Data**

In [103]:
# Build the per-column missing-value summary (count and percentage of rows).
# The table is only constructed here; this cell does not render it.
display_subheader("Missing Values : ")
null_counts = df.isnull().sum()
missing_df = pd.DataFrame({
    "Columns": df.columns,
    "Missing Values": null_counts,
    "Percentage": (null_counts / len(df)) * 100,
}).reset_index(drop=True)  # replace the column-name index with a positional one



In [104]:
# Count rows that exactly repeat an earlier row and report the total
duplicate_count = df.duplicated().sum()
display_subheader(f"Duplicate Rows: {duplicate_count}")


#### **Basic Statistics**

In [105]:
# Descriptive statistics, one row per column of `df.describe()`
display_subheader("Descriptive Statistics : ")

desc_stats = (
    df.describe()
    .T
    .reset_index()
    .rename(columns={"index": "Columns"})  # the former index holds the column names
)



In [106]:
# Summarise every object-dtype column: distinct-value count and most frequent value
display_subheader("Categorical Variable Summary : ")

categorical_cols = df.select_dtypes(include=['object']).columns

distinct_counts = df[categorical_cols].nunique().tolist()
# mode() can return several values on ties; [0] keeps the first one it reports
most_frequent = [df[name].mode()[0] for name in categorical_cols]

cat_summary = pd.DataFrame({
    "Columns": categorical_cols,
    "Unique Values": distinct_counts,
    "Most Frequent Value": most_frequent,
})




## **2. Exploratory Data Analysis (EDA)**

#### **Understanding the Target Variable (ATTORNEY)**

**Distribution Plot of ATTORNEY**

In [107]:
# # Visualizing the distribution of the target variable
# plt.figure(figsize=(6,4))
# sns.countplot(x=df['ATTORNEY'], palette=["#1E88E5", "#D32F2F"])
# plt.xlabel("Attorney Involvement (0 = No, 1 = Yes)")
# plt.ylabel("Count")
# plt.title("Distribution of Attorney Involvement in Claims")
# plt.xticks(ticks=[0,1], labels=["No Attorney", "Attorney"])
# plt.show()


**Pie Chart for Proportion**

In [108]:
# # Pie Chart for Attorney vs. No Attorney cases with Legend
# plt.figure(figsize=(6,6))
# colors = ["#1E88E5", "#D32F2F"]
# labels = ["No Attorney", "Attorney"]

# # Create pie chart
# wedges, texts, autotexts = plt.pie(
#     df['ATTORNEY'].value_counts(), 
#     autopct='%1.1f%%', 
#     startangle=90, 
#     colors=colors, 
#     labels=labels
# )

# # Add legend
# plt.legend(wedges, labels, title="Claim Status", loc="upper right", bbox_to_anchor=(1.3, 0.9))

# plt.ylabel('')
# plt.title("Proportion of Claims with Attorney Involvement")
# plt.show()


#### **Univariate Analysis - Categorical Features**

**Bar Chart for Categorical Features**

In [None]:
# # Improved Layout for Categorical Features
# fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# axes = axes.flatten()  

# categorical_cols = []

# for i, col in enumerate(categorical_cols):
#     sns.countplot(x=df[col], ax=axes[i], palette="muted")
#     axes[i].set_title(f"Distribution of {col}")
#     axes[i].set_xlabel(col)
#     axes[i].set_ylabel("Count")
#     axes[i].tick_params(axis='x', rotation=45)  # Rotate labels for better visibility

# # Hide the last subplot if unused
# if len(categorical_cols) < len(axes):
#     fig.delaxes(axes[-1])

# plt.tight_layout()
# plt.show()


#### **Univariate Analysis - Numerical Features**

**Histograms & KDE Plots for Numerical Features**

In [None]:
# # Identifying Numerical Columns (Excluding Target Variable)
# numerical_cols = []

# # Plot Histograms Only
# display_header("Histograms of Numerical Features")

# fig, axes = plt.subplots(3, 3, figsize=(15, 12))  # 3x3 grid layout for clear visibility
# axes = axes.flatten()

# for i, col in enumerate(numerical_cols):
#     sns.histplot(df[col], bins=30, ax=axes[i], color="#1E88E5")
#     axes[i].set_title(f"Histogram of {col}")
#     axes[i].set_xlabel(col)
#     axes[i].set_ylabel("Frequency")

# for j in range(i+1, len(axes)):
#     fig.delaxes(axes[j])

# plt.tight_layout()
# plt.show()


In [111]:
# # Plot KDE Plots 
# display_header("KDE Plots of Numerical Features")

# fig, axes = plt.subplots(3, 3, figsize=(15, 12))  # 3x3 grid layout for clear visibility
# axes = axes.flatten()

# for i, col in enumerate(numerical_cols):
#     sns.kdeplot(df[col], fill=True, ax=axes[i], color="#D32F2F")
#     axes[i].set_title(f"KDE Plot of {col}")
#     axes[i].set_xlabel(col)
#     axes[i].set_ylabel("Density")

# # Hide empty plots if fewer than 9 numerical columns
# for j in range(i+1, len(axes)):
#     fig.delaxes(axes[j])

# plt.tight_layout()
# plt.show()


#### **Missing & Null Values Analysis**

**Bar Chart for Missing Values**

In [112]:
# # Display Missing Values Bar Chart
# missing_count = df.isnull().sum()
# missing_count = missing_count[missing_count > 0].sort_values(ascending=False)

# plt.figure(figsize=(10,5))
# sns.barplot(x=missing_count.index, y=missing_count.values, palette="Reds")
# plt.xticks(rotation=45)
# plt.ylabel("Missing Values Count")
# plt.xlabel("Columns")
# plt.title("Missing Values Per Column")
# plt.show()


#### **Bivariate Analysis (Analyzing Relationships Between Features & ATTORNEY)**

**Categorical Features vs. ATTORNEY**

**Grouped Bar Chart for Categorical Features**

In [None]:
# # Define Categorical Features
# categorical_cols = []

# # Adjust layout for better spacing
# display_header("Categorical Features vs. Attorney Involvement")

# fig, axes = plt.subplots(3, 1, figsize=(10, 15))  # 3-row layout for better spacing

# for i, col in enumerate(categorical_cols):
#     sns.countplot(x=df[col], hue=df["ATTORNEY"], palette="muted", ax=axes[i])
#     axes[i].set_title(f"{col} vs. Attorney Involvement", fontsize=14)
#     axes[i].set_xlabel(col, fontsize=12)
#     axes[i].set_ylabel("Count", fontsize=12)
#     axes[i].tick_params(axis='x', rotation=45)

# plt.tight_layout()
# plt.show()


**Stacked Bar Chart for Categorical Features**

In [114]:
# # Adjust layout for better spacing
# display_header("Stacked Bar Chart: Categorical Features vs. Attorney Involvement")

# fig, axes = plt.subplots(3, 1, figsize=(10, 15))  # 3-row layout for better visibility

# for i, col in enumerate(categorical_cols):
#     cross_tab = pd.crosstab(df[col], df["ATTORNEY"], normalize="index") * 100
#     cross_tab.plot(kind="bar", stacked=True, colormap="Paired", ax=axes[i])
#     axes[i].set_title(f"{col} vs. Attorney Involvement (Stacked)", fontsize=14)
#     axes[i].set_xlabel(col, fontsize=12)
#     axes[i].set_ylabel("Percentage (%)", fontsize=12)
#     axes[i].legend(title="Attorney", labels=["No Attorney", "Attorney"])

# plt.tight_layout()
# plt.show()


**Bivariate Analysis (Numerical Features vs. ATTORNEY)**

**Boxplots for Numerical Features vs. ATTORNEY**

In [None]:
# # Define Numerical Columns (Excluding Categorical & Target Variable)
# numerical_cols = []

# # Boxplots for Numerical Features
# display_header("Boxplots: Numerical Features vs. Attorney Involvement")

# fig, axes = plt.subplots(2, 2, figsize=(14, 10))  # 2x2 grid for clear visualization
# axes = axes.flatten()

# for i, col in enumerate(numerical_cols):
#     sns.boxplot(x=df["ATTORNEY"], y=df[col], palette="coolwarm", ax=axes[i])
#     axes[i].set_title(f"{col} vs. Attorney Involvement", fontsize=14)
#     axes[i].set_xlabel("Attorney Involvement (0 = No, 1 = Yes)")
#     axes[i].set_ylabel(col)

# plt.tight_layout()
# plt.show()


**Violin Plots for Numerical Features vs. ATTORNEY**

In [116]:
# # Violin Plots for Numerical Features
# display_header("Violin Plots: Numerical Features vs. Attorney Involvement")

# fig, axes = plt.subplots(2, 2, figsize=(14, 10))  # 2x2 layout for better spacing
# axes = axes.flatten()

# for i, col in enumerate(numerical_cols):
#     sns.violinplot(x=df["ATTORNEY"], y=df[col], palette="muted", ax=axes[i])
#     axes[i].set_title(f"{col} vs. Attorney Involvement", fontsize=14)
#     axes[i].set_xlabel("Attorney Involvement (0 = No, 1 = Yes)")
#     axes[i].set_ylabel(col)

# plt.tight_layout()
# plt.show()


#### **Correlation Analysis**

**Correlation Heatmap**

In [117]:
# # Selecting Only Numerical Columns (Excluding Identifier Columns)
# numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# # Remove 'CASENUM' from the list (if present)
# if 'CASENUM' in numerical_columns:
#     numerical_columns.remove('CASENUM')

# # Compute Correlation Matrix
# corr_matrix = df[numerical_columns].corr()

# plt.figure(figsize=(10,6))
# sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
# plt.title("Correlation Heatmap")
# plt.show()


**Pairplot for Numerical Features**

In [None]:
# # Selecting Key Numerical Features for Pairplot
# pairplot_cols = []

# # Plot Pairplot
# display_header("Pairplot for Numerical Features")

# sns.pairplot(df[pairplot_cols], hue="ATTORNEY", diag_kind="kde", palette="muted")
# plt.show()


**Scatterplots for Categorical vs. Numerical Feature Interactions**

In [None]:
# # Scatterplots for Categorical vs. Numerical Features

# # Selecting Categorical & Numerical Features for Scatterplots
# categorical_features = [""]
# numerical_features = [""]

# # Creating a Grid of Scatterplots
# fig, axes = plt.subplots(len(categorical_features), len(numerical_features), figsize=(15, 12))
# fig.suptitle("Scatterplots: Categorical vs. Numerical Features", fontsize=14, fontweight="bold")

# # Plot Scatterplots
# for i, cat_feature in enumerate(categorical_features):
#     for j, num_feature in enumerate(numerical_features):
#         sns.scatterplot(data=df, x=num_feature, y=cat_feature, hue="ATTORNEY", alpha=0.7, ax=axes[i, j])
#         axes[i, j].set_xlabel(num_feature)
#         axes[i, j].set_ylabel(cat_feature)

# plt.tight_layout(rect=[0, 0, 1, 0.96])
# plt.show()


#### **Skewness of Numerical Features**

In [120]:
# # Compute Skewness for All Numerical Features
# display_header("Skewness of Numerical Features :")
# skew_values = df[numerical_columns].skew().reset_index()
# skew_values.columns = ["Columns", "Skewness"]
# skew_values["Skewness"] = skew_values["Skewness"].round(2)

# # Display Skewness Table Using style_table() Function (Without HTML)
# display(style_table(skew_values))


**Visualizing Skewness (Histograms & KDE Plots)**

In [121]:
# # Plot Histograms & KDE for Skewness Analysis
# display_subheader("Skewness Analysis: Histograms & KDE Plots")

# # Calculate the number of rows and columns needed for the subplots
# num_cols = 3  # Number of columns in the grid
# num_rows = int(np.ceil(len(numerical_columns) / num_cols))  # Calculate the number of rows needed

# fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows))  # Create the subplots grid
# axes = axes.flatten()  # Flatten the axes array for easier iteration

# for i, col in enumerate(numerical_columns):
#     sns.histplot(df[col], kde=True, ax=axes[i], bins=30, color="teal")
#     axes[i].set_title(f"Distribution of {col}")

# # Hide any extra empty subplots
# for j in range(i + 1, len(axes)):
#     fig.delaxes(axes[j])

# plt.tight_layout()
# plt.show()

#### **Handling Missing Values**

**Handling Missing Values by Type**

In [122]:
# Fill gaps in numeric columns with that column's median. Only numeric columns
# have a median here, so missing values in other columns are left as they are.
numeric_medians = df.median(numeric_only=True)
df_clean = df.fillna(numeric_medians)
# df_clean

In [123]:
# df_clean.isnull().sum()

In [None]:
scale = StandardScaler()

# The case identifier is dropped before encoding
df_clean = df_clean.drop(columns=["CASENUM"])

# Integer codes for the three categorical columns
pol_type = {"Third-Party": 1, "Comprehensive": 0}
drive_rec = {"Clean": 0, "Minor Offenses": 1, "Major Offenses": 2}
acc_sev = {"Minor": 0, "Moderate": 1, "Severe": 2}

# Series.map turns any label that is not a key of its dictionary into NaN.
# NOTE(review): this cell is not re-runnable as written - a second run fails on
# the missing CASENUM column.
for column_name, code_map in (
    ("Policy_Type", pol_type),
    ("Driving_Record", drive_rec),
    ("Accident_Severity", acc_sev),
):
    df_clean[column_name] = df_clean[column_name].map(code_map)
# df_clean

#### **Outlier Detection**

**Detect Outliers Using Boxplots**

In [125]:
# Names of all float64/int64 columns of the raw frame. Nothing is filtered out
# here: identifier and target columns are included if they have these dtypes.
numerical_columns = list(df.select_dtypes(include=['float64', 'int64']).columns)

# # Plot Boxplots for Outlier Detection
# display_header("Boxplots for Outlier Detection")

# num_features = len(numerical_columns) - 1  # Exclude 'ATTORNEY' (Target Variable)
# rows = (num_features // 3) + 1  

# fig, axes = plt.subplots(rows, 3, figsize=(15, 5 * rows)) 
# axes = axes.flatten()  

# for i, col in enumerate(numerical_columns):
#     if col != "ATTORNEY":  # Exclude Target Variable
#         sns.boxplot(y=df[col], ax=axes[i], color="lightblue")
#         axes[i].set_title(f"Boxplot of {col}")

# # Hide any extra empty subplots
# for j in range(i + 1, len(axes)):
#     fig.delaxes(axes[j])

# plt.tight_layout()
# plt.show()


**Implementing IQR for Outlier Detection**

In [None]:
# Count, per feature, how many rows of the raw frame fall outside the 1.5 * IQR fences
display_header("Outlier Detection using IQR Method")


def _iqr_outlier_count(values):
    """Return how many entries of ``values`` lie strictly outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR."""
    first_q = values.quantile(0.25)
    third_q = values.quantile(0.75)
    spread = third_q - first_q
    below = values < first_q - 1.5 * spread
    above = values > third_q + 1.5 * spread
    return int((below | above).sum())


# Features to check (binary columns are meant to be skipped); empty in this notebook
iqr_features = []

outlier_counts = {feature: _iqr_outlier_count(df[feature]) for feature in iqr_features}

# One (Feature, Outlier Count) row per checked column
outlier_df = pd.DataFrame(list(outlier_counts.items()), columns=["Feature", "Outlier Count"])

# Display Outlier Summary Table



In [127]:
# Function to remove outliers using IQR
def remove_outliers(df, columns):
    """Keep only the rows inside the 1.5 * IQR fences of every listed column.

    Columns are processed in order, and each column's quartiles are computed on
    the rows that survived the previous columns. Rows whose value is missing in
    a listed column are dropped as well, because they fail the range check.
    The input frame is not modified; with an empty ``columns`` it is returned
    unchanged.
    """
    kept = df
    for name in columns:
        first_q, third_q = (kept[name].quantile(q) for q in (0.25, 0.75))
        reach = 1.5 * (third_q - first_q)
        # between() is inclusive on both ends, so values on a fence are kept
        inside = kept[name].between(first_q - reach, third_q + reach)
        kept = kept[inside]
    return kept

# Columns whose IQR outliers are removed from the cleaned frame (extend as needed)
num_cols = [
    "LOSS",
    "Claim_Amount_Requested",
    "Settlement_Amount",
    "CLMAGE",
]
df_clean = remove_outliers(df_clean, columns=num_cols)

# df_clean

## **3. Feature Engineering and Data Preprocessing**

In [128]:
# Create new calculated columns: relative gaps between the three monetary fields.
# `df_clean` is the result of boolean filtering, so writing columns into it in
# place is the pattern that triggers pandas' SettingWithCopyWarning. `assign`
# builds a new frame instead and keeps this cell safe to re-run.
settlement = df_clean["Settlement_Amount"]
requested = df_clean["Claim_Amount_Requested"]
loss = df_clean["LOSS"]

# NOTE(review): a zero denominator produces inf/NaN here - confirm that the
# amounts are always non-zero after outlier removal.
df_clean = df_clean.assign(**{
    "set-loss": (settlement - loss) / settlement,
    "claim-loss": (requested - loss) / requested,
    "claim-set": (requested - settlement) / requested,
})

# Drop original columns now that they are represented by the ratios
df_trial1 = df_clean.drop(["LOSS", "Claim_Amount_Requested", "Settlement_Amount"], axis=1)

# Separate features (X) and target variable (y)
X = df_trial1.drop(["ATTORNEY"], axis=1)
y = df_trial1["ATTORNEY"]

# Standardize the data.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# made later in the notebook, so test rows influence the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # No reshaping needed

# Convert back to DataFrame with correct column names (the index restarts at 0)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [129]:
# plt.figure(figsize=(15, 10))  # Set figure size
# sns.heatmap(df_trial1.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
# plt.title("Correlation Heatmap", fontsize=14)
# plt.show()

## **4. Feature Selection & Dimensionality Reduction**

In [130]:
# Fit a full PCA (all components kept) to see how variance accumulates
pca = PCA()
pca.fit(X_scaled)

# Running total of the explained-variance ratios, component by component
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# # Plot explained variance
# plt.figure(figsize=(8, 5))
# plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')
# plt.xlabel("Number of Components")
# plt.ylabel("Cumulative Explained Variance")
# plt.title("Choosing the Optimal Number of PCA Components")
# plt.axhline(y=0.95, color='r', linestyle='--')  # 95% variance threshold
# plt.grid()
# plt.show()

In [131]:
# Smallest number of components whose cumulative explained variance reaches 95%:
# argmax on the boolean mask returns the position of the first True entry.
reaches_target = cumulative_variance >= 0.95
optimal_n = reaches_target.argmax() + 1
print(f"Optimal number of components: {optimal_n}")

# Refit PCA with that many components and project the scaled features
pca = PCA(n_components=optimal_n)
principal_components = pca.fit_transform(X_scaled)

# # Convert to DataFrame
# pca_df = pd.DataFrame(principal_components, columns=[f'PC{i+1}' for i in range(optimal_n)])

# # Scatter plot of first two components
# plt.figure(figsize=(8, 6))
# sns.scatterplot(x=pca_df['PC1'], y=pca_df['PC2'])
# plt.xlabel('Principal Component 1')
# plt.ylabel('Principal Component 2')
# plt.title('PCA Scatter Plot')
# plt.grid()
# plt.show()

# # Explained variance ratio plot
# plt.figure(figsize=(8, 5))
# plt.plot(range(1, optimal_n + 1), np.cumsum(pca.explained_variance_ratio_), marker='o')
# plt.xlabel("Number of Components")
# plt.ylabel("Cumulative Explained Variance")
# plt.title("Explained Variance vs. Number of Components")
# plt.grid()
# plt.show()

Optimal number of components: 10


## **5. Model Building & Evaluation**

**Splitting the Dataset**

In [132]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

# Wrap the PCA scores in a DataFrame. `pca_df` was previously created only in
# commented-out plotting code, so this cell raised NameError on a fresh kernel.
pca_df = pd.DataFrame(
    principal_components,
    columns=[f'PC{i+1}' for i in range(principal_components.shape[1])],
)

# Features are the principal components; `y` is the ATTORNEY target. The two are
# matched by row position (pca_df has a fresh 0..n-1 index, y keeps its own).
x = pca_df
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1069, 10), (268, 10), (1069,), (268,))

In [133]:
# Verify the class distribution in both sets
def _class_percentages(labels):
    """Return a (Class, Percentage) table with each label's share of ``labels`` in percent."""
    shares = pd.DataFrame(labels.value_counts(normalize=True) * 100).reset_index()
    shares.columns = ['Class', 'Percentage']
    return shares


train_class_distribution = _class_percentages(y_train)
test_class_distribution = _class_percentages(y_test)

# Show both tables, each under its own heading
for heading, distribution in (
    ("Class Distribution in Train Set", train_class_distribution),
    ("Class Distribution in Test Set", test_class_distribution),
):
    display_header(heading)
    display(style_table(distribution))


Unnamed: 0,Class,Percentage
0,0,51.543499
1,1,48.456501


Unnamed: 0,Class,Percentage
0,1,50.373134
1,0,49.626866


In [134]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grids for each model (keys match `models_dict` below)
param_grids = {
    'xgb': {'n_estimators': [100, 200], 'max_depth': [3, 6], 'learning_rate': [0.01, 0.1]},
    'lgb': {'n_estimators': [100, 200], 'max_depth': [3, 6], 'learning_rate': [0.01, 0.1]},
    'rf': {'n_estimators': [100, 200], 'max_depth': [None, 10]},
    'dt': {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5]},
    'lr': {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
    'svc': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'knn': {'n_neighbors': [3, 5, 7], 'metric': ['euclidean', 'manhattan']},
    'nb': {}  # No hyperparameters for Naive Bayes
}

# Models dictionary: untuned base estimators handed to the grid search.
# Estimators with internal randomness get a fixed seed (the same value used for
# the train/test split) so the selected parameters are reproducible across runs.
models_dict = {
    'xgb': xgb.XGBClassifier(random_state=42),
    'lgb': lgb.LGBMClassifier(random_state=42),
    'rf': RandomForestClassifier(random_state=42),
    'dt': DecisionTreeClassifier(random_state=42),
    'lr': LogisticRegression(random_state=42),  # liblinear (set by the grid) shuffles the data
    'svc': SVC(),
    'knn': KNeighborsClassifier(),
    'nb': GaussianNB()
}

In [135]:
# Tune every base model with 5-fold cross-validated grid search on the training
# split (accuracy as the selection metric, all cores) and keep the winners.
best_params = {}
for name, model in models_dict.items():
    print(f"Running GridSearch for {name}...")
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grids.get(name, {}),  # a model without a grid is fitted once as-is
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid.fit(x_train, y_train)
    best_params[name] = grid.best_params_
    print(f"Best parameters for {name}: {best_params[name]}")

# Print best parameters
print("Best parameters for each model:")
for model_name, params in best_params.items():
    print(f"{model_name}: {params}")

Running GridSearch for xgb...
Best parameters for xgb: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
Running GridSearch for lgb...
[LightGBM] [Info] Number of positive: 518, number of negative: 551
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000275 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1069, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.484565 -> initscore=-0.061760
[LightGBM] [Info] Start training from score -0.061760
Best parameters for lgb: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Running GridSearch for rf...
Best parameters for rf: {'max_depth': 10, 'n_estimators': 200}
Running GridSearch for dt...
Best parameters for dt: {'max_depth': None, 'min_samples_split': 2}
Running GridSearch for lr...
Best parameters for lr: {'C': 0.1, 'penalty': 'l1', 'solver': 

In [136]:
best_params

{'xgb': {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200},
 'lgb': {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100},
 'rf': {'max_depth': 10, 'n_estimators': 200},
 'dt': {'max_depth': None, 'min_samples_split': 2},
 'lr': {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'},
 'svc': {'C': 0.1, 'kernel': 'linear'},
 'knn': {'metric': 'manhattan', 'n_neighbors': 7},
 'nb': {}}

In [137]:
# Rebuild each model from its tuned parameters. Estimators with internal
# randomness get a fixed seed so the reported metrics are reproducible; a
# `random_state` found by the search would take precedence over the default.
seeded = {'random_state': 42}

model_xgb = xgb.XGBClassifier(**{**seeded, **best_params['xgb']})
model_lgb = lgb.LGBMClassifier(**{**seeded, **best_params['lgb']})
model_rf = RandomForestClassifier(**{**seeded, **best_params['rf']})
model_dt = DecisionTreeClassifier(**{**seeded, **best_params['dt']})
model_lr = LogisticRegression(**{**seeded, **best_params['lr']})
model_svc = SVC(**best_params['svc'])
model_knn = KNeighborsClassifier(**best_params['knn'])
model_nb = GaussianNB()  # No params needed

# `models` and `model_names` are parallel lists and must stay in the same order
models = [model_xgb, model_lgb, model_rf, model_dt, model_lr, model_svc, model_knn, model_nb]
model_names = ['XGBoost', 'LightGBM', 'RandomForest', 'DecisionTree',
               'LogisticRegression', 'SVC', 'KNN', 'NaiveBayes']

In [138]:
# Fit every tuned model on the training split and score it on the test split
results = []
scorers = (accuracy_score, precision_score, recall_score, f1_score)

for name, model in zip(model_names, models):
    print(f"Training and Evaluating {name}...")

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Same order as the result columns: accuracy, precision, recall, F1
    accuracy, precision, recall, f1 = (score(y_test, y_pred) for score in scorers)
    results.append([name, accuracy, precision, recall, f1])

# One row per model
results_df = pd.DataFrame(
    results,
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

# # Print the results table
# print(results_df)

# # Optionally, display as a sorted table by F1 Score
# results_df = results_df.sort_values(by='F1 Score', ascending=False)
# print("\nSorted Results by F1 Score:")
# print(results_df)

Training and Evaluating XGBoost...
Training and Evaluating LightGBM...
[LightGBM] [Info] Number of positive: 518, number of negative: 551
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000256 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1069, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.484565 -> initscore=-0.061760
[LightGBM] [Info] Start training from score -0.061760
Training and Evaluating RandomForest...
Training and Evaluating DecisionTree...
Training and Evaluating LogisticRegression...
Training and Evaluating SVC...
Training and Evaluating KNN...
Training and Evaluating NaiveBayes...


In [139]:
# results_df

In [140]:
# Normalize the data: the scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Define the neural network model
def create_nn(input_dim=None):
    """Build and compile the binary-classification network.

    Architecture: Dense(64) -> Dropout(0.2) -> Dense(32) -> Dropout(0.2) ->
    Dense(16) -> Dense(1, sigmoid), compiled with Adam (learning rate 0.001),
    binary cross-entropy loss and accuracy as the tracked metric.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to the column count of the global
        ``x_train``, which matches the previous behaviour.

    Returns
    -------
    The compiled Sequential model.
    """
    # Local import keeps this cell self-contained; an explicit Input layer
    # replaces the `input_shape` argument on the first Dense layer, which
    # Keras 3 warns about.
    from keras.layers import Input

    if input_dim is None:
        input_dim = x_train.shape[1]

    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid')  # Binary classification
    ])

    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Create and train the model.
# NOTE(review): the test split doubles as validation data here; it is only
# monitored (no early stopping or checkpointing is configured in this cell).
model_nn = create_nn()
history = model_nn.fit(x_train_scaled, y_train,
                       validation_data=(x_test_scaled, y_test),
                       epochs=50, batch_size=32, verbose=1)

# Predict on test data: sigmoid outputs above 0.5 are labelled as class 1
y_pred_nn = (model_nn.predict(x_test_scaled) > 0.5).astype(int)

# Compute evaluation metrics (these overwrite the names used in the model loop)
accuracy = accuracy_score(y_test, y_pred_nn)
precision = precision_score(y_test, y_pred_nn)
recall = recall_score(y_test, y_pred_nn)
f1 = f1_score(y_test, y_pred_nn)

# Print results
# print(f"Neural Network Performance:\nAccuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5197 - loss: 0.7014 - val_accuracy: 0.4963 - val_loss: 0.6904
Epoch 2/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5342 - loss: 0.6898 - val_accuracy: 0.5224 - val_loss: 0.6894
Epoch 3/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5519 - loss: 0.6864 - val_accuracy: 0.5261 - val_loss: 0.6885
Epoch 4/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5428 - loss: 0.6801 - val_accuracy: 0.5112 - val_loss: 0.6879
Epoch 5/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5836 - loss: 0.6775 - val_accuracy: 0.5224 - val_loss: 0.6884
Epoch 6/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5501 - loss: 0.6793 - val_accuracy: 0.5299 - val_loss: 0.6885
Epoch 7/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━

In [141]:
# Append Neural Network results to results_df.
# The row uses the metrics computed in the neural-network cell (`accuracy`,
# `precision`, `recall`, `f1`) rather than numbers copied from an earlier run,
# so the table always reflects the model that was actually trained.
nn_results = pd.DataFrame([["Neural Network", accuracy, precision, recall, f1]],
                          columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Drop any row left by a previous run of this cell so that re-running it does
# not add a duplicate "Neural Network" entry.
results_df = pd.concat(
    [results_df[results_df['Model'] != "Neural Network"], nn_results],
    ignore_index=True,
)

# # Print updated results
# print("\nUpdated Results with Neural Network:")
# results_df

In [142]:
# from sklearn.metrics import roc_curve, roc_auc_score
# import matplotlib.pyplot as plt

# # Get predicted probabilities
# y_prob = model.predict_proba(x_test)[:, 1]  # Probability for class 1

# # Calculate FPR, TPR, and threshold values
# fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# # Compute ROC-AUC score
# roc_auc = roc_auc_score(y_test, y_prob)

# # Plot the curve
# plt.figure(figsize=(8, 6))
# plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})", color='blue')
# plt.plot([0, 1], [0, 1], linestyle="--", color='grey')  # Diagonal line
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC-AUC Curve")
# plt.legend(loc="lower right")
# plt.grid(True)
# plt.show()


In [143]:

# # Set positions for bars
# x = np.arange(len(results_df['Model']))

# # Define width of bars
# bar_width = 0.2

# # Plot bars for each metric
# plt.figure(figsize=(12, 6))
# plt.bar(x - 1.5 * bar_width, results_df['Accuracy'], width=bar_width, label='Accuracy', color='blue')
# plt.bar(x - 0.5 * bar_width, results_df['Precision'], width=bar_width, label='Precision', color='green')
# plt.bar(x + 0.5 * bar_width, results_df['Recall'], width=bar_width, label='Recall', color='orange')
# plt.bar(x + 1.5 * bar_width, results_df['F1 Score'], width=bar_width, label='F1 Score', color='red')

# # Set labels and title
# plt.xlabel('Models')
# plt.ylabel('Scores')
# plt.title('Model Performance Comparison')
# plt.xticks(ticks=x, labels=results_df['Model'], rotation=45)
# plt.legend()

# # Show plot
# plt.tight_layout()
# plt.show()


In [144]:
# import joblib

# # Assuming `logistic_regression_model` is your trained model
# joblib.dump(model_lr, 'best_model.pkl')
