In [None]:
!pip install category_encoders
!pip install optuna

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
import category_encoders as ce
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler,MinMaxScaler,StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import optuna
from sklearn.model_selection import cross_val_score
import optuna

pd.set_option('display.max_columns', None)  # will display all columns

In [None]:
data = pd.read_csv('/content/drive/MyDrive/RTA Dataset.csv')
data.head()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data['Accident_severity'].value_counts()

# **CONVERTING THE DATA TYPES**

1. Time column - object to datetime datatype
2. All other object columns to category datatype

In [None]:
data['Time'] = pd.to_datetime(data['Time'])

In [None]:
object_columns = data.select_dtypes(include='object').columns
object_columns

In [None]:
# Cast every object (string) column to the pandas category dtype in one call.
data = data.astype({name: 'category' for name in object_columns})

In [None]:
data['Casualty_class'].value_counts()

In [None]:
data.info()

# **DUPLICATES AND DESCRIPTIONS**

1.Check for duplicates - if so remove them

2.Get the descriptive statistics for both numerical and categorical columns

In [None]:
data.duplicated().sum()

In [None]:
data.describe()

In [None]:
data.describe(include = 'category')

**GRAPHS**

# **UNIVARIATE**

---
1.Numerical - hist, box

2.Categorical - count

3.datetime - extract numeric - hist , box



In [None]:
# Keep only the hour of day at which each accident happened, then discard the
# full timestamp column.
data = data.assign(Hour=data['Time'].dt.hour).drop(columns='Time')


In [None]:
data.info()

In [None]:
# Column groups used by the plots below.
numerical_columns = [
  'Number_of_vehicles_involved',
  'Number_of_casualties',
  'Hour',
]
category_cols = object_columns

In [None]:
def plot_numeric(col):
  """Draw and show a histogram of column `col` of the global `data` frame."""
  sns.histplot(x=col, data=data)
  plt.show()


In [None]:
for col in numerical_columns:
  plot_numeric(col)

**INFERENCES**

Graph - 1

*   Involvement of **Two vehicles** is the maximum and **Five** being the minimum

Graph - 2

*   **One** casualty is the most frequent count and **Eight** is the least frequent

Graph - 3

*  Most of the accidents happened between **12 and 18 hrs**, with **17** being the peak hour






In [None]:
def plot_numeric(col):
  """Draw and show a box plot of column `col` of the global `data` frame.

  Note: this rebinds the name `plot_numeric`, replacing the histogram
  version defined earlier in the notebook.
  """
  sns.boxplot(y=col, data=data)
  plt.show()

In [None]:
for col in numerical_columns:
  plot_numeric(col)

In [None]:
sns.countplot(data=data,x='Work_of_casuality')
plt.xticks(rotation=90)
plt.show()

**INFERENCE**

Drivers are most affected by accidents


In [None]:
sns.countplot(data=data,x='Sex_of_driver')
plt.show()

**INFERENCE**

Male gender causes most of the accidents

# **BIVARIATE**

---
1.Numeric vs Target - numeric vs category - count(if range is too small),box

2.Category vs Target - category vs category - count

In [None]:
# Spread of the number of vehicles involved within each severity class.
sns.boxplot(x='Accident_severity', y='Number_of_vehicles_involved', data=data)
plt.show()  # render the figure instead of echoing the Axes repr, like the other plot cells

**INFERENCE**

* Fatal and serious injuries happen when very few vehicles are involved










In [None]:
# Accident counts per number of vehicles involved, split by severity class.
sns.countplot(hue='Accident_severity', x='Number_of_vehicles_involved', data=data)
plt.show()  # render the figure instead of echoing the Axes repr, like the other plot cells

**INFERENCE**

The Accident severity is maximum when **Two vehicles** are involved

In [None]:
sns.countplot(x='Work_of_casuality', hue='Accident_severity', data=data)
plt.xticks(rotation=90)
plt.show()

# **MULTIVARIATE**

* Heatmap


In [None]:
corr = data[numerical_columns].corr()
corr

In [None]:
plt.figure(figsize=(6,6))
sns.heatmap(corr, cmap='RdBu_r', annot=True, vmax=1, vmin=-1)
plt.show()

# analysis

In [None]:
data.info()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features.
y = data['Accident_severity']
X = data.drop('Accident_severity' , axis = 1)

# stratify=y keeps the (heavily imbalanced) severity proportions the same in
# the train and test parts; without it the rare classes can end up over- or
# under-represented in the 20% test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Both feature frames are modified column-by-column in later cells, so take
# explicit, independent copies.
X_train, X_test = X_train.copy(), X_test.copy()


In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
X_train.describe()

In [None]:
X_test.describe()

In [None]:
X_train.info()

# **MISSING VALUE HANDLING**

In [None]:
# 80% of the training rows, derived from the frame itself so the figure stays
# correct if the dataset or the split ratio changes.
p = len(X_train) * 0.8
p

In [None]:
X_train.isnull().sum()


In [None]:
# 80% of the test rows, derived from the frame itself so the figure stays
# correct if the dataset or the split ratio changes.
q = len(X_test) * 0.8
q

In [None]:
X_test.isnull().sum()

In [None]:
import missingno as msno

msno.bar(X_train)


In [None]:
msno.bar(X_test)

**CATEGORICAL VALUES**

In [None]:
cat_cols = X_train.select_dtypes(include='category').columns
cat_cols

**NOTE**

*  Because these columns have the category data type and we want to fill missing values with 'Unknown', we first have to add 'Unknown' as a category.

*  A categorical column can only be filled with values that are already among its categories.

In [None]:
# Step 1: make 'Unknown' a legal category wherever it is not one already
def _ensure_unknown(col):
  """Return `col` with 'Unknown' among its categories (unchanged if already there)."""
  if 'Unknown' in col.cat.categories:
    return col
  return col.cat.add_categories('Unknown')

X_train[cat_cols] = X_train[cat_cols].apply(_ensure_unknown)

# Step 2: missing values become the 'Unknown' category
X_train[cat_cols] = X_train[cat_cols].fillna('Unknown')


In [None]:
X_train.info()

**NOTE**

If a column contains several spellings of a missing value (unknown, N/A, na), replace them all with one common label.

In [None]:
# Collapse the alternative spellings 'na', 'N/A' and 'unknown' into the single
# label 'Unknown' across the whole training frame, then display the frame.
X_train.replace(['na','N/A','unknown'], 'Unknown', inplace=True)
X_train

In [None]:
# Same treatment as the training frame: register 'Unknown' as a category
# where needed, then use it for every missing value.
for name in cat_cols:
  column = X_test[name]
  if 'Unknown' not in column.cat.categories:
    column = column.cat.add_categories('Unknown')
  X_test[name] = column.fillna('Unknown')

In [None]:
# Collapse the alternative spellings 'na', 'N/A' and 'unknown' into the single
# label 'Unknown' across the whole test frame, then display the frame.
X_test.replace(['na','N/A','unknown'], 'Unknown', inplace=True)
X_test

In [None]:
for col in cat_cols:
  print(X_test[col].value_counts())
  print()

In [None]:
X_test.info()

NOTE

On inspection, the column Casualty_severity contains only numeric labels and missing values, so the entire column can be converted to a numeric type.



In [None]:
X_train['Casualty_severity'].value_counts()

**NOTE**

Use `to_numeric()` if the column has missing/unknown values that still have to be imputed; use `astype()` for plain conversions.

We cannot infer a real value for the unknown entries, so they are labelled as 0.

In [None]:
# Relabel 'Unknown' as '0' so the column can be cast to an integer dtype later.
# The result is assigned back to the frame: calling replace(..., inplace=True)
# on a column selection acts on a temporary object, which triggers a
# chained-assignment warning on recent pandas and leaves the frame unchanged
# when Copy-on-Write is enabled.
X_train['Casualty_severity'] = X_train['Casualty_severity'].replace('Unknown' , '0')
X_test['Casualty_severity'] = X_test['Casualty_severity'].replace('Unknown' , '0')


**NOTE**

The warning appears because the 'Unknown' category was replaced by the value 0. If you only want to rename the category itself, use the category-based method described above.

Here the column is converted to int afterwards, so the categories no longer matter.

In [None]:
X_train['Casualty_severity'].value_counts()

In [None]:
# Casualty_severity now holds only digit labels, so store it as int64 in both frames.
for frame in (X_train, X_test):
  frame['Casualty_severity'] = frame['Casualty_severity'].astype('int64')

In [None]:
X_train.info()

In [None]:
X_test.info()

**NUMERICAL VALUES**

In [None]:
# Names of the integer-typed columns of the training frame; the outlier
# handling below iterates over exactly these columns.
num_cols = X_train.select_dtypes(include= int ).columns
num_cols


In [None]:
for col in num_cols:
  print(f"{col} - ",X_train[col].isnull().sum())

# **FINDING AND HANDLING OUTLIERS**

As this dataset has no absurd values, we just flag the outliers rather than removing them.

In [None]:
num_cols

In [None]:
for col in num_cols:
  sns.boxplot(data=X_train, y=col)
  plt.show()

In [None]:
for col in num_cols:
  print(X_train[col].value_counts())
  print()


In [None]:
X_train.describe()

In [None]:
X_test.describe()


In [None]:
# First and third quartile of every numeric training column, and their
# difference (the inter-quartile range).
quartiles = X_train[num_cols].quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
IQR = Q3 - Q1
print(IQR)

In [None]:
# Tukey fences: anything further than 1.5 * IQR outside the quartiles counts
# as an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean frame (one column per numeric feature) marking the training outliers.
train_numeric = X_train[num_cols]
outliers = train_numeric.lt(lower_bound) | train_numeric.gt(upper_bound)

print(outliers)

In [None]:
# Same mask for the test frame, using the fences learned from the training data.
test_numeric = X_test[num_cols]
outliers_ = test_numeric.lt(lower_bound) | test_numeric.gt(upper_bound)
print(outliers_)

In [None]:
print(lower_bound)
print(upper_bound)

In [None]:
lower_bound['Number_of_vehicles_involved']

In [None]:
X_train

In [None]:
# Add a 0/1 indicator column per numeric feature instead of dropping outliers.
for col in num_cols:
  too_low = X_train[col] < lower_bound[col]
  too_high = X_train[col] > upper_bound[col]
  X_train[f'{col}_flag_outliers'] = (too_low | too_high).astype(int)


In [None]:
X_train

In [None]:
# Same 0/1 indicators for the test frame, using the training-set fences.
for col in num_cols:
  values = X_test[col]
  is_outlier = (values < lower_bound[col]) | (values > upper_bound[col])
  X_test[f'{col}_flag_outliers'] = np.where(is_outlier, 1, 0)


In [None]:
X_test

# **CATEGORICAL VARIABLE ENCODING**


In [None]:
y_train.value_counts()

ENCODING THE TARGET VARIABLE

In [None]:
y_test.isnull().sum()

In [None]:
# Ordinal codes for the target: higher value = more severe accident.
Accident_severity_mapping = {'Slight Injury' : 0 , 'Serious Injury' : 1 , 'Fatal injury' : 2}

# Map both target vectors and cast them to plain integers. The cast keeps
# y_train and y_test on the same dtype, and it raises immediately if a label
# is missing from the mapping instead of letting it become NaN.
y_train = y_train.map(Accident_severity_mapping).astype(int)
y_test = y_test.map(Accident_severity_mapping).astype(int)

In [None]:
y_train.info()

In [None]:
y_test.info()

In [None]:
y_train.value_counts()

In [None]:
cat_cols

In [None]:
for col in cat_cols:
  print(X_train[col].value_counts())
  print()


In [None]:
# Ordinal codes for the driver's age band, youngest first; 'Unknown' gets the last code.
Age_band_of_driver_mappling = dict(zip(
    ['Under 18', '18-30', '31-50', 'Over 51', 'Unknown'],
    range(1, 6),
))

for frame in (X_train, X_test):
  frame['Age_band_of_driver'] = frame['Age_band_of_driver'].map(Age_band_of_driver_mappling)

In [None]:
X_train['Age_band_of_driver'].value_counts()

In [None]:
# Ordinal codes for the vehicle's years in service, newest first; 'Unknown' gets the last code.
service_year_labels = ['Below 1yr', '1-2yr', '2-5yrs', '5-10yrs', 'Above 10yr', 'Unknown']
Service_year_of_vehicle_mapping = {
    label: rank for rank, label in enumerate(service_year_labels, start=1)
}

for frame in (X_train, X_test):
  frame['Service_year_of_vehicle'] = frame['Service_year_of_vehicle'].map(Service_year_of_vehicle_mapping)

In [None]:
X_train['Service_year_of_vehicle'].value_counts()

In [None]:
# Ordinal codes for the casualty's age band.
# 'Unknown' is assumed to be the lowest level. The raw label '5' is kept as
# code 5; it may be a data-entry mistake - make sure this is correct.
Age_band_of_casualty_mapping = {
    'Unknown': 0,
    'Under 18': 1,
    '18-30': 2,
    '31-50': 3,
    'Over 51': 4,
    '5': 5,
}

for frame in (X_train, X_test):
  frame['Age_band_of_casualty'] = frame['Age_band_of_casualty'].map(Age_band_of_casualty_mapping)

In [None]:
X_train['Age_band_of_casualty'].value_counts()

In [None]:
# Merge the variant of the label that carries two leading spaces into the
# clean spelling. The result is assigned back to the frame: an in-place
# replace on a column selection acts on a temporary object and does not
# update the frame when pandas Copy-on-Write is enabled.
X_train['Area_accident_occured'] = X_train['Area_accident_occured'].replace('  Recreational areas' ,  'Recreational areas')
X_test['Area_accident_occured'] = X_test['Area_accident_occured'].replace('  Recreational areas' ,  'Recreational areas')

In [None]:
# Repair the duplicated label 'NormalNormal'. The result is assigned back to
# the frame: an in-place replace on a column selection acts on a temporary
# object and does not update the frame when pandas Copy-on-Write is enabled.
X_train['Fitness_of_casuality'] = X_train['Fitness_of_casuality'].replace('NormalNormal' , 'Normal')
X_test['Fitness_of_casuality'] = X_test['Fitness_of_casuality'].replace('NormalNormal' , 'Normal')

In [None]:
# These three columns were ordinally encoded above and must not be target-encoded.
exclude_cols = ['Age_band_of_driver', 'Service_year_of_vehicle' , 'Age_band_of_casualty']

# Remaining category-typed columns, in sorted order
category_typed = X_train.select_dtypes(include=['category']).columns
categorical_cols = sorted(set(category_typed) - set(exclude_cols))



In [None]:
categorical_cols

In [None]:
for col in categorical_cols:
  print(X_train[col].value_counts())
  print()

In [None]:
y_train = y_train.astype(int)

In [None]:
# Target-encode each remaining categorical column separately: the encoder is
# fitted on the training column and target only, then applied to the test column.
for col in categorical_cols:
  encoder = ce.TargetEncoder(cols= col , handle_unknown='value', handle_missing='value', verbose=True )
  X_train[col] = encoder.fit_transform(X_train[col], y_train)
  X_test[col] = encoder.transform(X_test[col])

In [None]:
X_train

In [None]:
X_train.info()

In [None]:
# The ordinally encoded columns become plain integer columns in both frames.
exclude_cols = ['Age_band_of_driver', 'Service_year_of_vehicle' , 'Age_band_of_casualty']

ordinal_dtypes = dict.fromkeys(exclude_cols, int)
X_train = X_train.astype(ordinal_dtypes)
X_test = X_test.astype(ordinal_dtypes)

In [None]:
X_train.info()

In [None]:
X_test.info()

RESAMPLING

It’s crucial to only apply SMOTE on the training set. The test set should remain untouched to accurately evaluate your model's performance on data that reflects the original class distribution.

In [None]:
# Oversample the minority classes of the training data only.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

In [None]:
y_train.value_counts()

In [None]:
y_train_resampled.value_counts()

# **FEATURE SCALING**

In [None]:
X_train_resampled.describe()

In [None]:
# Learn mean and standard deviation from the resampled training data only ...
scaler = StandardScaler().fit(X_train_resampled)

# ... and apply that same scaling to both the training and the test features.
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Wrap the scaled values in DataFrames again, restoring the column names of
# the frames they were computed from.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train_resampled.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [None]:
X_train_scaled.describe()

In [None]:
# One-vs-rest logistic regression: one binary classifier per severity class.
# The `multi_class='ovr'` argument of LogisticRegression is deprecated in
# recent scikit-learn releases; wrapping the estimator in OneVsRestClassifier
# (already imported at the top of the notebook) is the supported equivalent.
logreg_ovr = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=1000))

# Fit the model on the resampled, scaled training data
logreg_ovr.fit(X_train_scaled, y_train_resampled)

# Make predictions on the (untouched) test data
y_pred = logreg_ovr.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the results
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

In [None]:
# Decision tree baseline. random_state is fixed, as for the other models in
# this notebook, so the tree (and therefore the scores) is the same on every run.
dt_model = DecisionTreeClassifier(random_state=42)
# Fit the model on the resampled, scaled training data
dt_model.fit(X_train_scaled, y_train_resampled)

# Make predictions on the test data
y_pred = dt_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the results
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

**NOTE**

 By default, decision trees use the Gini impurity as the criterion

In [None]:

# Random forest with default hyper-parameters, trained on the resampled data
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train_resampled)

# Score it on the held-out test set
y_pred = rf_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


In [None]:
# Same forest again, this time with class_weight='balanced'
rf_classifier = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_classifier = rf_classifier.fit(X_train_scaled, y_train_resampled)

# Score it on the held-out test set; zero_division=0 reports 0 for a class
# that is never predicted
y_pred = rf_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


In [None]:
y_test.value_counts()

In [None]:
# Support-vector classifier; the kernel can be swapped for 'linear', 'poly'
# or others depending on your need
svc_model = SVC(kernel='rbf').fit(X_train_scaled, y_train_resampled)

# Predict and evaluate on the test data
y_pred = svc_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

In [None]:

# AdaBoost; n_estimators is the number of weak classifiers in the ensemble
adaboost_clf = AdaBoostClassifier(n_estimators=100, random_state=42)
adaboost_clf = adaboost_clf.fit(X_train_scaled, y_train_resampled)

# Predict and evaluate on the test data
y_pred = adaboost_clf.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)

print("Test Accuracy:", test_accuracy)

print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_pred))


In [None]:

# k-NN classifier. `weights` decides how the k neighbours vote: 'distance'
# gives nearer neighbours more weight ('uniform' treats them equally).
# `metric` is the distance measure, e.g. 'euclidean' or 'manhattan'.
knn_clf = KNeighborsClassifier(n_neighbors=10, weights='distance', metric='manhattan')
knn_clf = knn_clf.fit(X_train_scaled, y_train_resampled)

# Predict and evaluate on the test data
y_pred_test = knn_clf.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Test Accuracy:", test_accuracy)

print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_pred_test))

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with a small, fixed configuration
xgb_clf = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
xgb_clf = xgb_clf.fit(X_train_scaled, y_train_resampled)

# Predict and evaluate on the test data
y_pred_test = xgb_clf.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Test Accuracy:", test_accuracy)

print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_pred_test))



# **TUNING HYPER PARAMETERS**

In [None]:
'''from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Define the Random Forest model
rf_clf = RandomForestClassifier(random_state=42)

# Define the hyperparameters grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [None, 10, 20, 30],  # Maximum tree depth
    'min_samples_split': [2, 5, 10],  # Minimum samples to split a node
    'min_samples_leaf': [1, 2, 4],    # Minimum samples at leaf nodes
    'max_features': ['sqrt', 'log2'], # Number of features to consider for best split
}

# Set up the GridSearchCV
grid_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    scoring='accuracy',  # Use accuracy as the evaluation metric
    cv=5,                # 5-fold cross-validation
    verbose=2,           # Higher number for more detailed output
    n_jobs=-1            # Use all processors
)

# Perform the grid search
grid_search.fit(X_train_scaled, y_train_resampled)

# Get the best parameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on the test set
best_rf_clf = grid_search.best_estimator_           -> best estimator -> stores the best model with high performance from the searching we done .
y_pred_test = best_rf_clf.predict(X_test_scaled)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_pred_test))   '''


In [None]:
'''from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Define the model
rf_clf = RandomForestClassifier(random_state=42)

# Define the parameter grid with ranges
param_dist = {
    'n_estimators': [int(x) for x in np.linspace(start=100, stop=500, num=5)],  # Number of trees
    'max_depth': [None] + [int(x) for x in np.linspace(10, 50, num=5)],         # Tree depth
    'max_features': ['sqrt', 'log2', None],                                    # Max features to consider
    'bootstrap': [True, False]                                                 # Use bootstrap sampling
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=param_dist,
    n_iter= 10 ,                 # Number of parameter settings to sample
    scoring='accuracy',        # Metric to optimize
    cv=3,                      # 3-fold cross-validation
    verbose=2,                 # Higher value for detailed output
    random_state=42,           # Ensure reproducibility
    n_jobs=-1                  # Use all processors
)

# Perform the random search
random_search.fit(X_train_scaled, y_train_resampled)

# Access the best model and parameters
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation Score:", random_search.best_score_)

# Evaluate on test data
best_rf_clf = random_search.best_estimator_
y_pred_test = best_rf_clf.predict(X_test_scaled)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_pred_test))

'''


In [None]:
# Define the objective function
'''def objective(trial):
    # Suggest hyperparameters for optimization
    n_estimators = trial.suggest_int('n_estimators', 100, 500)  # Number of trees
    max_depth = trial.suggest_int('max_depth', 10, 50)          # Max depth of trees
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # Create the model with suggested hyperparameters
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=42
    )

    # Cross-validation to evaluate performance
    score = cross_val_score(model, X_train_scaled, y_train_resampled, cv=2 , scoring='accuracy').mean()
    return score



# Create a study and optimize
study = optuna.create_study(direction='maximize')  # Maximize accuracy
study.optimize(objective, n_trials=10, timeout= 180)  # 10 trials or 3 minutes [if trial is started within time limit -> it finishes the trial even if limit is reached ]

# Best hyperparameters and score
print("Best Hyperparameters:", study.best_params)
print("Best Cross-Validation Score:", study.best_value)

# Train and evaluate the best model
best_params = study.best_params
best_model = RandomForestClassifier(**best_params, random_state=42)   # best_estimator_ is not available as gridsearchCV , ** to unpack the dictionary and set the corresponding values
best_model.fit(X_train_scaled, y_train_resampled)

y_pred_test = best_model.predict(X_test_scaled)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_pred_test))


# optuna automatically gives detailed info as below

'''


In [None]:
'''# Define the objective function
def objective(trial):
    # Suggest hyperparameters
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    # Create and evaluate the model
    model = XGBClassifier(**params)  # Pass params using unpacking
    score = cross_val_score(model, X_train_scaled, y_train_resampled, cv=3, scoring='accuracy').mean()
    return score

# Create an Optuna study
study = optuna.create_study(direction='maximize')  # Maximize accuracy
study.optimize(objective, n_trials=20, timeout=300)  # Run for 50 trials or 5 minutes

# Display best hyperparameters and score
print("Best Hyperparameters:", study.best_params)
print("Best Cross-Validation Score:", study.best_value)

# Train the model with the best hyperparameters
best_model = XGBClassifier(**study.best_params, random_state=42)
best_model.fit(X_train_scaled, y_train_resampled)

# Test the model
y_pred_test = best_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred_test)

# Print results
print("\nTest Accuracy:", test_accuracy)
print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_pred_test))
'''

PROVIDING CLASS WEIGHTS WITHOUT SMOTE

In [None]:
'''def objective(trial):
    # Suggest hyperparameters for the Random Forest model
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # Tune class weights
    class_weight = trial.suggest_categorical(
        'class_weight',
        ['balanced', 'balanced_subsample']  # Example custom weights
    )

    # Create the Random Forest model
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        class_weight=class_weight,
        random_state=42
    )

    # Evaluate using cross-validation
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()
    return score

# Create a study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10, timeout=300)  # Run 10 trials or 5 minutes

# Print the best hyperparameters
print("Best Hyperparameters:", study.best_params)

# Train the best model on the entire training set
best_params = study.best_params
best_model = RandomForestClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_test = best_model.predict(X_test)

# Evaluate the model
test_accuracy = accuracy_score(y_test, y_pred_test)
print("\nTest Accuracy:", test_accuracy)
print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_pred_test))

'''

In [None]:
y_test.value_counts()

For XGBoost, class imbalance can be handled with `scale_pos_weight` or with per-sample weights.

`scale_pos_weight` only applies to binary classification, so sample weights are used here.

Sample weights are usually not tuned; only `scale_pos_weight` is tuned, to give more importance to the positive class.

sample weight of a row in class i = total number of samples / number of samples in class i



In [None]:

# Inverse-frequency weight per class: total samples / samples in that class
class_counts = np.bincount(y_train)
total_samples = len(y_train)
class_weights = {label: total_samples / count for label, count in enumerate(class_counts)}

# One weight per training row, looked up from the row's class label
sample_weights = y_train.map(class_weights).to_numpy()


# Define the objective function
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated accuracy of an XGBoost model.

    Args:
        trial: Optuna trial used to sample `n_estimators`, `learning_rate`
            and `max_depth`.

    Returns:
        The mean accuracy over the 3 cross-validation folds on the global
        `X_train` / `y_train`.
    """
    # Suggest hyperparameters
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
    }

    # Same random_state as the final model trained from the best parameters,
    # so the cross-validated configuration matches the one evaluated later.
    model = XGBClassifier(**params, random_state=42)

    # The per-row class weights are handed to cross_val_score through
    # `params`, so class imbalance is handled inside the CV fits too.
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy' ,params={'sample_weight': sample_weights}).mean()
    return score

# Create an Optuna study that maximizes accuracy. The sampler is seeded so
# the sequence of suggested hyper-parameters is repeatable between runs.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=20, timeout=300)  # stop after 20 trials or 5 minutes, whichever comes first

# Display best hyperparameters and score
print("Best Hyperparameters:", study.best_params)
print("Best Cross-Validation Score:", study.best_value)

# Train the model with the best hyperparameters, using the same sample weights
best_model_XGB = XGBClassifier(**study.best_params, random_state=42)
best_model_XGB.fit(X_train, y_train ,sample_weight=sample_weights)

# Test the model
y_pred_test = best_model_XGB.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)

# Print results
print("\nTest Accuracy:", test_accuracy)
print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_pred_test))


# **EXPLAINABLE AI**

[Currently working on this; I'll update it soon!]

In [None]:
# Importance score of every feature, as reported by the tuned XGBoost model
feature_importance = best_model_XGB.feature_importances_

# Feature names come straight from the training frame's columns
feature_names = X_train.columns

# Feature positions ordered from most to least important
indices = np.argsort(feature_importance)[::-1]
ranked_names = [feature_names[i] for i in indices]
positions = list(range(len(feature_importance)))

# Bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Feature Importance")
ax.bar(positions, feature_importance[indices], align="center", color="skyblue")
ax.set_xticks(positions)
ax.set_xticklabels(ranked_names, rotation=90)
ax.set_xlabel("Feature Names")
ax.set_ylabel("Importance Score")
fig.tight_layout()  # leave room for the rotated feature names
plt.show()
