In [None]:
# import libraries

# 1. to handle the data
import pandas as pd
import numpy as np

# to visualize the dataset
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

# To preprocess the data
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
# import iterative imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# machine learning
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
#for classification tasks
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestRegressor
from xgboost import XGBClassifier
from sklearn.preprocessing import KBinsDiscretizer
#metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error, r2_score

# ignore warnings   
import warnings
warnings.filterwarnings('ignore')

In [None]:
# load the data from a CSV file stored locally, next to this notebook
df = pd.read_csv('./heart_disease_uci.csv')

# display the first 5 rows of the dataframe
df.head()

- **Age Columns**

In [None]:
# explore the data type of each column
df.info()

In [None]:
# data shape: (number of rows, number of columns)
df.shape

In [None]:
# id column: smallest and largest value
id_col = df['id']
id_col.min(), id_col.max()

In [None]:
# age column: youngest and oldest patient in the data
age_col = df['age']
age_col.min(), age_col.max()

In [None]:
# let's summarize the age column with descriptive statistics
df['age'].describe()

In [None]:
# draw a histogram (with a KDE curve) to see the distribution of the age column
sns.histplot(df['age'], kde=True)

In [None]:
# Histogram of age with the mean (red), median (green) and mode (blue)
# marked as vertical lines.
age_mean = df['age'].mean()
age_median = df['age'].median()
age_mode = df['age'].mode()[0]  # first value when several modes tie

sns.histplot(df['age'], kde=True)
for stat_value, line_color in [(age_mean, 'red'), (age_median, 'green'), (age_mode, 'blue')]:
    plt.axvline(stat_value, color=line_color)

# print the value of mean, median and mode of age column
print('Mean:', age_mean)
print('Median:', age_median)
print('Mode:', age_mode)

- **Let's explore the gender based distribution of the dataset for age column.**

In [None]:
# plot the histogram of the age column using plotly, colored by sex

fig = px.histogram(data_frame=df, x='age', color='sex')
fig.show()

In [None]:
# count how many rows there are for each value of the sex column
df['sex'].value_counts()

In [None]:
# calculate the percentages of male and female value counts in the data
# The counts are read from the dataframe instead of being hard-coded, so the
# figures stay correct if the CSV (or any upstream filtering) changes.
sex_counts = df['sex'].value_counts()
male_count = int(sex_counts.get('Male', 0))
# NOTE(review): assumes the second label is spelled 'Female' - confirm against
# the value_counts() output of the previous cell.
female_count = int(sex_counts.get('Female', 0))
total_count = male_count + female_count

# calculate percentages
male_percentage = (male_count / total_count) * 100
female_percentage = (female_count / total_count) * 100

# display the results
print(f"Male percentage in the data: {male_percentage:.2f}%")
print(f"Female Percentage in the data: {female_percentage:.2f}%")

# difference: how many more males there are, relative to the number of females
difference_percentage = ((male_count - female_count) / female_count) * 100
print(f"Males are {difference_percentage:.2f}% more than females in the data.")

In [None]:
# show the share of each sex as a plotly pie chart, colored by sex
fig = px.pie(df, names='sex', color='sex')
fig.show()

In [None]:
# count how often each age occurs within each sex group
df.groupby('sex')['age'].value_counts()

- **Dataset Column**

In [None]:
# Let's deal with the dataset column
# find the unique values in the dataset column
df['dataset'].unique()

In [None]:
# count the rows for each unique value of the dataset column
df['dataset'].value_counts()

In [None]:
# plot the dataset column split by sex
# seaborn alternative, kept for reference:
# sns.countplot(data=df, x='dataset', hue = 'sex')

# interactive version with plotly
fig = px.bar(df, x='dataset', color='sex')
fig.show()

# print the value counts of the dataset column within each sex group
print(df.groupby('sex')['dataset'].value_counts())

In [None]:
# histogram of age using plotly, colored by the dataset column
fig = px.histogram(data_frame=df, x='age', color='dataset')
fig.show()

# mean, median and mode of age within each dataset group;
# the grouping is built once and reused for all three statistics
age_by_dataset = df.groupby('dataset')['age']
separator = "-------------------------------------"

print(f"Mean of Data Set: {age_by_dataset.mean()}")
print(separator)
print(f"Median of Data Set: {age_by_dataset.median()}")
print(separator)
print(f"Mode of Data Set: {age_by_dataset.agg(pd.Series.mode)}")
print(separator)


- **Let's explore cp (Chest Pain) column:** 

 **Types of Chest pain :**

    1. Asymptomatic: No chest pain or discomfort.
    2. Non-Anginal: Chest pain not typical of heart-related issues; requires further investigation.
    3. Atypical Angina: Chest pain with characteristics different from typical heart-related chest pain.
    4. Typical Angina: Classic chest pain indicating potential insufficient blood supply to the heart.

In [None]:
# count the rows for each chest pain type (cp column)
df['cp'].value_counts()

In [None]:
# cross-tabulate chest pain type (rows) against the num target (columns)
cp_num = pd.crosstab(df.cp,df.num)
cp_num

In [None]:
# Bar chart of the cp x num cross-tabulation: one group of bars per chest
# pain type, one bar per value of num.
stage_colors = ['salmon', 'blue', 'pink', 'lightblue', 'green']
stage_labels = ['No Disease', 'Stage 1', 'Stage 2', 'Stage 3', 'Stage 4']

ax = pd.crosstab(df.cp, df.num).plot(kind="bar", figsize=(10, 6), color=stage_colors)
ax.set_title('Heart disease frequency for Chest Pain type')
ax.set_xlabel('Chest Pain type')
ax.set_ylabel('Amount')
ax.legend(stage_labels)

In [None]:
# draw the histogram of the cp column using plotly, colored by sex
fig = px.histogram(data_frame=df, x='cp', color='sex')
fig.show()

In [None]:
# count plot of the cp column, split by the dataset column
sns.countplot(df, x='cp', hue='dataset')

In [None]:
# draw the histogram of the age column using plotly, colored by cp
fig = px.histogram(data_frame=df, x='age', color='cp')
fig.show()

In [None]:
# draw the histogram of the cp column using plotly, colored by num
fig = px.histogram(data_frame=df, x='cp', color='num')
fig.show()

- **Let's explore the trestbps (resting blood pressure) column:**

The normal resting blood pressure is 120/80 mm Hg.

High blood pressure increases the risk of heart disease and stroke and is often asymptomatic, while low blood pressure can lead to dizziness and fainting.

In [None]:
# descriptive statistics of the trestbps column
df['trestbps'].describe()

In [None]:
# Resting blood pressure vs disease: one box for all patients, one for
# num == 0 and one for num != 0.
box_specs = [
    (df['trestbps'], 'BP at Rest for all', 'green'),
    (df[df['num'] == 0]['trestbps'], 'No Disease', 'blue'),
    (df[df['num'] != 0]['trestbps'], 'Heart Disease', 'red'),
]

fig = go.Figure()
for bp_values, trace_name, trace_color in box_specs:
    fig.add_trace(go.Box(y=bp_values.values, name=trace_name, marker_color=trace_color, boxmean=True))

fig.update_layout(title='BP Distribution (at rest)', yaxis_title='Blood Pressure (mm/Hg)', title_x=0.5)
# draw a black frame around the plotting area
axis_frame = dict(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_xaxes(**axis_frame)
fig.update_yaxes(**axis_frame)
fig.show()

In [None]:
# create a histogram (with a KDE curve) of the trestbps column
sns.histplot(df['trestbps'], kde=True)

- **Lets Explore the chol Column**

What is chol: cholesterol, a fatty substance that is essential for body function, but elevated levels can contribute to heart disease.

In [None]:
# first, descriptive statistics of the chol column
df['chol'].describe()

In [None]:
# cross-tabulate chol against num, then summarize the resulting count table
# (describe() here is applied to the per-chol-value counts, one column per num value)
cross = pd.crosstab(df['chol'], df['num']).describe()
cross

In [None]:
# box plot of cholesterol, with num passed as the hue grouping
sns.boxplot(y=df['chol'], hue=df['num'])

In [None]:
# Cholesterol distribution as violins: all patients, patients without disease
# (num == 0) and patients with disease (num != 0).
fig = go.Figure()
fig.add_trace(go.Violin(y=df['chol'].values , name='All Patient', marker_color = 'green'))
fig.add_trace(go.Violin(y=df[df['num']== 0]['chol'].values, name ='No Disease', marker_color = 'blue'))
# 'Heart Disease' covers every non-zero stage, matching the resting blood
# pressure box plot; filtering on num == 4 showed only the most severe stage
# under this label.
fig.add_trace(go.Violin(y=df[df['num'] != 0]['chol'].values, name ='Heart Disease', marker_color = 'red'))
fig.update_layout(title = 'Cholesterol Level Distribution', yaxis_title = 'Cholesterol Level', title_x = 0.5 )
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.show()

- **Let's explore thal (thalassemia)**

    Normal: Within expected or healthy parameters.

    Reversible Defect: An abnormality that can potentially be corrected or improved.

    Fixed Defect: An abnormality that is unlikely to change or be corrected.

In [None]:
# count the rows for each value of the thal column
df['thal'].value_counts()

In [None]:
# sex counts within each thal category
df.groupby('thal')['sex'].value_counts()

In [None]:
# dataset counts within each thal category
df.groupby('thal')['dataset'].value_counts()

In [None]:
# count plot of the thal column, split by sex
sns.countplot(df, x='thal', hue='sex')

In [None]:
# draw the histogram of the age column using plotly, colored by thal
fig = px.histogram(data_frame=df, x='age', color='thal')
fig.show()

In [None]:
# draw the histogram of the thal column using plotly, colored by dataset
fig = px.histogram(data_frame=df, x='thal', color='dataset')
fig.show()

In [None]:
# Chest pain type counts within each thal category.
# The table is printed explicitly: only the last expression of a cell is
# displayed automatically, so a bare groupby above the plot would be discarded.
print(df.groupby('thal')['cp'].value_counts())
# Plot to Visualize
sns.countplot(df, x='thal', hue='cp')

In [None]:
# Distribution of the num target within each thal category.
# The table is printed explicitly: only the last expression of a cell is
# displayed automatically, so a bare groupby above the plot would be discarded.
print(df.groupby('thal')['num'].value_counts())
# Plot to Visualize
sns.countplot(df, x='thal', hue='num' , palette='viridis')

- **Lets Deal With Num , The Target Variable**
   * `0 = no heart disease`
   * `1 = mild heart disease`
   * `2 = moderate heart disease `
   * `3 = severe heart disease`
   * `4 = critical heart disease `


In [None]:
# count the rows for each value of the num target
df['num'].value_counts()

In [None]:
# Sex counts within each value of num.
# Printed explicitly so the table is not discarded in favour of the plot.
print(df.groupby('num')['sex'].value_counts())
# Plot to Visualize
sns.countplot(df, x='num', hue='sex')

In [None]:
# Age counts within each value of num.
# Printed explicitly so the table is not discarded in favour of the plot.
print(df.groupby('num')['age'].value_counts())
# Plot to Visualize
sns.histplot(df, x='age', hue='num')

In [None]:
# the same age histogram colored by num, drawn with plotly
px.histogram(data_frame=df, x='age', color='num')

In [None]:
# Number of missing values per column, restricted to columns that have any.
null_counts = df.isnull().sum()
null_counts = null_counts[null_counts > 0]
# Printed explicitly: as a bare expression above the last line, the sorted
# summary would be computed and then discarded.
print(null_counts.sort_values(ascending=False))

# names of the columns that contain missing values (in dataframe column order)
missing_data_cols = null_counts.index.tolist()
missing_data_cols

In [None]:
# value counts of thal before imputation (missing values are not counted by default)
df['thal'].value_counts()

In [None]:
# Column groups used to choose the imputation strategy:
# columns filled with a classifier
categorical_cols = ['thal', 'ca', 'slope', 'exang', 'restecg','fbs', 'cp', 'sex', 'num']
# categorical columns holding True/False values (encoded to 0/1 for the model and mapped back)
bool_cols = ['fbs', 'exang']
# columns filled with a regressor
numeric_cols = ['oldpeak', 'thalch', 'chol', 'trestbps', 'age']

In [None]:
# Functions that fill the missing values of one column by training a Random
# Forest on the rows where that column is known. They read the notebook-level
# `df`, `missing_data_cols` and `bool_cols`.

def _prepare_imputation_features(df_not_null, df_null, passed_col):
    """Build model-ready feature frames for the known rows and the missing rows.

    Both frames are encoded and filled with the SAME fitted transformers, so a
    category code or fill value means the same thing at training time and at
    prediction time.

    Parameters
    ----------
    df_not_null : pandas.DataFrame
        Rows of `df` where `passed_col` is present (used for training).
    df_null : pandas.DataFrame
        Rows of `df` where `passed_col` is missing (to be predicted).
    passed_col : str
        Column being imputed; it is dropped from both feature frames.

    Returns
    -------
    tuple of pandas.DataFrame
        (features of the known rows, features of the missing rows).
    """
    X_known = df_not_null.drop(passed_col, axis=1)
    X_missing = df_null.drop(passed_col, axis=1)
    n_known = len(X_known)

    # Label-encode text/category columns. The encoder is fitted once on the
    # known and missing rows together; fitting it separately on each subset
    # can assign different integers to the same category.
    for col in X_known.columns:
        if X_known[col].dtype == 'object' or X_known[col].dtype == 'category':
            codes = LabelEncoder().fit_transform(pd.concat([X_known[col], X_missing[col]]))
            X_known[col] = codes[:n_known]
            X_missing[col] = codes[n_known:]

    # Fill gaps that remain in the other incomplete columns. The imputer is
    # fitted on the known rows only and then applied to both subsets.
    # NOTE(review): each column is passed to IterativeImputer on its own, so
    # there are no other features to regress on and this is presumably
    # equivalent to the imputer's initial (mean) fill - confirm whether joint
    # imputation over all feature columns was intended.
    other_missing_cols = [col for col in missing_data_cols if col != passed_col]
    for col in other_missing_cols:
        if X_known[col].isnull().sum() > 0 or X_missing[col].isnull().sum() > 0:
            imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=42), add_indicator=True)
            imputer.fit(X_known[col].values.reshape(-1, 1))
            # column 0 holds the filled values; any further columns are the
            # missing-value indicator requested by add_indicator=True
            X_known[col] = imputer.transform(X_known[col].values.reshape(-1, 1))[:, 0]
            if len(X_missing) > 0:
                X_missing[col] = imputer.transform(X_missing[col].values.reshape(-1, 1))[:, 0]

    return X_known, X_missing


def impute_categorical_missing_data(passed_col):
    """Fill the missing values of a categorical column with a Random Forest classifier.

    The classifier is trained on the rows of the global `df` where
    `passed_col` is present; its accuracy on a 20% hold-out split is printed,
    and it then predicts the column for the rows where it is missing.
    Columns listed in `bool_cols` are encoded to 0/1 for training and mapped
    back to False/True afterwards.

    Parameters
    ----------
    passed_col : str
        Name of the column of `df` to impute.

    Returns
    -------
    pandas.Series
        `passed_col` for every row of `df` (known rows first, then the filled
        rows). The original index is kept, so assigning it back to
        `df[passed_col]` aligns by index.
    """
    # .copy() so the predictions are written to an independent frame rather
    # than to a slice of df
    df_null = df[df[passed_col].isnull()].copy()
    df_not_null = df[df[passed_col].notnull()]

    X, X_to_fill = _prepare_imputation_features(df_not_null, df_null, passed_col)
    y = df_not_null[passed_col]

    if passed_col in bool_cols:
        y = LabelEncoder().fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # seeded so that the imputed values are reproducible between runs
    rf_classifier = RandomForestClassifier(random_state=42)
    rf_classifier.fit(X_train, y_train)

    y_pred = rf_classifier.predict(X_test)
    acc_score = accuracy_score(y_test, y_pred)

    print("The feature '"+ passed_col+ "' has been imputed with", round((acc_score * 100), 2), "accuracy\n")

    if len(df_null) > 0:
        df_null[passed_col] = rf_classifier.predict(X_to_fill)
        if passed_col in bool_cols:
            df_null[passed_col] = df_null[passed_col].map({0: False, 1: True})

    df_combined = pd.concat([df_not_null, df_null])

    return df_combined[passed_col]


def impute_continuous_missing_data(passed_col):
    """Fill the missing values of a numeric column with a Random Forest regressor.

    The regressor is trained on the rows of the global `df` where `passed_col`
    is present; MAE, RMSE and R2 on a 20% hold-out split are printed, and it
    then predicts the column for the rows where it is missing.

    Parameters
    ----------
    passed_col : str
        Name of the column of `df` to impute.

    Returns
    -------
    pandas.Series
        `passed_col` for every row of `df` (known rows first, then the filled
        rows). The original index is kept, so assigning it back to
        `df[passed_col]` aligns by index.
    """
    # .copy() so the predictions are written to an independent frame rather
    # than to a slice of df
    df_null = df[df[passed_col].isnull()].copy()
    df_not_null = df[df[passed_col].notnull()]

    X, X_to_fill = _prepare_imputation_features(df_not_null, df_null, passed_col)
    y = df_not_null[passed_col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # seeded so that the imputed values are reproducible between runs
    rf_regressor = RandomForestRegressor(random_state=42)
    rf_regressor.fit(X_train, y_train)

    y_pred = rf_regressor.predict(X_test)

    print("MAE =", mean_absolute_error(y_test, y_pred), "\n")
    # RMSE is the square root of the MSE; the `squared=False` keyword of
    # mean_squared_error is not available in recent scikit-learn releases
    print("RMSE =", np.sqrt(mean_squared_error(y_test, y_pred)), "\n")
    print("R2 =", r2_score(y_test, y_pred), "\n")

    if len(df_null) > 0:
        df_null[passed_col] = rf_regressor.predict(X_to_fill)

    df_combined = pd.concat([df_not_null, df_null])

    return df_combined[passed_col]

In [None]:
# missing values per column (only columns that have any), most incomplete first
df.isnull().sum()[df.isnull().sum() > 0].sort_values(ascending=False)

In [None]:
# silence warnings
import warnings
warnings.filterwarnings('ignore')

# Impute every column that has missing values: categorical columns go to the
# classifier-based function, numeric columns to the regressor-based one.
# Columns in neither list are left untouched.
for col in missing_data_cols:
    missing_pct = round((df[col].isnull().sum() / len(df)) * 100, 2)
    print("Missing Values", col, ":", str(missing_pct)+"%")
    if col in categorical_cols:
        df[col] = impute_categorical_missing_data(col)
        continue
    if col in numeric_cols:
        df[col] = impute_continuous_missing_data(col)

In [None]:
# check the missing values per column again, after imputation
df.isnull().sum()

- Here we are done with imputing the missing values, using more advanced methods such as Random Forest and the Iterative Imputer, which can be more accurate than filling with the mean, median or mode. We defined functions for imputing missing values: each takes a column name and returns that column with its missing values filled in, and applying them to every incomplete column leaves a dataset with no missing values.
- 
      MEthods : 
      1. Random Forest Classifier 
      2. Random Forest Regressor
      3. Iterative Imputer 

In [None]:
# Box plot of every numeric column, one subplot per column on a 3x2 grid.
colors = ['red', 'green', 'blue', 'orange', 'purple']

plt.figure(figsize=(20, 20))
for i, col in enumerate(numeric_cols):
    box_color = colors[i]
    # subplot positions are 1-based
    plt.subplot(3, 2, i + 1)
    sns.boxplot(x=df[col], color=box_color)
    plt.title(col)
plt.show()

In [None]:
# print the row(s) from df where the trestbps value is 0
# (printed explicitly: a bare expression that is not the last line of the
# cell is evaluated and discarded, so nothing was shown before)
print(df[df['trestbps'] == 0])
# remove these rows from the data; a resting blood pressure of 0 is treated
# here as an invalid reading
df = df[df['trestbps'] != 0]

In [None]:
# save the cleaned data to a CSV file, without writing the index column
df.to_csv('heart_disease_cleaned.csv', index=False)

- While dealing with outliers, my observation is that there is only one true outlier in the dataset (a resting blood pressure of 0), which I removed. The other extreme values carry meaningful information, so we do not remove them and leave them in the dataset.



 The target column is `num`, which is the attribute to be predicted. We will use this column to predict heart disease.
 For modelling it is converted into a binary `target` column (`num > 0`), so the unique values are: [0, 1].

0 = no heart disease
1 = heart disease

The following models will be used to predict heart disease. Both are classifiers that can also be used for multi-class classification.

1. Random Forest
2. XGB Classifier


- Import Libraries

In [None]:
# Import Libraires 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Train test Split 
from sklearn.model_selection import train_test_split
# Models 
from sklearn.naive_bayes import GaussianNB , BernoulliNB , MultinomialNB
from sklearn.tree import DecisionTreeClassifier , DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier ,RandomForestRegressor , AdaBoostRegressor
from xgboost import XGBClassifier , XGBRegressor
from sklearn.linear_model import LinearRegression ,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor , KNeighborsClassifier
from sklearn.ensemble import GradientBoostingRegressor , GradientBoostingClassifier
from sklearn.svm import SVC , SVR
from xgboost import XGBClassifier , XGBRegressor
# Import Naive Bayes
#metrics
from sklearn.metrics import mean_squared_error , mean_absolute_error , r2_score , classification_report , accuracy_score , f1_score , precision_score
#import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV , RandomizedSearchCV
# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder , OneHotEncoder
from sklearn.preprocessing import QuantileTransformer , PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Remove Warning 
import warnings
warnings.filterwarnings('ignore')
# Saving Model 
import pickle

In [None]:
# work on a copy of the cleaned dataframe so that df itself stays untouched
data = df.copy()
data.head()

In [None]:
# Some category labels contain spaces or hyphens, which cause problems later
# on, so they are renamed here.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `data[col]`: that form is chained in-place
# assignment, which acts on a temporary object and is not guaranteed to update
# `data` (it does not under pandas Copy-on-Write).
data['thal'] = data['thal'].replace({'fixed defect': 'fixed_defect', 'reversable defect': 'reversable_defect'})
data['cp'] = data['cp'].replace({'typical angina': 'typical_angina', 'atypical angina': 'atypical_angina'})
data['restecg'] = data['restecg'].replace({'st-t abnormality': 'ST-T_wave_abnormality', 'lv hypertrophy': 'left_ventricular_hypertrophy'})

# Generate a new dataset with only the columns needed for modelling.
data_1 = data[['age','sex','cp','dataset', 'trestbps', 'chol', 'fbs','restecg' , 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal']].copy()
# Binarize the target: 0 for no disease (num == 0), 1 for disease (num > 0).
data_1['target'] = ((data['num'] > 0)*1).copy()
# Encode sex: 1 for 'Male', 0 otherwise.
data_1['sex'] = (data['sex'] == 'Male')*1
# Encode fbs and exang: True/False -> 1/0.
data_1['fbs'] = (data['fbs'])*1
data_1['exang'] = (data['exang'])*1
# Rename the columns to descriptive names (same order as the selection above).
data_1.columns = ['age', 'sex', 'chest_pain_type','country' ,'resting_blood_pressure',
              'cholesterol', 'fasting_blood_sugar','Restecg',
              'max_heart_rate_achieved', 'exercise_induced_angina',
              'st_depression', 'st_slope_type', 'num_major_vessels',
              'thalassemia_type', 'target']
# Show a sample of the prepared data.
data_1.head()

Random Forest is an ensemble learning technique used for both classification and regression tasks. It builds multiple decision trees during training and merges their predictions to improve accuracy and reduce overfitting.

1. High Accuracy
2. Robust to Overfitting
3. Handles Missing Values

Random Forest is a versatile and powerful algorithm, especially effective in scenarios with high-dimensional data and complex relationships. It excels in situations where high accuracy is crucial, and its ability to handle missing values and resist overfitting makes it a popular choice in machine learning applications.


In [None]:
def train_random_forest(data, target):
    """Tune, train and evaluate a Random Forest classifier.

    Steps: label-encode the object/category feature columns, make a 70/30
    train/test split (random_state=0), min-max scale the features with
    statistics from the training split, grid-search the hyperparameters with
    5-fold cross-validation scored on accuracy, and report the accuracy of the
    best model on the test split. `data` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the target column.
    target : str
        Name of the target column in `data`.

    Returns
    -------
    tuple
        (best_rf_model, best_params, accuracy) - the fitted best estimator,
        the dict of its tuned hyperparameters and its test-set accuracy.
    """
    # split the data into X and y (drop returns a new frame, so encoding X
    # below does not touch the caller's dataframe)
    X = data.drop(target, axis=1)
    y = data[target]

    # a fresh LabelEncoder for each categorical column
    for col in X.select_dtypes(include=['object', 'category']).columns:
        X[col] = LabelEncoder().fit_transform(X[col])

    # split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    # scale using the training split only, then apply the same scaling to the test split
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Define the Random Forest model
    rf_model = RandomForestClassifier(random_state=0,class_weight='balanced')

    # Define hyperparameters for tuning
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Perform GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # With the default refit=True, GridSearchCV has already refitted the best
    # estimator on the whole training split, so no second fit() is needed.
    best_rf_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Print the best hyperparameters
    print('Best Hyperparameters:')
    print(best_params)

    # Evaluate the model on the test set
    y_pred = best_rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f'Accuracy on Test Set: {accuracy:.2f}')

    return best_rf_model, best_params, accuracy


In [None]:
# tune and evaluate the Random Forest on the prepared data; the cell displays
# the returned (model, best parameters, test accuracy) tuple
train_random_forest(data_1, 'target')

In [None]:
def train_xgb_classifier(data, target):
    """Tune, train and evaluate an XGBoost classifier.

    Steps: label-encode the object/category feature columns, make a 70/30
    train/test split (random_state=0), min-max scale the features with
    statistics from the training split, grid-search the hyperparameters with
    5-fold cross-validation scored on accuracy, and print the accuracy of the
    best model on the test split. `data` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the target column.
    target : str
        Name of the target column in `data`.

    Returns
    -------
    tuple
        (best_xgb_model, best_params) - the fitted best estimator and the dict
        of its tuned hyperparameters.
    """
    # split the data into X and y (drop returns a new frame, so encoding X
    # below does not touch the caller's dataframe)
    X = data.drop(target, axis=1)
    y = data[target]

    # a fresh LabelEncoder for each categorical column, as in train_random_forest
    for col in X.select_dtypes(include=['object', 'category']).columns:
        X[col] = LabelEncoder().fit_transform(X[col])

    # split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    # scale using the training split only, then apply the same scaling to the test split
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Define the XGBClassifier model
    xgb_model = XGBClassifier(random_state=0)

    # Define hyperparameters for tuning
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'gamma': [0, 1, 2]
    }

    # Perform GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # With the default refit=True, GridSearchCV has already refitted the best
    # estimator on the whole training split, so no second fit() is needed.
    best_xgb_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Print the best hyperparameters
    print('Best Hyperparameters:')
    print(best_params)

    # Evaluate the model on the test set
    y_pred = best_xgb_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy on Test Set: {accuracy:.2f}')

    return best_xgb_model, best_params

In [None]:
# tune and evaluate the XGBoost classifier on the prepared data; the cell
# displays the returned (model, best parameters) tuple
train_xgb_classifier(data_1,'target')