### Importing the necessary Libraries

In [None]:
#Data Structures
import pandas as pd
import numpy as np
import re
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### For installing missingno library, type this command in terminal
#pip install missingno

import missingno as msno
from scipy import stats

#Sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.metrics import accuracy_score, classification_report

#Plotting
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

#Others
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

### Loading the data

In [None]:
# Print the full path of every file available under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Every competition file sits in the same Kaggle input folder, so build the paths from it.
DATA_DIR = "/kaggle/input/telecom-churn-case-study-hackathon-c-68"

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sample_output = pd.read_csv(os.path.join(DATA_DIR, "sample.csv"))
data_dict = pd.read_csv(os.path.join(DATA_DIR, "data_dictionary.csv"))

# Report (rows, columns) of each loaded frame as a quick sanity check.
print('The shape of the train dataset is: ', train_df.shape)
print('The shape of the test dataset is: ', test_df.shape)
print('The shape of the sample output is: ', sample_output.shape)
print('The shape of the data dictionary is: ', data_dict.shape)

### Loading the data dictionary for understanding the data at an uber level

In [None]:
# Display the data dictionary frame loaded from data_dictionary.csv.
data_dict

### Separating the dependent and independent variable

#### Doing a basic df.head() to do an overview of the dataframes

In [None]:
# Show a single row of the training frame to see which columns it carries.
train_df.head(1)

In [None]:
# Show a single row of the test frame for comparison with the training frame.
test_df.head(1)

#### Since model scores must be submitted on the test set, we hold out a validation set from the training data in order to understand how good the model is.

In [None]:
# Hold out 10% of the labelled rows as a validation set; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(columns='churn_probability'),
    train_df['churn_probability'],
    test_size=0.10,
    random_state=104,
    shuffle=True,
)

# Keep the row identifiers aside, then remove them from the feature matrices.
id_train, id_val = X_train[['id']], X_val[['id']]
X_train = X_train.drop(columns='id')
X_val = X_val.drop(columns='id')

print('Shape of X_train is: ', X_train.shape)
print('Shape of y_train is: ', y_train.shape)
print('Shape of X_val is: ', X_val.shape)
print('Shape of y_val is: ', y_val.shape)

# Work on a copy of the test frame under the same X_* naming used for the other splits.
X_test = test_df.copy()

#### EDA and basic feature selection performed on the train dataset only

In [None]:
class DataFrameAnalyzer:
    """Quick-look EDA helper that reports on, and prunes columns of, a DataFrame.

    The frame is held by reference, so the ``drop_*`` methods modify the
    caller's DataFrame in place.
    """

    def __init__(self, df):
        """Keep a reference to the DataFrame to analyse."""
        self.df = df

    def _null_percentages(self):
        """Return the percentage of null entries of every column as a Series."""
        return (self.df.isnull().sum() * 100) / self.df.shape[0]

    def _constant_columns(self):
        """Return the names of columns holding exactly one distinct non-null value."""
        return [name for name, distinct in self.df.nunique().items() if distinct == 1]

    def basics(self):
        """Print the (rows, columns) shape of the DataFrame."""
        print("\nDataFrame Description:")
        print(self.df.shape)

    def percentage_nulls(self):
        """Print the percentage of null values of every column."""
        print("\nPercentage of Null Values Per Column:")
        print(self._null_percentages())

    def single_value_columns(self):
        """Print the columns that hold one single value throughout (or "None")."""
        print("\nColumns with a Single Unique Value:")
        constant_cols = self._constant_columns()
        print(constant_cols or "None")

    def drop_high_null_columns(self, threshold):
        """Drop, in place, every column whose null percentage exceeds ``threshold``."""
        null_pct = self._null_percentages()
        high_null_cols = null_pct.index[null_pct > threshold].tolist()
        self.df.drop(columns=high_null_cols, inplace=True)
        print("\nDropped Columns with High Null Values:", high_null_cols)

    def drop_single_value_columns(self):
        """Drop, in place, every column that contains a single unique value."""
        single_value_cols = self._constant_columns()
        self.df.drop(columns=single_value_cols, inplace=True)
        print("\nDropped Columns with Single Unique Value:", single_value_cols)

# Wrap the training features (by reference) and print their shape, the
# per-column null percentages and the columns holding a single value.
initial_analyzer = DataFrameAnalyzer(X_train)
initial_analyzer.basics()
initial_analyzer.percentage_nulls()
initial_analyzer.single_value_columns()

#### Based on the preliminary EDA, we can immediately remove the columns with more than 70% null values, as they will not provide enough information to the ML models. Additionally, we will remove the columns that hold a single value throughout.

#### However, we first need to check whether some of these are categorical columns; those need not be deleted and can instead have their nulls filled with 0.

In [None]:
# List of columns for check if they are categorical and need not require deletion even if nulls > 70 %
# EDA is restricted to the training split, so the checks below read X_train
# (not train_df, which still contains the rows held out for validation).

# Columns to check: if they are categorical they need not be deleted even when nulls > 70 %
cat_check = ['night_pck_user_6', 'night_pck_user_7', 'night_pck_user_8', 'fb_user_6', 'fb_user_7', 'fb_user_8']
for col in cat_check:
    print(col, X_train[col].value_counts())
print('--'*60)
# If most of the values are unique and the column is ~74% null, it can be deleted
unique_check = ['date_of_last_rech_data_6', 'date_of_last_rech_data_7', 'date_of_last_rech_data_8', 'total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8', 'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8', 'count_rech_2g_6', 'count_rech_2g_7', 'count_rech_2g_8', 'count_rech_3g_6', 'count_rech_3g_7', 'count_rech_3g_8', 'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8', 'arpu_3g_6', 'arpu_3g_7', 'arpu_3g_8', 'arpu_2g_6', 'arpu_2g_7', 'arpu_2g_8']
for col in unique_check:
    print(col, X_train[col].nunique())

In [None]:
# In order to not drop these columns in the below method call, already replacing the columns null values by 0.
# Later these features can be leveraged for feature engineering

# The user-flag columns are kept: their nulls become 0 so the high-null drop below leaves them alone.
usage_flag_cols = ['night_pck_user_6', 'night_pck_user_7', 'night_pck_user_8', 'fb_user_6', 'fb_user_7', 'fb_user_8']
X_train[usage_flag_cols] = X_train[usage_flag_cols].fillna(0, axis = 1)

In [None]:
# Both calls modify X_train in place (the analyzer holds it by reference):
# first drop columns with more than 70% nulls, then columns with one unique value.
initial_analyzer.drop_high_null_columns(70)
initial_analyzer.drop_single_value_columns()

In [None]:
# Checking if we have removed the required columns
# Checking if we have removed the required columns: the column count printed
# here should be lower than the one printed right after the split.
print(X_train.shape)
X_train.head()

In [None]:
class CorrelationAnalyzer:
    """Builds a long-format table of pairwise correlations for a DataFrame."""

    def __init__(self, df):
        """Initialize with a DataFrame."""
        self.df = df

    def get_correlation_dataframe(self):
        """Return the pairwise correlations between numerical features in long format.

        Only numeric and boolean columns are correlated; any other column
        (e.g. date strings) is ignored instead of making ``corr()`` fail.

        Returns:
            pd.DataFrame: columns ``Variable1``, ``Variable2`` and ``Correlation``,
            one row per ordered pair of distinct variables. Both orientations
            of a pair, (a, b) and (b, a), are present. Pairs whose correlation
            is NaN are left out because ``stack()`` drops them.
        """
        result_columns = ['Variable1', 'Variable2', 'Correlation']
        numeric_df = self.df.select_dtypes(include=['number', 'bool'])
        if numeric_df.shape[1] == 0:
            # Nothing to correlate: return an empty table with the expected schema.
            return pd.DataFrame(columns=result_columns)

        corr_matrix = numeric_df.corr().stack().reset_index()
        corr_matrix.columns = result_columns
        # Remove the self-correlations on the diagonal.
        corr_matrix = corr_matrix[corr_matrix['Variable1'] != corr_matrix['Variable2']]
        return corr_matrix

# The three date columns are left out of the correlation analysis.
# Note: `corr_df` first names the analyzer object and is then rebound to the
# long-format correlation table returned by it.
corr_df = CorrelationAnalyzer(X_train.drop(['date_of_last_rech_6', 'date_of_last_rech_7', 'date_of_last_rech_8'], axis = 1))
corr_df = corr_df.get_correlation_dataframe()

In [None]:
# Strongest correlations first.
corr_df = corr_df.sort_values(by = 'Correlation', ascending = False)

# Every pair appears twice, as (a, b) and (b, a). Keep exactly one row per
# unordered pair by de-duplicating on an order-independent key. Taking every
# second row instead would assume the two orientations are always adjacent
# (false when different pairs tie on the correlation value) and would halve
# the table again each time the cell is re-run.
pair_keys = pd.Series(
    [tuple(sorted(pair)) for pair in zip(corr_df['Variable1'], corr_df['Variable2'])],
    index=corr_df.index,
    dtype=object,
)
# keep='last' retains the second orientation of each pair in sorted order.
corr_df = corr_df[~pair_keys.duplicated(keep='last')]
corr_df

#### As there are a lot of columns, we plan to drop correlated variables above a correlation of 0.80. We can tweak this threshold depending on the results.

In [None]:
# Select the strongly correlated pairs once, then split them into their two sides.
strong_pairs = corr_df[corr_df['Correlation']>.8]
variable1_80 = strong_pairs['Variable1'].to_list()
variable2_80 = strong_pairs['Variable2'].to_list()

# Variables present on both sides: dropping the second list removes these too.
set(variable1_80) & set(variable2_80)

##### We can now drop all variables from the second list as we will still retain information from the first list

In [None]:
# Remove, in place, the second member of every pair correlated above 0.8.
# This cell is not re-runnable: a second run raises KeyError because the
# columns are already gone.
X_train.drop(variable2_80, inplace = True, axis = 1)

In [None]:
# Confirm how many feature columns remain after dropping the correlated ones.
print(X_train.shape)
X_train.head(2)

### Indepth EDA and analysis to understand variables

In [None]:
def plot_numeric_distributions(df, target_column = 'churn_probability', target = None):
    """
    Plots the distribution of numerical columns across the categorical target column (0 and 1).

    One histogram (with KDE overlay) is drawn per numerical column, coloured
    by the target class. The legend is the one seaborn generates from the hue
    mapping. It is not overridden with hand-written labels, because those are
    matched to the drawn artists by position and can end up swapped.

    Parameters:
    df (pd.DataFrame): Input dataframe of features.
    target_column (str): Name given to the categorical column with 0 and 1 values.
    target (pd.Series or array-like, optional): Class label of each row of ``df``.
        A Series is aligned to ``df`` on its index; any other array-like is
        matched by position. Defaults to the notebook-level ``y_train``.
    """
    if target is None:
        target = y_train  # notebook-level training labels

    if isinstance(target, pd.Series):
        labels = target.rename(target_column)
    else:
        labels = pd.Series(np.asarray(target), index=df.index, name=target_column)

    # Only numerical columns are plotted; this also skips the date columns.
    numeric_cols = df.select_dtypes(include='number').columns

    plot_df = pd.concat([df, labels], axis = 1)

    for col in numeric_cols:
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.histplot(data=plot_df, x=col, hue=target_column, kde=True, bins=30, alpha=0.5, ax=ax)
        ax.set_title(f"Distribution of {col} by {target_column}")
        plt.show()
        plt.close(fig)  # release the figure; one is created per column

plot_numeric_distributions(X_train)

#### At the first go with the graphs, we see that most of the remaining variables do show some type of pattern, therefore, rather than feature selection, we can also approach via PCA

### Outlier Treatment

Before going ahead with PCA, since there are a lot of columns, it is better to see how many outliers each column has and then remove, reduce, or impute them. Hence we do the outlier treatment beforehand.

In [None]:
def detect_outliers_boxplot(data):
    """Count, per column, the rows lying outside the box-plot whiskers.

    A value is an outlier when it is below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.

    Parameters:
    data (pd.DataFrame): Numerical columns to inspect.

    Returns:
    dict: column name -> number of outlying rows in that column.
    """
    def _count_outside_whiskers(series):
        q1, q3 = series.quantile(0.25), series.quantile(0.75)
        whisker = 1.5 * (q3 - q1)
        outside = (series < q1 - whisker) | (series > q3 + whisker)
        return int(outside.sum())

    return {col: _count_outside_whiskers(data[col]) for col in data.columns}

# Function to detect outliers based on Z-score criteria
def detect_outliers_zscore(data, threshold=3):
    """Count, per column, the rows whose absolute z-score exceeds ``threshold``.

    The z-score uses the column mean and the population standard deviation
    (ddof=0), both computed over the non-null values only. A missing value
    therefore does not hide the outliers of its column. Missing values
    themselves are never counted as outliers.

    Parameters:
    data (pd.DataFrame): Numerical columns to inspect.
    threshold (float): Absolute z-score above which a value is an outlier (default=3).

    Returns:
    dict: column name -> number of outlying rows in that column. Constant or
    all-null columns report 0.
    """
    outliers = {}
    for col in data.columns:
        values = data[col]
        spread = values.std(ddof=0)
        if not spread > 0:
            # Constant or all-null column: the z-score is undefined, so no outliers.
            outliers[col] = 0
            continue
        z_scores = ((values - values.mean()) / spread).abs()
        outliers[col] = int((z_scores > threshold).sum())
    return outliers

# Function for outlier treatment: You can either remove or cap the outliers
def treat_outliers(data, method='remove', criteria='boxplot', threshold=3):
    """Remove or cap the outliers of every column of ``data``.

    Columns are processed one after another; in 'remove' mode each column's
    statistics are computed on the rows that survived the previous columns.
    The input frame is never modified: the function works on a copy.

    Parameters:
    data (pd.DataFrame): Numerical columns to treat.
    method (str): 'remove' drops the rows holding an outlier, 'cap' clips the
        outlying values to the nearest allowed bound.
    criteria (str): 'boxplot' uses the [Q1 - 1.5*IQR, Q3 + 1.5*IQR] range,
        'zscore' uses mean +/- ``threshold`` population standard deviations.
    threshold (float): z-score limit, only used when ``criteria='zscore'``.

    Returns:
    pd.DataFrame: The treated copy. Any other ``method``/``criteria`` value
    leaves the data untouched. In 'remove' mode, rows with a missing value in
    a treated column are dropped too, because they fail the range test.
    """
    data = data.copy()  # never write into the caller's frame

    if criteria == 'boxplot':
        for col in data.columns:
            Q1 = data[col].quantile(0.25)
            Q3 = data[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            if method == 'remove':
                data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]
            elif method == 'cap':
                data[col] = np.where(data[col] > upper_bound, upper_bound, data[col])
                data[col] = np.where(data[col] < lower_bound, lower_bound, data[col])

    elif criteria == 'zscore':
        for col in data.columns:
            # Mean/std skip missing values, so one NaN does not void the whole column.
            center = data[col].mean()
            spread = data[col].std(ddof=0)
            if not spread > 0:
                # Constant or all-null column: no z-score, hence nothing to treat.
                continue
            if method == 'remove':
                z_scores = ((data[col] - center) / spread).abs()
                data = data[z_scores < threshold]
            elif method == 'cap':
                # Cap at mean +/- threshold*std, i.e. where |z| equals the threshold.
                data[col] = data[col].clip(lower=center - threshold * spread,
                                           upper=center + threshold * spread)

    return data

In [None]:
# Detect outliers using both methods
# Count the outliers of every numerical column under both criteria; the date columns are excluded.
date_columns = ['date_of_last_rech_6', 'date_of_last_rech_7', 'date_of_last_rech_8']
numeric_train = X_train.drop(date_columns, axis = 1)
boxplot_outliers = detect_outliers_boxplot(numeric_train)
zscore_outliers = detect_outliers_zscore(numeric_train)

# Per-column outlier counts, one dictionary per criterion
print("Outliers by Boxplot Criteria:", boxplot_outliers)
print('--'*60)
print("Outliers by Z-score Criteria:", zscore_outliers)

In [None]:
# Average number of outlying rows per column: box-plot criterion first, z-score second.
print(np.mean(list(boxplot_outliers.values())))
print(np.mean(list(zscore_outliers.values())))

### As of now, it seems we might not lose a lot of rows if we remove the outliers via the z-score method. We will follow this process and remove the rows. We will also check the final percentage of rows we lost.

In [None]:
# Hand-picked columns that receive outlier treatment. The names are hard-coded,
# so every one of them must still be present in X_train after the column drops above.
outlier_trt_rqd = ['arpu_7', 'total_rech_num_6', 'total_rech_num_7', 'total_rech_num_8', 'total_rech_amt_6', 'total_rech_amt_8'
                   , 'max_rech_amt_6', 'max_rech_amt_7', 'max_rech_amt_8', 'last_day_rch_amt_6', 'last_day_rch_amt_7'
                   , 'last_day_rch_amt_8', 'vol_2g_mb_6', 'vol_2g_mb_7', 'vol_2g_mb_8', 'vol_3g_mb_6', 'vol_3g_mb_7'
                   , 'vol_3g_mb_8', 'night_pck_user_6', 'night_pck_user_7', 'night_pck_user_8', 'monthly_2g_6', 'monthly_2g_7'
                   , 'monthly_2g_8', 'sachet_2g_6', 'sachet_2g_7', 'sachet_2g_8', 'monthly_3g_6', 'monthly_3g_7', 'monthly_3g_8'
                   , 'sachet_3g_6', 'sachet_3g_7', 'sachet_3g_8', 'aon', 'aug_vbc_3g', 'jul_vbc_3g', 'jun_vbc_3g']

In [None]:
# Dry run: how many rows would survive if outliers were removed under each criterion?
# The results are only used to compare shapes; X_train itself is not reassigned here.
print(X_train.shape)
df_cleaned = treat_outliers(X_train[outlier_trt_rqd], method='remove', criteria='boxplot')
df_cleaned2 = treat_outliers(X_train[outlier_trt_rqd], method='remove', criteria='zscore')
print(df_cleaned.shape, df_cleaned2.shape)

### We will lose a lot of data if we are removing outliers, therefore we will cap the outliers

In [None]:
# Cap (rather than remove) the outliers of the selected columns on a copy of X_train,
# so the row count stays the same; the two shapes printed below should match.
# NOTE(review): the capping bounds are not stored anywhere, so they cannot be
# re-applied to the validation/test features later - confirm this is intended.
X_train_cleaned = X_train.copy()
X_train_cleaned[outlier_trt_rqd] = treat_outliers(X_train[outlier_trt_rqd], method='cap', criteria='zscore')
print(X_train.shape, X_train_cleaned.shape)

#### Imputing the rest of the columns with null values by median, this will automatically now take care of categorical variables as well

In [None]:
# Dropping three date columns for the time being
# The date columns are removed before imputation and are not used as features from here on.
X_train_cleaned = X_train_cleaned.drop(['date_of_last_rech_6', 'date_of_last_rech_7', 'date_of_last_rech_8'], axis = 1)

In [None]:
# Fit the median imputer on the training features and rebuild a labelled frame from
# its array output. The new frame has a fresh 0..n-1 index, not the index of X_train_cleaned.
imputer = SimpleImputer(strategy='median')
X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train_cleaned),
    columns=X_train_cleaned.columns,
)

### Performing PCA to 104 existing columns to reduce the feature list for easy computation while also retaining the variance/ information from the data

Defining a function to see how many principal components (KPIs) need to be kept

In [None]:
def apply_pca(df, variance_threshold=0.95, random_state=42):
    """
    Applies PCA on a numerical dataframe after scaling and determines the number of components 
    needed to retain the given variance threshold.

    A full PCA is fitted first to obtain the cumulative explained-variance
    curve (which is also plotted). A second PCA, restricted to the selected
    number of components, is then fitted and used for the transformation.

    Parameters:
    df (pd.DataFrame): Input dataframe (should only contain numerical columns).
    variance_threshold (float): The cumulative variance threshold to retain features (default=95%).
    random_state (int or None): Seed passed to both PCA fits so repeated runs
        give the same components when a randomized solver is used (default=42).

    Returns:
    pca_df (pd.DataFrame): Transformed dataframe with reduced dimensions (columns PC1..PCk,
        fresh 0..n-1 index).
    pca_final (PCA object): Fitted PCA model for further transformations.
    scaler (StandardScaler object): Fitted scaler for future transformations.
    """
    # Standardizing the features (PCA works better with standardized data)
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df)

    # Apply PCA with all components to get the full variance curve
    pca = PCA(random_state=random_state)
    pca.fit(scaled_data)

    # Calculate the cumulative explained variance ratio
    explained_variance = np.cumsum(pca.explained_variance_ratio_)

    # Find the number of components to keep based on the variance threshold.
    # np.argmax returns 0 when no entry is True, which would silently select a
    # single component. If the threshold is never reached (e.g. a threshold of
    # 1.0 missed through floating-point rounding), keep every component instead.
    reached = explained_variance >= variance_threshold
    if reached.any():
        num_components = int(np.argmax(reached)) + 1
    else:
        num_components = len(explained_variance)
    print(f"Optimal number of components to retain {variance_threshold*100}% variance: {num_components}")

    # Plot explained variance
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, len(explained_variance) + 1), explained_variance, marker='o', linestyle='--')
    plt.axhline(y=variance_threshold, color='r', linestyle='--', label=f'{variance_threshold*100}% Variance')
    plt.xlabel("Number of Principal Components")
    plt.ylabel("Cumulative Explained Variance")
    plt.title("Explained Variance vs. Number of Components")
    plt.legend()
    plt.grid(True)
    plt.show()

    # Apply PCA with the optimal number of components
    pca_final = PCA(n_components=num_components, random_state=random_state)
    pca_data = pca_final.fit_transform(scaled_data)

    # Convert to DataFrame
    pca_df = pd.DataFrame(pca_data, columns=[f"PC{i+1}" for i in range(num_components)])
    
    return pca_df, pca_final, scaler 

In [None]:
# Exploratory PCA on all imputed features at 95% variance. The `pca_final` and
# `scaler` bound here are overwritten by the later PCA on the selected features.
X_train_pca, pca_final, scaler = apply_pca(X_train_imputed, variance_threshold=0.95)

#### Even after performing PCA, we are still left with around 69 components for 95% variance and 37 for 80% variance. Therefore, the process will be to first fit a basic RandomForest model on the 101 features, then use its feature importances to keep only the most relevant features (the code below keeps the top 50), and then perform PCA on those.

### Base Model -  to check how the results are

#### This is a one-time step: by identifying the important variables we discard the non-important features up front. There is no reason to keep all the variables and then retain 95% variance if not all of them are important to us.

The following snippet is not for testing a model, but to just understand what features come as most important

In [None]:
# Baseline forest, fitted only to rank the features by importance.
rf_base = RandomForestClassifier(n_estimators=100, random_state=42)
rf_base.fit(X_train_imputed, y_train)
# Predictions are made on the same rows the forest was trained on, so the
# metrics below are training-set metrics and not an estimate of generalisation.
y_pred_train = rf_base.predict(X_train_imputed)

accuracy = accuracy_score(y_train, y_pred_train)
print(f"Model Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_train, y_pred_train))

In [None]:
# Rank the features by the forest's importance score, highest first.
feature_importance_df = pd.DataFrame(
    {
        'Feature': X_train_imputed.columns,
        'Importance': rf_base.feature_importances_,
    }
).sort_values(by="Importance", ascending=False)

# Names of the 50 most important features, used to build the modelling dataset.
top_50_features = feature_importance_df['Feature'].head(50).tolist()
top_50_features

### Creating the final dataset for modeling activity

In [None]:
# Restrict the imputed training features to the 50 most important columns.
X_train_pre_final = X_train_imputed[top_50_features]
print(X_train_pre_final.shape)

In [None]:
# Final PCA on the selected features at 90% variance. The `pca_final` and `scaler`
# returned here are the ones reused to transform the validation and test sets.
X_train_final, pca_final, scaler = apply_pca(X_train_pre_final, variance_threshold=0.90)

### Keeping 90% of the variance, as we noticed overfitting in the base model that was used to filter the important features
### Selected only the top 50 variables from the RF model as important because it was clearly overfitting

In [None]:
# First rows of the principal-component frame used for modelling.
X_train_final.head()

#### Modeling -
#### Now, we will run iterations of modeling activity to check how the models are performing on train and validation set
#### The best model will be selected for further hyperparameter tuning

#### Preparing the validation and the test set to be in the same format

In [None]:
# Keep, in the same order, only the columns the imputer was fitted on,
# so validation and test can go through the same fitted transformers.
X_test = X_test[X_train_imputed.columns]
X_val = X_val[X_train_imputed.columns]

print(X_val.shape)
print(X_test.shape)
X_val.head(2)

#### Impute the variables with the training dataset values

#### Steps on X_val

In [None]:
# Same treatment as the training set: nulls of the user-flag columns become 0.
X_val[['night_pck_user_6', 'night_pck_user_7', 'night_pck_user_8', 'fb_user_6', 'fb_user_7', 'fb_user_8']] = X_val[['night_pck_user_6', 'night_pck_user_7', 'night_pck_user_8', 'fb_user_6', 'fb_user_7', 'fb_user_8']].fillna(0, axis = 1)

# Fill the remaining nulls with the medians learned on the training set.
X_val_imputed = pd.DataFrame(
    imputer.transform(X_val),
    columns=X_val.columns,
    index=X_val.index,
)

# Select the top features FROM THE IMPUTED FRAME. Selecting them from the raw
# X_val would discard the median imputation and leave NaNs to be zero-filled,
# which is not how the training features were prepared.
X_val_imputed = X_val_imputed[top_50_features]
X_val_scaled = scaler.transform(X_val_imputed)

# NOTE(review): the z-score capping applied to the training features is not
# replayed here - confirm whether validation data should be capped as well.
X_val_final = pca_final.transform(X_val_scaled)
# Name the columns after the number of components the fitted PCA actually kept.
X_val_final = pd.DataFrame(X_val_final, columns=[f"PC{i+1}" for i in range(pca_final.n_components_)])
print(X_val_final.shape)
X_val_final.head()

#### Steps on X_test

In [None]:
# Same treatment as the training set: nulls of the user-flag columns become 0.
X_test[['night_pck_user_6', 'night_pck_user_7', 'night_pck_user_8', 'fb_user_6', 'fb_user_7', 'fb_user_8']] = X_test[['night_pck_user_6', 'night_pck_user_7', 'night_pck_user_8', 'fb_user_6', 'fb_user_7', 'fb_user_8']].fillna(0, axis = 1)

# Fill the remaining nulls with the medians learned on the training set.
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Select the top features FROM THE IMPUTED FRAME. Selecting them from the raw
# X_test would discard the median imputation and leave NaNs to be zero-filled,
# which is not how the training features were prepared.
X_test_imputed = X_test_imputed[top_50_features]
X_test_scaled = scaler.transform(X_test_imputed)

# NOTE(review): the z-score capping applied to the training features is not
# replayed here - confirm whether test data should be capped as well.
X_test_final = pca_final.transform(X_test_scaled)
# Name the columns after the number of components the fitted PCA actually kept.
X_test_final = pd.DataFrame(X_test_final, columns=[f"PC{i+1}" for i in range(pca_final.n_components_)])
print(X_test_final.shape)
X_test_final.head()

### Running through a loop of Classifier models to finalize the best model before going ahead with hyperparameter tuning

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
def evaluate_models(models, X_train, y_train, X_test, y_test, cv=5, scoring="accuracy"):
    """
    Evaluates multiple models using cross-validation on the train set and reports test set scores.

    The same metric is used for both numbers: ``scoring`` drives the
    cross-validation and the hold-out evaluation alike, so the two printed
    scores are directly comparable.

    Parameters:
    models (dict): Dictionary of models with model names as keys and instances as values.
        Each instance is fitted in place on the full training set.
    X_train (pd.DataFrame or np.array): Training feature set.
    y_train (pd.Series or np.array): Training labels.
    X_test (pd.DataFrame or np.array): Test feature set.
    y_test (pd.Series or np.array): Test labels.
    cv (int): Number of cross-validation folds (default=5).
    scoring (str, callable or None): Scoring metric (default="accuracy"). A string is
        resolved with sklearn's ``get_scorer``; a callable must accept
        ``(estimator, X, y)``; ``None`` falls back to each model's own ``score``.

    Prints:
    - Model name
    - Cross-validation mean score
    - Test set score
    """
    from sklearn.metrics import get_scorer

    # Resolve the metric once so the hold-out score matches the CV metric.
    if scoring is None:
        scorer = None
    elif callable(scoring):
        scorer = scoring
    else:
        scorer = get_scorer(scoring)

    for name, model in models.items():
        # Perform cross-validation on training data (works on clones of the model)
        cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring)
        
        # Train model on full training set
        model.fit(X_train, y_train)
        
        # Evaluate on test set with the same metric as the cross-validation
        if scorer is None:
            test_score = model.score(X_test, y_test)
        else:
            test_score = scorer(model, X_test, y_test)

        print(f"Model: {name}")
        print(f"  - Cross-Validation Score (Mean): {cv_scores.mean():.4f}")
        print(f"  - Test Set Score: {test_score:.4f}")
        print("-" * 50)

In [None]:
# Candidate classifiers with default hyper-parameters. The models that involve
# randomness are seeded so the comparison gives the same ranking on every run.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}

# Cross-validate on the training components and score on the validation components
evaluate_models(models, X_train_final, y_train, X_val_final, y_val, cv=5, scoring="accuracy")

### Looking at the basic results from multiple models, XGBoost seems to give comparatively better results. Hence we proceed with XGBoost for further hyperparameter tuning.

In [None]:
xgb_base = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 300, 500, 700],  # Number of boosting rounds
    'max_depth': [3, 5, 7, 9],  # Tree depth
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Step size shrinkage
    'subsample': [0.6, 0.8, 1.0],  # Fraction of samples used per tree
    'colsample_bytree': [0.6, 0.8, 1.0],  # Fraction of features used per tree
    'gamma': [0, 0.1, 0.2, 0.3],  # Minimum loss reduction required to make further partitions
    'reg_alpha': [0, 0.1, 0.5, 1.0],  # L1 regularization
    'reg_lambda': [0, 0.1, 0.5, 1.0]  # L2 regularization
}

# --------------------------------
# Performing Random Search First to get results faster
# --------------------------------
random_search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_grid,
    n_iter=20,  # Number of random combinations to try
    scoring='accuracy',
    cv=5,  # 5-fold Cross-Validation
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit the model with RandomizedSearchCV
random_search.fit(X_train_final, y_train)

# Print best parameters from Randomized Search
print("Best Parameters from Randomized Search:")
print(random_search.best_params_)

# --------------------------------
# Perform Grid Search (Fine-Tuning)
# --------------------------------
# Select best parameters from Randomized Search
best_params = random_search.best_params_

# Refine the three most influential parameters around their best values.
# The lower n_estimators candidate is floored at 50 so that a best value of
# 100 does not produce a useless candidate with 0 boosting rounds; the set
# removes duplicates should the floor coincide with another candidate.
refined_grid = {
    'n_estimators': sorted({
        max(50, best_params['n_estimators'] - 100),
        best_params['n_estimators'],
        best_params['n_estimators'] + 100,
    }),
    'max_depth': [best_params['max_depth'] - 1, best_params['max_depth'], best_params['max_depth'] + 1],
    'learning_rate': [best_params['learning_rate'] * 0.8, best_params['learning_rate'], best_params['learning_rate'] * 1.2]
}
# Pin every other parameter found by the random search as a single-value entry.
# Without this the grid search would fall back to the XGBoost defaults for them
# and would no longer be refining the configuration the random search selected.
# As grid entries they also end up in grid_search.best_params_.
refined_grid.update(
    {name: [value] for name, value in best_params.items() if name not in refined_grid}
)

grid_search = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
    param_grid=refined_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1
)

# Fit the model with GridSearchCV
grid_search.fit(X_train_final, y_train)

# Print best parameters from Grid Search
print("\nBest Parameters from Grid Search:")
print(grid_search.best_params_)

In [None]:
# Build the final classifier from the parameters selected by the grid search.
final_model = XGBClassifier(
    use_label_encoder=False, 
    eval_metric="logloss",
    **grid_search.best_params_,
    # scale_pos_weight = 60,  (class re-weighting, currently disabled)
    random_state=42
)

# Fit on the full training components.
final_model.fit(X_train_final, y_train)

# Predict and evaluate on the training set (the rows the model was fitted on)
y_pred_train = final_model.predict(X_train_final)
accuracy_train = accuracy_score(y_train, y_pred_train)

print(f"\nFinal Model Train Accuracy: {accuracy_train:.4f}")
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))

# Predict and evaluate on the held-out validation set
y_pred_val = final_model.predict(X_val_final)
accuracy_val = accuracy_score(y_val, y_pred_val)
print('*'*60)

print(f"\nFinal Model Val Accuracy: {accuracy_val:.4f}")
print(confusion_matrix(y_val, y_pred_val))
print(classification_report(y_val, y_pred_val))

### Using predictions on unseen data

In [None]:
# The transformed test features and the raw test frame must have the same number of rows.
X_test_final.shape, test_df.shape

In [None]:
# Look at the sample file to see the layout the output is modelled on.
sample_output.head(2)

In [None]:
# Predict the class of every test row with the final model.
y_pred_test = final_model.predict(X_test_final)

test_ids = test_df[['id']]

# Put the ids and the predictions side by side under the two output column names.
output = pd.concat([test_ids, pd.DataFrame(y_pred_test)], axis = 1).set_axis(
    ['id', 'churn_probability'], axis=1
)
output.head()

In [None]:
# Write only the 'id' and 'churn_probability' columns; without index=False
# pandas would add the row index as an extra unnamed first column.
output.to_csv('priyanujmisra_rahulsrivastava.csv', index=False)