In [7]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Read the raw training and testing splits of the customer churn dataset.
train_data = pd.read_csv(
    '/kaggle/input/customer-churn-dataset/customer_churn_dataset-training-master.csv'
)
test_data = pd.read_csv(
    '/kaggle/input/customer-churn-dataset/customer_churn_dataset-testing-master.csv'
)

# Columns shown in the preview printouts of this script.
important_columns = ['Age', 'Gender', 'Tenure', 'Usage Frequency', 'Churn']

# Print the head of those columns for each split, as loaded from disk.
for heading, frame in (
    ("Training Data - Important Columns (Before Preprocessing):", train_data),
    ("\nTesting Data - Important Columns (Before Preprocessing):", test_data),
):
    print(heading)
    print(frame[important_columns].head())

# Preprocessing the data
def preprocess_data(df):
    """Return a model-ready copy of a raw customer-churn DataFrame.

    Steps, in order:

    1. Integer-encode the categorical columns ``Gender``,
       ``Subscription Type`` and ``Contract Length``. Codes follow the sorted
       order of the values present in ``df``; a missing value receives the
       code right after the last category.
    2. Drop the ``CustomerID`` column.
    3. Fill the remaining missing feature values with the column median.

    The ``Churn`` column, when present, is deliberately *not* imputed: a row
    with an unknown label keeps NaN there so the caller can still drop it
    instead of training or evaluating on a made-up label.

    Note that the codes are derived from the values found in this frame
    alone, so two frames receive matching codes only when they contain the
    same set of categories.

    Args:
        df: DataFrame holding ``CustomerID`` and the three categorical
            columns listed above. It is not modified.

    Returns:
        A new DataFrame without ``CustomerID``, with integer-coded
        categorical columns and median-filled features.

    Raises:
        KeyError: If ``CustomerID`` or one of the categorical columns is
            absent from ``df``.
    """
    categorical_columns = ['Gender', 'Subscription Type', 'Contract Length']
    target_column = 'Churn'

    # Work on a copy so the column assignments below never leak into the
    # caller's DataFrame.
    df = df.copy()

    for col in categorical_columns:
        classes = sorted(df[col].dropna().unique())
        code_by_value = {value: code for code, value in enumerate(classes)}
        # Values absent from the mapping (i.e. missing ones) come back as NaN
        # from ``map``; give them their own trailing code.
        df[col] = df[col].map(code_by_value).fillna(len(classes)).astype(int)

    # Drop CustomerID since it's not useful for prediction
    df = df.drop(columns=['CustomerID'])

    # Median-impute features only. ``numeric_only`` keeps this working if a
    # non-numeric column is still present; the target is excluded on purpose.
    medians = df.median(numeric_only=True).drop(labels=target_column, errors='ignore')
    return df.fillna(medians)

# Discard rows whose target (Churn) is missing *before* preprocessing.
# preprocess_data imputes missing values, so filtering afterwards would no
# longer be able to tell which rows originally lacked a label.
# The explicit copy gives preprocessing an independent frame to work on.
train_data = train_data.dropna(subset=['Churn']).copy()
test_data = test_data.dropna(subset=['Churn']).copy()

# Preprocess training and testing data
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# View the first few rows of the training and testing data after preprocessing
print("\nTraining Data - After Preprocessing:")
print(train_data[important_columns].head())

print("\nTesting Data - After Preprocessing:")
print(test_data[important_columns].head())

# Separate the Churn target from the remaining feature columns for each
# split; the labels are cast to float.
y_train = train_data['Churn'].astype(float)
X_train = train_data.drop(columns=['Churn'])

y_test = test_data['Churn'].astype(float)
X_test = test_data.drop(columns=['Churn'])

# Wrap features and labels in XGBoost's DMatrix containers.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# Booster configuration: binary classification scored with AUC-ROC.
params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
)

# Fit the booster on the training matrix for 100 boosting rounds.
bst = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)

# Score the test matrix, then turn each score into a hard 0/1 label
# (1 only when the score is strictly above 0.5).
y_pred_proba = bst.predict(dtest)
y_pred = [int(prob > 0.5) for prob in y_pred_proba]

# Accuracy and the per-class report use the hard labels; AUC uses the raw
# scores.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

auc_score = roc_auc_score(y_test, y_pred_proba)
print(f'AUC Score: {auc_score:.4f}')

classification_rep = classification_report(y_test, y_pred)
print('Classification Report:')
print(classification_rep)


Training Data - Important Columns (Before Preprocessing):
    Age  Gender  Tenure  Usage Frequency  Churn
0  30.0  Female    39.0             14.0    1.0
1  65.0  Female    49.0              1.0    1.0
2  55.0  Female    14.0              4.0    1.0
3  58.0    Male    38.0             21.0    1.0
4  23.0    Male    32.0             20.0    1.0

Testing Data - Important Columns (Before Preprocessing):
   Age  Gender  Tenure  Usage Frequency  Churn
0   22  Female      25               14      1
1   41  Female      28               28      0
2   47    Male      27               10      0
3   35    Male       9               12      0
4   53  Female      58               24      0

Training Data - After Preprocessing:
    Age  Gender  Tenure  Usage Frequency  Churn
0  30.0       0    39.0             14.0    1.0
1  65.0       0    49.0              1.0    1.0
2  55.0       0    14.0              4.0    1.0
3  58.0       1    38.0             21.0    1.0
4  23.0       1    32.0             