In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file beneath the Kaggle input
# directory, then print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/techparva3-datathon/sample_submission.csv
/kaggle/input/techparva3-datathon/train.csv
/kaggle/input/techparva3-datathon/test.csv


In [None]:
# Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preprocessing and Feature Engineering
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    cross_val_score,
    GridSearchCV,
    RandomizedSearchCV,
)
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.impute import KNNImputer
from category_encoders import TargetEncoder
from imblearn.combine import SMOTEENN

# Feature Selection
from sklearn.feature_selection import RFECV

# Model Selection
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Evaluation Metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score

# Suppress Warnings
import warnings
warnings.filterwarnings('ignore')

# Step 1: Load and Inspect Data
# Replace the path with your actual data path
train_csv_path = '/kaggle/input/techparva3-datathon/train.csv'
data = pd.read_csv(train_csv_path)

# Quick overview of the raw frame: dimensions, column names and the
# number of missing values in each column.
missing_per_column = data.isnull().sum()
print("Initial Data Shape:", data.shape)
print("Data Columns:", list(data.columns))
print("\nMissing Values:\n", missing_per_column)

# Step 2: Data Cleaning and Preprocessing
def _convert_emp_length(emp_length):
    """Translate an employment-length label into a number of years.

    Strings containing '< 1 year' map to 0.0 and strings containing
    '10+ years' map to 10.0; any other string is parsed from its first
    space-separated token. Non-string values and unparseable tokens
    yield NaN so they can be imputed later.
    """
    if not isinstance(emp_length, str):
        return np.nan
    if '< 1 year' in emp_length:
        return 0.0
    if '10+ years' in emp_length:
        return 10.0
    try:
        return float(emp_length.split(' ')[0])
    except ValueError:
        return np.nan


def clean_and_preprocess_data(df):
    """Clean a raw loan DataFrame and return a new, preprocessed frame.

    Steps, in order:
      1. Drop identifier/date/unused columns (absent columns are ignored).
      2. Replace 'emp_length' with a numeric 'emp_length_num' column.
      3. KNN-impute (5 neighbours) whichever of the known numeric columns
         are present; skipped entirely when none of them are.
      4. Drop exact duplicate rows.
      5. For each numeric column in turn, keep only rows whose value lies
         within 1.5 * IQR of the quartiles. Quartiles are recomputed after
         every column, so the filters are applied sequentially.
      6. Add 'interest_to_loan_ratio' = int_rate / (loan_amount + 1e-5)
         when both source columns exist.

    Args:
        df: Input DataFrame. It is not modified; every step works on a
            derived frame.

    Returns:
        The cleaned DataFrame (rows may have been removed, so the index is
        a subset of the input index).

    Side effects:
        Prints how many duplicate rows and per-column outliers were removed.
    """
    # 2.1 Drop Unnecessary Columns
    columns_to_drop = [
        'application_type',  # Assuming only 'INDIVIDUAL' is present
        'emp_title',         # High-cardinality; drop or process if meaningful
        'member_id',         # Unique identifier; drop to prevent data leakage
        'issue_date',        # Date; can extract features if needed
        'last_credit_pull_date',
        'last_payment_date',
        'next_payment_date'
    ]
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # 2.2 Convert 'emp_length' to Numeric Before Imputation
    if 'emp_length' in df.columns:
        df = df.assign(
            emp_length_num=df['emp_length'].apply(_convert_emp_length)
        ).drop(columns=['emp_length'])

    # 2.3 Handle Missing Values with KNN Imputer
    numerical_cols = [
        'annual_income', 'dti', 'installment', 'int_rate',
        'loan_amount', 'total_acc', 'total_payment', 'emp_length_num'
    ]
    existing_numerical_cols = [col for col in numerical_cols if col in df.columns]

    # Only build and fit the imputer when there is something to impute:
    # fitting on a zero-column selection raises instead of being a no-op.
    if existing_numerical_cols:
        imputer = KNNImputer(n_neighbors=5)
        df[existing_numerical_cols] = imputer.fit_transform(df[existing_numerical_cols])

    # 2.4 Remove Duplicates
    rows_before = df.shape[0]
    df = df.drop_duplicates()
    print(f"Removed {rows_before - df.shape[0]} duplicate rows.")

    # 2.5 Handle Outliers Using the IQR Method
    for col in existing_numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        rows_before = df.shape[0]
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
        print(f"Removed {rows_before - df.shape[0]} outliers from '{col}'.")

    # 2.6 Feature Engineering: Interest-to-Loan Ratio
    if 'int_rate' in df.columns and 'loan_amount' in df.columns:
        # assign() returns a fresh frame, so the new column is never written
        # onto a filtered slice of the previous one.
        df = df.assign(
            interest_to_loan_ratio=df['int_rate'] / (df['loan_amount'] + 1e-5)  # Avoid division by zero
        )

    return df

# Apply Cleaning and Preprocessing
data = clean_and_preprocess_data(data)
print("\nData Shape after Cleaning:", data.shape)
print("\nMissing Values after Imputation:\n", data.isnull().sum())

# Step 3: Encode Target Variable
# Assuming 'grade' is the target variable
label_encoder = LabelEncoder()
# assign() builds a new frame instead of writing a column onto a filtered slice
data = data.assign(grade_encoded=label_encoder.fit_transform(data['grade']))

# Step 4: Define Features (X) and Target (y)
X = data.drop(columns=['grade', 'grade_encoded'], errors='ignore')
y = data['grade_encoded']

# Step 5: Advanced Encoding for Categorical Variables
categorical_cols = ['address_state', 'home_ownership', 'loan_status', 'purpose', 'term', 'verification_status']
# Note: 'loan_status' can have categories like 'Current', 'Charged Off', etc.

# Target encoding replaces each category with a statistic of the target, so
# fitting it on every row would leak the hold-out labels into the features
# that the hold-out rows are later scored on. Fit it only on the rows that
# Step 8 will place in the training partition. train_test_split is
# deterministic for a given sample count, stratify vector and random_state,
# so repeating Step 8's arguments here selects exactly the same rows.
# Keep test_size / random_state / stratify in sync with Step 8.
train_positions, _ = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42, stratify=y
)

# Initialize Target Encoder
target_encoder = TargetEncoder(cols=categorical_cols)
target_encoder.fit(
    X.iloc[train_positions][categorical_cols],
    y.iloc[train_positions]
)
# Every row (train and hold-out) is encoded with the train-only statistics.
X[categorical_cols] = target_encoder.transform(X[categorical_cols])

# Step 6: Feature Selection - Remove Highly Correlated Features
def remove_correlated_features(df, threshold=0.9):
    """Drop one column from every pair of highly correlated numeric columns.

    The absolute Pearson correlation is computed over the numeric/boolean
    columns only; other columns are left untouched and never dropped.
    Only the strict upper triangle of the matrix is inspected, so of two
    correlated columns the one appearing later in the frame is removed.

    Args:
        df: Input DataFrame (not modified).
        threshold: A column is dropped when its absolute correlation with
            any earlier column is strictly greater than this value.

    Returns:
        (reduced_df, to_drop): the frame without the dropped columns and
        the list of dropped column names, in column order.
    """
    # Restrict to numeric data explicitly: DataFrame.corr() raises on
    # string columns in recent pandas versions.
    numeric = df.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric.corr().abs()

    # Mask everything except the strict upper triangle so each pair is
    # considered once and the diagonal (always 1.0) is ignored.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # NaN correlations (e.g. constant columns) compare False and are kept.
    to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]
    df = df.drop(columns=to_drop)
    return df, to_drop

X, dropped_features = remove_correlated_features(X, threshold=0.9)
print(f"\nDropped {len(dropped_features)} highly correlated features: {dropped_features}")

# Step 7: Create Interaction Features (Optional)
# Example: Creating a new feature by multiplying 'annual_income' and 'dti'
if 'annual_income' in X.columns and 'dti' in X.columns:
    X['income_dti'] = X['annual_income'] * X['dti']

# Step 8: Split the Data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\nTraining Samples: {X_train.shape[0]}, Testing Samples: {X_test.shape[0]}")

# Step 9: Feature Scaling with RobustScaler
# Scaling happens BEFORE resampling: SMOTEENN picks neighbours by distance,
# so on raw values the largest-magnitude columns (e.g. 'annual_income',
# 'income_dti') would decide almost alone which samples count as neighbours.
# The scaler is fitted on the real training rows only. The results are
# wrapped back into DataFrames so column names survive for later steps.
scaler = RobustScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Step 10: Handle Class Imbalance with SMOTEENN (training data only)
smote_enn = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train_scaled, y_train)
print(f"Original Training Samples: {X_train.shape[0]}, Resampled Training Samples: {X_train_resampled.shape[0]}")

# Step 11: Feature Selection with RFECV (Recursive Feature Elimination with Cross-Validation)
# A class-weighted random forest ranks the features; RFECV scores each
# candidate feature count by 5-fold stratified accuracy.
# NOTE(review): step=10 removes ten features per elimination round, so only a
# handful of feature counts are ever compared - confirm this is intended for
# a feature set of this size.
rf_settings = dict(n_estimators=100, random_state=42, class_weight='balanced')
rf_estimator = RandomForestClassifier(**rf_settings)

rfecv = RFECV(
    estimator=rf_estimator,
    step=10,
    cv=StratifiedKFold(5),
    scoring='accuracy',
    n_jobs=-1
).fit(X_train_resampled, y_train_resampled)
print(f"\nOptimal number of features: {rfecv.n_features_}")

# Keep only the columns RFECV retained, in both partitions
X_train_selected, X_test_selected = (
    rfecv.transform(partition) for partition in (X_train_resampled, X_test)
)

# Step 12: Model Training with XGBoost and Hyperparameter Tuning
xgb_model = XGBClassifier(
    objective='multi:softprob',
    num_class=len(np.unique(y)),
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
)

# Define Hyperparameter Grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [6, 10, 15],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# The grid above has 3**8 = 6561 combinations; with cv=3 an exhaustive
# search means 19,683 XGBoost fits, which does not finish in a notebook
# session. Sample a fixed number of combinations from the same grid instead.
# Raise N_SEARCH_ITER for a more thorough (and slower) search.
N_SEARCH_ITER = 30

# Initialize the search (same search space, fixed budget, reproducible draw)
grid_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=N_SEARCH_ITER,
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit the search
print("\nStarting Grid Search for XGBoost...")
grid_search.fit(X_train_selected, y_train_resampled)

# Best Estimator (refit on the full resampled training set by the search)
best_xgb = grid_search.best_estimator_
print(f"\nBest XGBoost Parameters: {grid_search.best_params_}")

# Step 13: Evaluate the Best Model
# Score the tuned XGBoost model on the held-out partition.
class_names = label_encoder.classes_
y_pred = best_xgb.predict(X_test_selected)

# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print(f"\nXGBoost Accuracy: {accuracy:.4f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion Matrix (rows = actual grade, columns = predicted grade)
cm = confusion_matrix(y_test, y_pred)
fig_xgb, ax_xgb = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax_xgb,
)
ax_xgb.set_xlabel('Predicted')
ax_xgb.set_ylabel('Actual')
ax_xgb.set_title('Confusion Matrix - XGBoost')
plt.show()

# Step 14: Cross-Validation for Robust Evaluation
# NOTE(review): these folds are drawn from the resampled training data, so
# the scores are not directly comparable to the hold-out accuracy above.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    best_xgb,
    X_train_selected,
    y_train_resampled,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
print(f"\nCross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Step 15: Feature Importance Visualization
importances = best_xgb.feature_importances_

# Label each importance with the real column name. RFECV may have been fitted
# on a plain array (the scaler returns one), in which case
# get_feature_names_out() can only produce placeholders such as 'x3'.
# X still carries the column names in the order the selector saw them, so
# index it with the selector's boolean support mask instead.
selected_mask = rfecv.support_
if len(selected_mask) == X.shape[1]:
    feature_names = X.columns[selected_mask].tolist()
elif hasattr(rfecv, 'get_feature_names_out'):
    # Fallback: X no longer lines up with what the selector was fitted on
    feature_names = rfecv.get_feature_names_out()
else:
    # If not available, create generic feature names
    feature_names = [f'Feature {i}' for i in range(importances.shape[0])]

feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_importance_df.head(20))
plt.title('Top 20 Feature Importances - XGBoost')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.show()

# Optional: Ensemble with Voting Classifier
# Soft-voting ensemble: averages the class probabilities of the tuned
# XGBoost model and a class-weighted random forest.
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    class_weight='balanced'
)
ensemble_members = [('xgb', best_xgb), ('rf', rf_model)]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft', n_jobs=-1)

# Fit Voting Classifier
print("\nTraining Voting Classifier...")
voting_clf.fit(X_train_selected, y_train_resampled)

# Predict on the held-out partition and report the same metrics as for XGBoost
y_pred_voting = voting_clf.predict(X_test_selected)
accuracy_voting = accuracy_score(y_test, y_pred_voting)
print(f"\nVoting Classifier Accuracy: {accuracy_voting:.4f}")
print("\nVoting Classifier Classification Report:")
print(classification_report(y_test, y_pred_voting, target_names=label_encoder.classes_))

# Confusion Matrix for Voting Classifier (rows = actual, columns = predicted)
cm_voting = confusion_matrix(y_test, y_pred_voting)
grade_labels = label_encoder.classes_
fig_voting, ax_voting = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm_voting,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=grade_labels,
    yticklabels=grade_labels,
    ax=ax_voting,
)
ax_voting.set_xlabel('Predicted')
ax_voting.set_ylabel('Actual')
ax_voting.set_title('Confusion Matrix - Voting Classifier')
plt.show()


Initial Data Shape: (75001, 27)
Data Columns: ['address_state', 'application_type', 'emp_length', 'emp_title', 'grade', 'home_ownership', 'issue_date', 'last_credit_pull_date', 'last_payment_date', 'loan_status', 'next_payment_date', 'member_id', 'purpose', 'term', 'verification_status', 'annual_income', 'dti', 'installment', 'int_rate', 'loan_amount', 'total_acc', 'total_payment', 'issue_date_year', 'issue_date_month', 'issue_date_day', 'issue_date_weekday', 'issue_date_hour']

Missing Values:
 address_state                0
application_type             0
emp_length                   0
emp_title                    0
grade                        0
home_ownership               0
issue_date                   0
last_credit_pull_date        0
last_payment_date            0
loan_status                  0
next_payment_date            0
member_id                    0
purpose                      0
term                         0
verification_status          0
annual_income             7500
dti