In [1]:
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from imblearn.over_sampling import  SMOTE, ADASYN
# from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,roc_auc_score, roc_curve,mean_squared_error
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils import resample
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek

In [15]:
# Import the training datasets by romoving Column5, Column9, Column14 (these columns have  )
x_train = pd.read_csv('/content/drive/MyDrive/pro_gst/Train_data/X_Train_Data_Input.csv').drop(['Column5','Column9','Column14'],axis=1)
y_train = pd.read_csv('/content/drive/MyDrive/pro_gst/Train_data/Y_Train_Data_Target.csv')

# Import the testing datasets by romoving Column5, Column9, Column14
x_test = pd.read_csv('/content/drive/MyDrive/pro_gst/Test_data/X_Test_Data_Input.csv').drop(['Column5','Column9','Column14'],axis=1)
y_test = pd.read_csv('/content/drive/MyDrive/pro_gst/Test_data/Y_Test_Data_Target.csv')

# Merging xtrain and y_train on the basis of ID
train_df = pd.merge(x_train, y_train, on='ID').drop('ID',axis=1)

In [16]:
# x_train.drop('ID',axis=1, inplace=True)
# y_train.drop('ID',axis=1,inplace=True)
x_test.drop('ID',axis=1, inplace=True)
y_test.drop('ID',axis=1,inplace=True)

In [17]:
# Handling missing values of Column6, Column8, Column15, Column0
train_df['Column6'] = train_df['Column6'].fillna(train_df['Column6'].interpolate())
train_df['Column8'] = train_df['Column8'].fillna(train_df['Column8'].interpolate())
train_df['Column15'] = train_df['Column15'].fillna(train_df['Column15'].interpolate())
# train_df['Column3'] = train_df['Column3'].fillna(train_df['Column3'].interpolate())
# train_df['Column4'] = train_df['Column4'].fillna(train_df['Column4'].interpolate())
train_df['Column0'] = train_df['Column0'].fillna(train_df['Column0'].interpolate())
xtrain = train_df.drop('target',axis=1)
ytrain = train_df['target']

In [18]:
# Ensuring the Nan data filling
train_df.isna().sum()

Unnamed: 0,0
Column0,0
Column1,0
Column2,0
Column3,126303
Column4,127710
Column6,0
Column7,0
Column8,0
Column10,0
Column11,0


In [5]:
# Imputation function here, to clear out the missing values by creating an ML model to predict it.
def custom_imput(df,target_column,algorithm='lr'):
  # Separate samples with missing target values
  df_missing = df[df[target_column].isnull()]
  df_complete = df[df[target_column].notnull()]
  if True in df_complete.isna().any().to_list():
    print("Found Nan")
    print(df_complete.isna().sum())
    return print("Remove Nan from the training dataset")
    # df_complete = df_complete.interpolate()

  # Prepare features and target
  feature_X = df_complete.drop(columns=[target_column])
  feature_y = df_complete[target_column]

  # Split the data
  feature_X_train, feature_X_test, feature_y_train, feature_y_test = train_test_split(feature_X, feature_y, test_size=0.2, random_state=42)

  # Prepare the data with missing target values
  X_missing = df_missing.drop(columns=[target_column])
  X_missing = X_missing.interpolate()

  if algorithm == 'lr':
      feature_model = LinearRegression()
  elif algorithm == 'rf':
      feature_model = RandomForestRegressor()
  elif algorithm == 'knn':
      feature_model = KNeighborsRegressor()
  else:
      raise ValueError("Invalid algorithm specified")

  feature_model.fit(feature_X_train, feature_y_train)
  feature_y_pred = feature_model.predict(feature_X_test)
  performance = mean_squared_error(feature_y_test, feature_y_pred, squared=False)
  print(f"Model RMSE: {performance}")

  # Predict missing values
  missing_pred = feature_model.predict(X_missing)

  # Update the original dataframe with predicted values
  df.loc[df[target_column].isnull(), target_column] = missing_pred

  return df

In [18]:
test_df = train_df.copy()
# test_df.drop(['Column3'], axis=1, inplace= True)
test_df.drop(['target'], axis=1, inplace= True)

In [19]:
# predicting column3 without column4, because it contains missing values.
col3 = custom_imput(test_df.drop('Column4',axis=1),'Column3','lr')['Column3']
test_df['Column3'] = col3
# predicting column4, including column3 in training df
col4 = custom_imput(test_df,'Column4','lr')['Column4']
test_df['Column4'] = col4
test_df['Column3'] = train_df['Column3']
# Again predicting column3 with column4 included in the traninig dataset, because column3 and column4 and highly correlated 
test_df['Column3'] = custom_imput(test_df,'Column3','lr')['Column3']

# # test_df['Column3'] = col3
imputed_df = test_df



Model RMSE: 0.9134149921070405




Model RMSE: 0.44017997531313013
Model RMSE: 0.4650938059075588




In [9]:
def display_class_distribution(y):
    """
    Display the distribution of classes in the target variable.

    Parameters:
    y (array-like): The target variable
    """
    print("Class distribution:")
    print(pd.Series(y).value_counts(normalize=True))

def random_oversampling(X, y):
    """
    Perform random oversampling on the minority class.

    Parameters:
    X (array-like): The feature matrix
    y (array-like): The target variable

    Returns:
    tuple: X_resampled, y_resampled
    """
    X_df = pd.DataFrame(X)
    y_df = pd.Series(y)

    # Separate majority and minority classes
    df_majority = X_df[y_df == y_df.value_counts().index[0]]
    df_minority = X_df[y_df == y_df.value_counts().index[-1]]

    # Upsample minority class
    df_minority_upsampled = resample(df_minority,
                                     replace=True,     # sample with replacement
                                     n_samples=len(df_majority),    # to match majority class
                                     random_state=42) # reproducible results

    # Combine majority class with upsampled minority class
    df_upsampled = pd.concat([df_majority, df_minority_upsampled])

    return df_upsampled.iloc[:, :-1].values, df_upsampled.iloc[:, -1].values

def random_undersampling(X, y):
    """
    Perform random undersampling on the majority class.

    Parameters:
    X (array-like): The feature matrix
    y (array-like): The target variable

    Returns:
    tuple: X_resampled, y_resampled
    """
    undersample = RandomUnderSampler(sampling_strategy='majority')
    X_under, y_under = undersample.fit_resample(X, y)
    return X_under, y_under

def smote_oversampling(X, y):
    """
    Perform SMOTE oversampling.

    Parameters:
    X (array-like): The feature matrix
    y (array-like): The target variable

    Returns:
    tuple: X_resampled, y_resampled
    """
    smote = SMOTE(random_state=42)
    X_smote, y_smote = smote.fit_resample(X, y)
    X_smote = pd.DataFrame(X_smote, columns=X.columns)
    y_smote = pd.Series(y_smote, name=y.name)
    return X_smote, y_smote

def adasyn_oversampling(X, y):
    """
    Perform ADASYN oversampling.

    Parameters:
    X (array-like): The feature matrix
    y (array-like): The target variable

    Returns:
    tuple: X_resampled, y_resampled
    """
    adasyn = ADASYN()
    X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
    return X_adasyn, y_adasyn

def smote_tomek_sampling(X, y):
    """
    Perform combined over- and under-sampling using SMOTE and Tomek links.

    Parameters:
    X (array-like): The feature matrix
    y (array-like): The target variable

    Returns:
    tuple: X_resampled, y_resampled
    """
    smt = SMOTETomek()
    X_smt, y_smt = smt.fit_resample(X, y)
    return X_smt, y_smt

In [10]:
xtrain = imputed_df
ytrain = train_df['target']
# imputed_df['target'] = ytrain
# df = imputed_df


In [11]:
# Operations / Functions to correct the data imbalance in the dataset
display_class_distribution(ytrain)
#
# # Choose one of the following methods:
# X_resampled, y_resampled = random_oversampling(xtrain, ytrain)
# # OR
# X_resampled, y_resampled = random_undersampling(xtrain, ytrain)
# # OR
X_resampled, y_resampled = smote_oversampling(xtrain, ytrain)
# # OR
# X_resampled, y_resampled = adasyn_oversampling(xtrain, ytrain)
# # OR
# X_resampled, y_resampled = smote_tomek_sampling(xtrain, ytrain)
#
print("Original dataset shape:", xtrain.shape)
print("Resampled dataset shape:", X_resampled.shape)
display_class_distribution(y_resampled)

Class distribution:
target
0    0.905706
1    0.094294
Name: proportion, dtype: float64
Original dataset shape: (785133, 19)
Resampled dataset shape: (1422200, 19)
Class distribution:
target
0    0.5
1    0.5
Name: proportion, dtype: float64


In [12]:
# Using XGBoost Classifier model to train the final dataset
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    use_label_encoder=False
)
model.fit(X_resampled, y_resampled)

Parameters: { "use_label_encoder" } are not used.



In [13]:
# Predicting the target with given test dataset
y_pred = model.predict(x_test)
# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Calculate AUC-ROC
auc_roc = roc_auc_score(y_test, y_pred)
print(f"\nAUC-ROC: {auc_roc:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.98

Confusion Matrix:
[[231267   5767]
 [   689  23989]]

AUC-ROC: 0.9739

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    237034
           1       0.81      0.97      0.88     24678

    accuracy                           0.98    261712
   macro avg       0.90      0.97      0.93    261712
weighted avg       0.98      0.98      0.98    261712

