In [None]:
from dotenv import load_dotenv
import os
from pathlib import Path

# Working directory of the notebook as a Path; stays None until intitate_notebook()
# has switched the process into the project root.
CURRENT_DIRECTORY_NOTEBOOK = None


def intitate_notebook():
    """Load ``.env`` and move the kernel's working directory to the project root.

    The target directory is read from the ``PROJECT_BASE_PATH`` environment
    variable (after ``load_dotenv()`` has run). On the first successful call the
    process changes into that directory and the resulting path is stored in the
    module-level ``CURRENT_DIRECTORY_NOTEBOOK``; later calls only report the
    stored path and leave the working directory alone.

    Raises:
        RuntimeError: if ``PROJECT_BASE_PATH`` is unset or empty, instead of the
            opaque ``TypeError`` that ``os.chdir(None)`` would produce.
    """
    global CURRENT_DIRECTORY_NOTEBOOK
    load_dotenv()

    if CURRENT_DIRECTORY_NOTEBOOK is not None:
        print(
            "Current directory for notebook is already set: ",
            CURRENT_DIRECTORY_NOTEBOOK,
        )
        return

    project_base_path = os.getenv("PROJECT_BASE_PATH")
    if not project_base_path:
        # Fail early with an actionable message rather than inside os.chdir.
        raise RuntimeError(
            "PROJECT_BASE_PATH is not set; define it in the environment or in a "
            ".env file before running this notebook."
        )

    os.chdir(project_base_path)
    CURRENT_DIRECTORY_NOTEBOOK = Path(os.getcwd())
    print("Current directory for notebook: ", CURRENT_DIRECTORY_NOTEBOOK)

# Switch into the project root before anything else in this cell runs.
intitate_notebook()

# NOTE(review): this import sits after the directory switch, presumably so that
# the `src` package resolves from the project root - confirm.
from src.utils.common import clear_gc

clear_gc()

# Location of the raw training parquet files (kept as a plain string).
TRAIN_PARQUET_FOLDER_PATH = "raw_main_dataset/home-credit-credit-risk-model-stability/parquet_files/train"

# Folder holding the cleaned datasets; joined onto CURRENT_DIRECTORY_NOTEBOOK when loading.
CLEANED_DATASET_DIRECTORY_PATH = Path("datasets", "cleaned_data")

In [None]:
from src.utils.common import import_parquet_file, count_unique_values
from src.data_preprocessing import process_dates, fill_missing_values, drop_single_unique_columns, check_contains_negative_value, convert_boolean_to_integer, convert_categorical_to_integers
from src.data_cleaning import get_column_description, get_columns_high_missing_values, row_missing_value_analysis
from pyspark.sql import SparkSession
from src.data_cleaning import column_stats
from pyspark.sql.functions import col, lit
from pyspark.sql.types import DateType
from tqdm import tqdm

## Import data

In [None]:
# Build the dataset location first, then load it; the helper returns both the
# DataFrame and the Spark session it was loaded with.
combined_clean_data_path = (
    CURRENT_DIRECTORY_NOTEBOOK / CLEANED_DATASET_DIRECTORY_PATH / "combined_clean_data"
)
df, spark = import_parquet_file(
    file_path=combined_clean_data_path,
    app_name="Full_Dataset_Base_ML_Model_Training",
)

# train_df = df.filter((df.WEEK_NUM != 9) | (df.WEEK_NUM != 8) | (df.WEEK_NUM != 7))
# test_df = df.filter((df.WEEK_NUM == 9) | (df.WEEK_NUM == 8) | (df.WEEK_NUM == 7))
# train_df.count(), test_df.count()

# Row count of the full dataset (shown as the cell output).
df.count()

In [None]:
# Keep only rows whose WEEK_NUM is 70 or later, then show how many remain.
# NOTE(review): the reason for the cutoff of 70 is not stated in this notebook.
week_num_is_recent = df["WEEK_NUM"] >= 70
df = df.filter(week_num_is_recent)
df.count()

## Process Columns with Dates

In [None]:
# Date-typed columns handed to process_dates; the commented-out entries are
# currently excluded from processing.
date_columns_name = [
    "birth_259D",
    # "dateofcredend_289D_num_group_0",
    # "dateofcredstart_739D_num_group_0",
    # "lastupdate_1112D_num_group_0",
    "creationdate_885D",
    "firstnonzeroinstldate_307D",
    "min_tax_registry_date",
    "max_tax_registry_date",
]

df = process_dates(
    df=df,
    date_columns=date_columns_name,
)



## Drop Columns

In [None]:
# Columns removed before modelling, each with the reason it is dropped.
features_to_drop = [
    "dateofbirth_337D",  # similar to birth_259D, also contains missing values (~9%)
    "date_decision",  # not a feature: the decision date comes after labeling/prediction
    "WEEK_NUM",  # only used for chunking; derived from date_decision, so not a feature
    "base_date",  # temporary column
    "case_id",  # all values are unique
    "education_1103M",  # similar to education_927M
    "education_88M",  # similar to education_927M
    "maritalst_893M",  # similar to maritalst_385M
    "num_group1",  # not required
    "num_group1_num_group_0",  # not required
    "num_group1_num_group_1",  # not required
    "contaddr_matchlist_1032L",  # contains only False or None
]

# Drop the listed columns, then let the helper remove single-unique-value columns.
df = drop_single_unique_columns(df.drop(*features_to_drop))

df.show(5)

In [None]:
# # Temp
# for column_name in df.columns:
#     min_value = df.agg({column_name: 'min'}).collect()[0][0]
#     if type(min_value) is float and min_value < 0:
#         print( f"Column Name: {column_name}\nMin. Value: {min_value}\n\n" )

## Identify Categorical Column

In [None]:
# unique_values_datatype = {}

# for column in tqdm(df.columns):
#     temp_col = df.select(column)
#     col_data_type = temp_col.dtypes[0]
#     if col_data_type[1] not in ["string"]: # ["int", "float", "date", "bigint", "double", "boolean", "string"]:
#         unique_values_datatype[column] = {
#             "unique_values_count": temp_col.distinct().count(),
#             "data_type": col_data_type[1],
#             # "unique_values": [ row[0] for row in temp_col.distinct().collect() if row[0] is not None ]
#         }

In [None]:
# unique_values_datatype_sorted = sorted(unique_values_datatype.items(), key=lambda item: item[1]['unique_values_count'], reverse=False)

# for item in unique_values_datatype_sorted:
#     if len(df.select(item[0]).distinct().collect()):
#         unique_values = [ row[0] for row in df.select(item[0]).distinct().collect() if row[0] is not None ]
#         print("Column Name: ", item[0], "\nNumber of unique values: ", item[1]['unique_values_count'], "\nData type: ", item[1]['data_type'], "\nUnique_values: ", unique_values[:10], "\n", " - "*25, "\n")

In [None]:
# Boolean-valued columns, later passed to convert_boolean_to_integer.
categorical_columns_boolean = [
    "contaddr_smempladdr_334L",
    "safeguarantyflag_411L",
    "isbidproduct_390L",
]

# String-valued categorical columns, later passed to convert_categorical_to_integers.
categorical_columns_string = [
    # Columns without a num_group suffix
    "description_5085714M",
    "maritalst_385M",
    "contaddr_district_15M",
    "contaddr_zipcode_807M",
    "education_927M",
    "empladdr_district_926M",
    "empladdr_zipcode_114M",
    "incometype_1044T",
    "language1_981M",
    "registaddr_district_1083M",
    "registaddr_zipcode_184M",
    "role_1084L",
    "sex_738L",
    "type_25L",
    # Columns suffixed with _num_group_0
    "classificationofcontr_13M_num_group_0",
    "classificationofcontr_400M_num_group_0",
    "contractst_545M_num_group_0",
    "contractst_964M_num_group_0",
    "description_351M_num_group_0",
    "financialinstitution_382M_num_group_0",
    "financialinstitution_591M_num_group_0",
    "purposeofcred_426M_num_group_0",
    "purposeofcred_874M_num_group_0",
    "subjectrole_182M_num_group_0",
    "subjectrole_93M_num_group_0",
    # The same base columns suffixed with _num_group_1
    "classificationofcontr_13M_num_group_1",
    "classificationofcontr_400M_num_group_1",
    "contractst_545M_num_group_1",
    "contractst_964M_num_group_1",
    "description_351M_num_group_1",
    "financialinstitution_382M_num_group_1",
    "financialinstitution_591M_num_group_1",
    "purposeofcred_426M_num_group_1",
    "purposeofcred_874M_num_group_1",
    "subjectrole_182M_num_group_1",
    "subjectrole_93M_num_group_1",
    # Remaining unsuffixed columns
    "cancelreason_3545846M",
    "credtype_587L",
    "district_544M",
    "education_1138M",
    "inittransactioncode_279L",
    "postype_4733339M",
    "profession_152M",
    "rejectreason_755M",
    "rejectreasonclient_4145042M",
    "status_219L",
    "tax_registry_provider",
]

In [None]:
# all_unique_values = []

# for key, value in unique_values_datatype.items():
#     if "_num_group_0" in key or "_num_group_1" in key:
#         column_name = key[:-12]
#     else:
#         column_name = key
#     get_column_description(column_name)
    # print("Column Name: ", column_name, "\nColumn Description: ", get_column_description(column_name)) #value['data_type'])

#     # all_unique_values = all_unique_values + value['unique_values']

In [None]:
# from collections import defaultdict

# def find_elements_in_at_least_two_keys(dictionary: dict) -> set:
#     """
#     Find elements that are present in at least two keys of the dictionary.

#     :param dictionary: Dictionary with list values
#     :return: Set of elements that appear in at least two keys
#     """
#     # Create a dictionary to count the number of keys each element appears in
#     element_counts = defaultdict(int)
    
#     # Iterate through each key-value pair in the dictionary
#     for key, values in dictionary.items():
#         # Use a set to keep track of elements in the current key to avoid counting duplicates
#         seen_elements = set()
#         for element in values['unique_values']:
#             if element not in seen_elements:
#                 element_counts[element] += 1
#                 seen_elements.add(element)
    
#     # Find elements that appear in at least two keys
#     result = {element: count for element, count in element_counts.items() if count >= 2}
#     result = sorted(result.items(), key=lambda item: item[1], reverse=True)
    
#     # Create a new dictionary from the sorted items
#     result = dict(result)
    
#     return result

# result = find_elements_in_at_least_two_keys(unique_values_datatype)
# len(result)
# print(result)

## Categorical Encoder

In [None]:
# Encode the boolean categorical columns through the project helper.
df = convert_boolean_to_integer(df=df, boolean_cols=categorical_columns_boolean)

In [None]:
# Encode the string categorical columns through the project helper.
df = convert_categorical_to_integers(df=df, categorical_cols=categorical_columns_string)

In [None]:
# missing_values_dict = get_columns_high_missing_values(
#     df = df,
#     threshold=None
# )

In [None]:
# set([ df.schema[col_name].dataType for col_name in df.columns ])

## Fill Missing Values

In [None]:
# column_having_negative_values = check_contains_negative_value(df)
# column_having_negative_values

In [None]:
# Per-column missing-value figures from the project helper (no threshold applied there).
missing_values_dict = get_columns_high_missing_values(df=df, threshold=None)

# Drop every column whose reported figure is 10 or more.
# NOTE(review): presumably the figure is a percentage of missing rows - confirm
# against get_columns_high_missing_values.
columns_to_drop = [
    column_name
    for column_name, missing_figure in missing_values_dict.items()
    if missing_figure >= 10
]
df = df.drop(*columns_to_drop)

In [None]:
# Ask the project helper which columns contain negative values and display the
# result as the cell output.
column_having_negative_values = check_contains_negative_value(df)
column_having_negative_values

In [None]:
# Replace missing values with the sentinel -1 via the project helper.
# NOTE(review): the preceding cell reports columns that contain negative values;
# confirm that -1 cannot collide with a genuine value in any of them.
df = fill_missing_values(df, fill_value=-1)

In [None]:
# Preview the processed frame, then report its (row count, column count).
df.show(5)
(df.count(), len(df.columns))

## Training Base Model

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

In [None]:
# spark_rf = SparkSession.builder.appName("RandomForestBaseModel").getOrCreate()
# spark_rf.stop()

In [None]:
# Every column except the label is a feature; list.remove raises ValueError if
# the 'target' column is absent.
target_col = 'target'
feature_cols = list(df.columns)
feature_cols.remove(target_col)
# target_col, feature_cols

In [None]:
# Assemble all feature columns into the single vector column Spark ML expects.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Random forest on the assembled vector. The seed is fixed so that repeated runs
# of the notebook train the same forest and report the same AUC.
rf = RandomForestClassifier(featuresCol="features", labelCol=target_col, seed=10)

# Chain assembling and training so the same steps are applied by fit() and transform().
pipeline = Pipeline(stages=[assembler, rf])

In [None]:
# Split the data into training (70%) and testing (30%) sets.
# randomSplit normalizes weights that do not sum to 1, so the former [0.7, 3]
# produced roughly a 19% / 81% split rather than 70/30. The seed makes the
# split repeatable between runs.
train_data, test_data = df.randomSplit([0.7, 0.3], seed=10)

In [None]:
# missing_values_dict = get_columns_high_missing_values(
#     df = train_data,
#     threshold=None
# )

# Print the first 10 rows of the training split for a quick visual check.
train_data.show(10)

In [None]:
# Train the model: fit the assembler + random forest pipeline on the training split.
# NOTE(review): the name `model` is rebound further down by the XGBoost and
# sklearn cells, so the Spark evaluation cells must run before those.
model = pipeline.fit(train_data)

In [None]:
# Score the same rows the pipeline was fitted on (training-set performance).
train_predictions = model.transform(train_data)

# Area under the ROC curve, requested through a param override at evaluate() time.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol=target_col,
)
train_metric_override = {evaluator.metricName: "areaUnderROC"}
auc_roc_score = evaluator.evaluate(train_predictions, train_metric_override)

print(f"AUC ROC Score: {auc_roc_score}")

In [None]:
# Score the held-out split with the fitted pipeline.
test_predictions = model.transform(test_data)

# Same metric as for the training split: area under the ROC curve.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol=target_col,
)
test_metric_override = {evaluator.metricName: "areaUnderROC"}
auc_roc_score = evaluator.evaluate(test_predictions, test_metric_override)

print(f"AUC ROC Score: {auc_roc_score}")

## XGBoost Model - SparkML

In [None]:
# from xgboost.spark import SparkXGBClassifier

In [None]:
# regressor = SparkXGBClassifier(
#   features_col=feature_cols,
#   label_col=target_col,
#   num_workers=2,
#   device="cuda"
# )

In [None]:
# model = regressor.fit(train_data)

In [None]:
# predict_df = model.transform(test_data)
# predict_df.show()

## XGBoost Model - Sklearn

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from loguru import logger

In [None]:
# Collect both Spark splits into pandas frames for the sklearn-style models,
# then show their shapes.
X_train, X_test = train_data.toPandas(), test_data.toPandas()
(X_train.shape, X_test.shape)

In [None]:
# Separate the label from the features.
# The label is taken as a 1-D Series (single brackets): scikit-learn estimators
# expect a 1-D y and warn when handed an (n, 1) column vector, which is what the
# former X[['target']] selection produced.
y_train = X_train['target']
y_test = X_test['target']
X_train = X_train.drop(columns=['target'])
X_test = X_test.drop(columns=['target'])

In [None]:
# XGBoost classifier with library defaults, fitted on the pandas training split.
logger.info("Training the XGBoost model.")
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

In [None]:
# Hard labels and positive-class probabilities for the held-out rows.
logger.info("Making predictions on the test set.")
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Accuracy uses the hard labels; AUC-ROC uses the positive-class probabilities.
accuracy = accuracy_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_proba)

logger.info(f"Model Accuracy: {accuracy:.4f}")
logger.info(f"AUC-ROC Score: {auc_roc:.4f}")
logger.info("Classification Report:")
report_text = classification_report(y_test, y_pred)
logger.info("\n" + report_text)

In [None]:
# Make predictions
logger.info("Making predictions on the test set.")
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_proba)
logger.info(f"Model Accuracy: {accuracy:.4f}")
logger.info(f"AUC-ROC Score: {auc_roc:.4f}")
logger.info("Classification Report:")
logger.info("\n" + classification_report(y_test, y_pred))

## Random Forest - Sklearn

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model = RandomForestClassifier()
logger.info("Random Forest classifier initialized.")

model.fit(X_train, y_train)
logger.info("Model training completed.")

In [None]:
# Predict labels and positive-class probabilities for the held-out rows.
logger.info("Making predictions on the test set.")
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Accuracy from hard labels, AUC-ROC from positive-class probabilities.
accuracy = accuracy_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_proba)

logger.info(f"Model Accuracy: {accuracy:.4f}")
logger.info(f"AUC-ROC Score: {auc_roc:.4f}")
logger.info("Classification Report:")
forest_report = classification_report(y_test, y_pred)
logger.info("\n" + forest_report)

In [None]:
# Shut down the Spark session returned by import_parquet_file; Spark DataFrames
# created from it cannot be used after this point.
spark.stop()