In [1]:
pip install dask[dataframe]

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import os
from dotenv import load_dotenv
import gdown
import requests
import dask.dataframe as dd


# Load environment variables from the .env file
load_dotenv()

# Function to construct Google Drive direct download link
def get_google_drive_url(file_id):
    return f"https://drive.google.com/uc?id={file_id}"

# Get file IDs from the .env file
file_ids = {
    "application_train": os.getenv("APPLICATION_TRAIN_ID"),
    "application_test": os.getenv("APPLICATION_TEST_ID"),
    "bureau": os.getenv("BUREAU_ID"),
    "bureau_balance": os.getenv("BUREAU_BALANCE_ID"),
    "credit_card_balance": os.getenv("CREDIT_CARD_BALANCE_ID"),
    "installments_payments": os.getenv("INSTALLMENTS_PAYMENTS_ID"),
    "previous_application": os.getenv("PREVIOUS_APPLICATION_ID"),
    "POS_CASH_balance": os.getenv("POS_CASH_BALANCE_ID"),
}

# Construct direct download links
google_drive_links = {key: get_google_drive_url(value) for key, value in file_ids.items()}

# Function to download a file using gdown, only if not already downloaded
def download_csv(file_url, output_path):
    if not os.path.exists(output_path):  # Check if file already exists
        print(f"Downloading {output_path}...")
        try:
            gdown.download(file_url, output_path, quiet=False)
            print(f"Downloaded {output_path}")
        except requests.exceptions.RequestException as e:
            print(f"Error downloading {file_url}: {e}")
    else:
        print(f"{output_path} already exists.")

# Define the output file paths
output_paths = {
    "application_train": "application_train.csv",
    "application_test": "application_test.csv",
    "bureau": "bureau.csv",
    "bureau_balance": "bureau_balance.csv",
    "credit_card_balance": "credit_card_balance.csv",
    "installments_payments": "installments_payments.csv",
    "previous_application": "previous_application.csv",
    "POS_CASH_balance": "POS_CASH_balance.csv"
}

# Download the datasets
for key, file_url in google_drive_links.items():
    download_csv(file_url, output_paths[key])

# Load datasets from local files
try:
    app_train = dd.read_csv(output_paths["application_train"], on_bad_lines='skip')
    app_test = dd.read_csv(output_paths["application_test"], on_bad_lines='skip')
    bureau = dd.read_csv(output_paths["bureau"], on_bad_lines='skip')
    bureau_balance = dd.read_csv(output_paths["bureau_balance"], on_bad_lines='skip')
    credit_card_balance = dd.read_csv(output_paths["credit_card_balance"], on_bad_lines='skip')
    installments_payments = dd.read_csv(output_paths["installments_payments"], on_bad_lines='skip')
    previous_application = dd.read_csv(output_paths["previous_application"], on_bad_lines='skip')
    POS_CASH_balance = dd.read_csv(output_paths["POS_CASH_balance"], on_bad_lines='skip')
except Exception as e:
    print(f"Error loading CSV files: {e}")

# Example: Print the first few rows of the application_train dataset
print(app_train.head())


application_train.csv already exists.
application_test.csv already exists.
bureau.csv already exists.
bureau_balance.csv already exists.
credit_card_balance.csv already exists.
installments_payments.csv already exists.
previous_application.csv already exists.
POS_CASH_balance.csv already exists.
   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
0      100002       1         Cash loans           M            N   
1      100003       0         Cash loans           F            N   
2      100004       0    Revolving loans           M            Y   
3      100006       0         Cash loans           F            N   
4      100007       0         Cash loans           M            N   

  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0               Y             0          202500.0    406597.5      24700.5   
1               N             0          270000.0   1293502.5      35698.5   
2               Y             0           67500.0    1

In [3]:
import dask.dataframe as dd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib
import pandas as pd
import os

# Function to reduce memory usage
def reduce_memory_usage(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:  # Exclude string columns
            if pd.api.types.is_integer_dtype(col_type):
                df[col] = pd.to_numeric(df[col], downcast='integer')
            elif pd.api.types.is_float_dtype(col_type):
                df[col] = pd.to_numeric(df[col], downcast='float')
    return df

# Convert Dask DataFrames to Pandas DataFrames
app_train = app_train.compute()
app_test = app_test.compute()
credit_card_balance = credit_card_balance.compute()

# Reduce memory usage
app_train = reduce_memory_usage(app_train)
app_test = reduce_memory_usage(app_test)
credit_card_balance = reduce_memory_usage(credit_card_balance)

# Merge data
columns_to_merge = ['SK_ID_CURR', 'AMT_BALANCE', 'SK_DPD']
credit_card_balance_selected = credit_card_balance[columns_to_merge]

app_train_dd = dd.from_pandas(app_train, npartitions=10)
credit_card_balance_dd = dd.from_pandas(credit_card_balance_selected, npartitions=10)
app_test_dd = dd.from_pandas(app_test[['SK_ID_CURR']], npartitions=10)

merged_data_dd = dd.merge(app_train_dd, credit_card_balance_dd, on='SK_ID_CURR', how='left')
merged_data_dd = dd.merge(merged_data_dd, app_test_dd, on='SK_ID_CURR', how='left')
merged_data = merged_data_dd.compute()  # Convert the final Dask DataFrame to a Pandas DataFrame

# Handle missing values
imputer = SimpleImputer(strategy='mean')
merged_data['AMT_BALANCE'] = imputer.fit_transform(merged_data[['AMT_BALANCE']])
merged_data['SK_DPD'] = SimpleImputer(strategy='median').fit_transform(merged_data[['SK_DPD']])

# Map binary values to numeric
binary_map = {'Y': 1, 'N': 0, 'M': 0, 'F': 1}
merged_data['FLAG_OWN_CAR'] = merged_data['FLAG_OWN_CAR'].map(binary_map)
merged_data['CODE_GENDER'] = merged_data['CODE_GENDER'].map(binary_map)

# Define feature columns
input_parameters = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_BALANCE', 'AMT_ANNUITY', 'SK_DPD', 'CNT_CHILDREN',
    'FLAG_OWN_CAR', 'CODE_GENDER', 'DAYS_CREDIT', 'DAYS_DECISION', 'AMT_PAYMENT', 
    'AMT_INSTALMENT', 'AMT_APPLICATION', 'NAME_FAMILY_STATUS', 'NAME_INCOME_TYPE', 
    'NAME_HOUSING_TYPE', 'NAME_CONTRACT_TYPE'
]

# Filter columns available in merged_data
available_columns = [col for col in input_parameters if col in merged_data.columns]

# Print available columns for debugging
print("Available columns:", available_columns)

# Define numerical and categorical features
numerical_features = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_BALANCE', 'AMT_ANNUITY', 'SK_DPD', 
    'CNT_CHILDREN'
]
categorical_features = ['NAME_FAMILY_STATUS', 'NAME_INCOME_TYPE', 'NAME_HOUSING_TYPE', 'NAME_CONTRACT_TYPE']

# Split data into features and target
training_data = merged_data[available_columns + ['TARGET']]
X = training_data[available_columns]
y = training_data['TARGET']

# Define the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Create a full pipeline that includes preprocessing and the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(silent=True, random_state=42))
])

# Train the pipeline
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

# Save the pipeline
save_dir = 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\'
os.makedirs(save_dir, exist_ok=True)
joblib.dump(pipeline, os.path.join(save_dir, 'credit_model_pipeline.pkl'))

print("Pipeline saved successfully.")


Available columns: ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_BALANCE', 'AMT_ANNUITY', 'SK_DPD', 'CNT_CHILDREN', 'FLAG_OWN_CAR', 'CODE_GENDER', 'NAME_FAMILY_STATUS', 'NAME_INCOME_TYPE', 'NAME_HOUSING_TYPE', 'NAME_CONTRACT_TYPE']
Pipeline saved successfully.
