In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
app_train = pd.read_csv("C:/Users/SMRUTI DESHPANDE/house credit default/application_train.csv")
app_test = pd.read_csv("C:/Users/SMRUTI DESHPANDE/house credit default/application_test.csv")
credit_card_balance = pd.read_csv("C:/Users/SMRUTI DESHPANDE/house credit default/credit_card_balance.csv")
bureau = pd.read_csv("C:/Users/SMRUTI DESHPANDE/house credit default/bureau.csv")
previous = pd.read_csv("C:/Users/SMRUTI DESHPANDE/house credit default/previous_application.csv")
installments = pd.read_csv("C:/Users/SMRUTI DESHPANDE/house credit default/installments_payments.csv")

In [2]:
pip install dask[dataframe]

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [4]:
import dask.dataframe as dd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib
import pandas as pd
import os

# Function to reduce memory usage
def reduce_memory_usage(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:  # Exclude string columns
            if pd.api.types.is_integer_dtype(col_type):
                df[col] = pd.to_numeric(df[col], downcast='integer')
            elif pd.api.types.is_float_dtype(col_type):
                df[col] = pd.to_numeric(df[col], downcast='float')
    return df

# Reduce memory usage
app_train = reduce_memory_usage(app_train)
app_test = reduce_memory_usage(app_test)
credit_card_balance = reduce_memory_usage(credit_card_balance)

# Merge data
columns_to_merge = ['SK_ID_CURR', 'AMT_BALANCE', 'SK_DPD']
credit_card_balance_selected = credit_card_balance[columns_to_merge]

app_train_dd = dd.from_pandas(app_train, npartitions=10)
credit_card_balance_dd = dd.from_pandas(credit_card_balance_selected, npartitions=10)
app_test_dd = dd.from_pandas(app_test[['SK_ID_CURR']], npartitions=10)

merged_data_dd = dd.merge(app_train_dd, credit_card_balance_dd, on='SK_ID_CURR', how='left')
merged_data_dd = dd.merge(merged_data_dd, app_test_dd, on='SK_ID_CURR', how='left')
merged_data = merged_data_dd.compute()

# Handle missing values
imputer = SimpleImputer(strategy='mean')
merged_data['AMT_BALANCE'] = imputer.fit_transform(merged_data[['AMT_BALANCE']])
merged_data['SK_DPD'] = SimpleImputer(strategy='median').fit_transform(merged_data[['SK_DPD']])

# Map binary values to numeric
binary_map = {'Y': 1, 'N': 0, 'M': 0, 'F': 1}
merged_data['FLAG_OWN_CAR'] = merged_data['FLAG_OWN_CAR'].map(binary_map)
merged_data['CODE_GENDER'] = merged_data['CODE_GENDER'].map(binary_map)

# Define feature columns
input_parameters = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_BALANCE', 'AMT_ANNUITY', 'SK_DPD', 'CNT_CHILDREN',
    'FLAG_OWN_CAR', 'CODE_GENDER', 'DAYS_CREDIT', 'DAYS_DECISION', 'AMT_PAYMENT', 
    'AMT_INSTALMENT', 'AMT_APPLICATION', 'NAME_FAMILY_STATUS', 'NAME_INCOME_TYPE', 
    'NAME_HOUSING_TYPE', 'NAME_CONTRACT_TYPE'
]

# Filter columns available in merged_data
available_columns = [col for col in input_parameters if col in merged_data.columns]

# Print available columns for debugging
print("Available columns:", available_columns)

# Define numerical and categorical features
numerical_features = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_BALANCE', 'AMT_ANNUITY', 'SK_DPD', 
    'CNT_CHILDREN'
]
categorical_features = ['NAME_FAMILY_STATUS', 'NAME_INCOME_TYPE', 'NAME_HOUSING_TYPE', 'NAME_CONTRACT_TYPE']

# Split data into features and target
training_data = merged_data[available_columns + ['TARGET']]
X = training_data[available_columns]
y = training_data['TARGET']

# Define the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Create a full pipeline that includes preprocessing and the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the pipeline
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

# Save the pipeline
save_dir = 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\'
os.makedirs(save_dir, exist_ok=True)
joblib.dump(pipeline, os.path.join(save_dir, 'credit_model_pipeline.pkl'))

print("Pipeline saved successfully.")


Available columns: ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_BALANCE', 'AMT_ANNUITY', 'SK_DPD', 'CNT_CHILDREN', 'FLAG_OWN_CAR', 'CODE_GENDER', 'NAME_FAMILY_STATUS', 'NAME_INCOME_TYPE', 'NAME_HOUSING_TYPE', 'NAME_CONTRACT_TYPE']
Pipeline saved successfully.
