In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from pathlib import Path
import gc

In [None]:
# Switch for constrained environments: when True only a small sample is read.
RESOURCE_LIMITED = False

# Row cap handed to pd.read_csv (as `nrows`) when the CSV files are loaded.
if RESOURCE_LIMITED:
    no_rows = 10000
else:
    no_rows = 1_000_000

# <div style='background :#90EE90' width='100%'> Reading Train Data </div>

In [None]:
# Directory holding the competition CSV files.
input_path = Path('/kaggle/input/amex-default-prediction/')
# Kept as a plain string, exactly like the os.path.join result it replaces.
train_data_path = os.fspath(input_path / 'train_data.csv')

# Read at most `no_rows` rows of the training file.
train_data = pd.read_csv(train_data_path, nrows=no_rows)
train_data.head()

In [None]:
# Keep only the first 50 columns of the training frame; the same column set is
# reused later as `usecols` when the test file is read.
selected_columns = train_data.columns[:50]
# selected_columns = train_data[['P_2', 'D_48', 'B_2', 'D_61', 'B_18', 'D_55', 'B_9', 'D_44', 'B_33', 'customer_ID']].columns
train_data = train_data.loc[:, selected_columns]
train_data

# <div style='background :#90EE90' width='100%'> Reading Train Labels</div>

In [None]:
train_labels_path = os.path.join(input_path,'train_labels.csv')
# The labels are deliberately NOT capped with `nrows=no_rows`: that cap is a
# row limit for the feature file, and the labels are later inner-joined on
# customer_ID, so a truncated label file would silently drop every customer
# whose label row lies beyond the cap.
# NOTE(review): assumes the label file is small enough to load in full — confirm.
train_labels = pd.read_csv(train_labels_path)
train_labels.head()

# <div style='background :#90EE90' width='100%'> Reading Test Data </div>

In [None]:
# Same string path as os.path.join would give, built with pathlib instead.
test_data_path = os.fspath(input_path / 'test_data.csv')
# TODO: add nrows=no_rows
# Only the columns kept for training are loaded from the test file.
test_data = pd.read_csv(
    test_data_path,
    usecols=selected_columns,
)
test_data.head()

# <div style='background :#90EE90' width='100%'> Encoding Categorical Data </div>

In [None]:
def encode_cat_data(df):
    """One-hot encode the object-typed columns of ``df``.

    The ``S_2`` column is discarded, every remaining ``object`` column is
    replaced by its ``pd.get_dummies`` expansion, and ``customer_ID`` is kept
    untouched as the first column of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``customer_ID`` and ``S_2``. Not modified.

    Returns
    -------
    pandas.DataFrame
        Columns in the order: ``customer_ID``, the non-object columns, then
        the dummy columns (if any).

    Raises
    ------
    KeyError
        If ``customer_ID`` or ``S_2`` is missing from ``df``.
    """
    customer_ids = df['customer_ID']
    # drop() returns a new frame, so the caller's data is left as it was.
    features = df.drop(columns=['customer_ID', 'S_2'])

    object_columns = features.select_dtypes('object').columns
    pieces = [customer_ids, features.drop(columns=object_columns)]

    # Only encode when there is something to encode. This replaces a bare
    # `except:` that hid *every* failure of get_dummies (not just the
    # "nothing to encode" case) and silently dropped the categorical features.
    if len(object_columns) > 0:
        pieces.append(pd.get_dummies(features[object_columns]))

    return pd.concat(pieces, axis=1)

In [None]:
# Encode the (column-restricted) training frame.
encoded_data = encode_cat_data(df=train_data)
# encoded_data.drop('D_64_-1',axis=1,inplace=True)
encoded_data.head()

gc.collect()

# <div style='background :#90EE90' width='100%'> Null Value Handling </div>

In [None]:
# Largest share of missing values a column may have and still be used.
threshold = 0.5
# NOTE: despite its name, `clmns_to_drop` holds the columns that are KEPT,
# i.e. those whose fraction of nulls is strictly below `threshold`.
clmns_to_drop = encoded_data.columns[encoded_data.isna().mean() < threshold]

def handle_na(df,clmns_to_drop):
    """Restrict ``df`` to the given columns and median-fill numeric gaps.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    clmns_to_drop : list-like of column labels
        The columns to *keep* (the name is historical). Every label must be
        present in ``df``.

    Returns
    -------
    pandas.DataFrame
        A new frame with only the requested columns, where missing values in
        numeric columns are replaced by that column's median computed on
        ``df`` itself. Non-numeric columns are left as they are.
    """
    kept = df[clmns_to_drop]
    # Medians exist only for numeric columns, so fillna touches only those.
    medians = kept.select_dtypes(include='number').median()
    return kept.fillna(medians)
    

In [None]:
# Keep the low-null columns and fill the remaining numeric gaps with medians.
null_handled_data = handle_na(df=encoded_data, clmns_to_drop=clmns_to_drop)
null_handled_data

gc.collect()

# <div style='background :#90EE90' width='100%'> Group By Operation on Train Data </div>

In [None]:
def groupby_data(df):
    """Average every numeric column per ``customer_ID``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``customer_ID`` column. Not modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``customer_ID`` (sorted by it), with
        ``customer_ID`` as the first column followed by the per-group mean of
        each numeric column.
    """
    # The grouping key must not be part of the aggregated selection: asking
    # pandas for the mean of the string `customer_ID` column raises TypeError
    # on current pandas versions. `numeric_only=True` states explicitly that
    # only numeric columns are averaged.
    return df.groupby('customer_ID', as_index=False).mean(numeric_only=True)

In [None]:
# Average the null-handled rows per customer_ID.
gby_data = groupby_data(df=null_handled_data)

gc.collect()

# <div style='background :#90EE90' width='100%'> Merge Train Data and Labels </div>

In [None]:
def merge_data_labels(df,train_labels):
    """Attach labels to ``df`` by inner-joining on ``customer_ID``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame with a ``customer_ID`` column. Not modified.
    train_labels : pandas.DataFrame
        Label frame with a ``customer_ID`` column.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the customers present in both inputs, with the
        columns of ``df`` followed by the remaining columns of
        ``train_labels``.
    """
    joined = pd.merge(df, train_labels, how='inner', on='customer_ID')
    return joined

In [None]:
# Join the per-customer features with their labels.
merged_data = merge_data_labels(df=gby_data, train_labels=train_labels)
merged_data

gc.collect()

# <div style='background :#90EE90' width='100%'> Train-Test Split Model Evaluation </div>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Feature matrix: everything except the identifier; the label is split off into y.
X = merged_data.drop(columns='customer_ID')
y = X.pop('target')

# Hold out 20% of the rows for evaluation (fixed seed => same split every run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest too: an unseeded RandomForestClassifier gives a different
# model, and therefore a different accuracy, on every Restart & Run All.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}".format(accuracy))

# <div style='background :#90EE90' width='100%'> Preprocessing Test Data </div>

In [None]:
def preprocess_data(df):
    """Run the full feature pipeline on ``df`` and return the result.

    The steps, in order: ``encode_cat_data``, ``handle_na`` (restricted to the
    module-level ``clmns_to_drop`` column set) and ``groupby_data``. The input
    frame is copied first and is not modified.
    """
    return (
        df.copy()
        .pipe(encode_cat_data)
        .pipe(handle_na, clmns_to_drop)
        .pipe(groupby_data)
    )
    
    

In [None]:
# Apply the same encode -> null handling -> per-customer mean steps to the test set.
processed_test = preprocess_data(df=test_data)
processed_test

gc.collect()

# <div style='background :#90EE90' width='100%'> Prediction - Random Forest Classifier</div>

In [None]:
def rfc_model(train_data,test_data,random_state=42):
    """Fit a random forest on ``train_data`` and predict for ``test_data``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain ``customer_ID`` and ``target``; all other columns are
        used as features. Not modified.
    test_data : pandas.DataFrame
        Must contain ``customer_ID``; all other columns are used as features.
        Not modified.
    random_state : int, optional
        Seed passed to ``RandomForestClassifier`` so that repeated runs give
        the same predictions (the forest was previously unseeded).

    Returns
    -------
    The output of ``RandomForestClassifier.predict`` on the test features
    (predicted class labels).
    """
    X_train = train_data.drop(columns='customer_ID')
    X_test = test_data.drop(columns='customer_ID')
    y_train = X_train.pop('target')

    rf_classifier = RandomForestClassifier(random_state=random_state)
    rf_classifier.fit(X_train, y_train)

    return rf_classifier.predict(X_test)
    

# <div style='background :#90EE90' width='100%'> Prediction - Support Vector Classifier</div>

In [None]:
from sklearn.svm import SVC # "Support vector classifier"  

def svc_model(train_data, test_data):
    """Fit a linear-kernel SVC on ``train_data`` and predict for ``test_data``.

    ``train_data`` must contain ``customer_ID`` and ``target``; ``test_data``
    must contain ``customer_ID``. Neither input frame is modified. Returns
    the output of ``SVC.predict`` on the test features.
    """
    features = train_data.drop(columns='customer_ID')
    to_score = test_data.drop(columns='customer_ID')
    labels = features.pop('target')

    svc_classifier = SVC(kernel='linear', random_state=0)
    svc_classifier.fit(features, labels)

    predictions = svc_classifier.predict(to_score)
    return predictions

# <div style='background :#90EE90' width='100%'> Prediction - XGBoost</div>

In [None]:
import xgboost
def xgb_model(train_data,test_data):
    """Fit an XGBoost classifier on ``train_data`` and predict for ``test_data``.

    ``train_data`` must contain ``customer_ID`` and ``target``; ``test_data``
    must contain ``customer_ID``. Neither input frame is modified. Returns
    the output of ``XGBClassifier.predict`` on the test features.
    """
    features = train_data.drop(columns='customer_ID')
    to_score = test_data.drop(columns='customer_ID')
    labels = features.pop('target')

    booster_params = dict(
        n_estimators=200,
        max_depth=3,
        learning_rate=0.15,
        subsample=0.5,
    )
    model = xgboost.XGBClassifier(**booster_params)
    # The labels are handed over as a flat array.
    flat_labels = np.ravel(labels, order='C')
    model.fit(features, flat_labels)
    return model.predict(to_score)

# <div style='background :#90EE90' width='100%'> Create Submission File </div>

In [None]:
# y_pred_rfc = rfc_model(merged_data,processed_test)
# NOTE: the variable keeps its historical name but holds the XGBoost predictions.
y_pred_rfc = xgb_model(merged_data, processed_test)

# One row per test customer: its ID and the predicted value.
# NOTE(review): these are the raw outputs of `predict` — confirm that the
# submission format expects these rather than probabilities.
prediction_df = processed_test[['customer_ID']].assign(prediction=y_pred_rfc.tolist())
prediction_df.to_csv('submission.csv', index=False)
