In [None]:
import pandas as pd
## Load the semicolon-separated dataset, then show its size and its column dtypes (sorted)
df = pd.read_csv(r'dataset.csv', sep=';')
print(df.shape)
print(df.dtypes.sort_values())

In [None]:
## Distribution of the dependent variable: number of rows per 'default' value
## (value_counts leaves missing values out of the tally by default)
target = df['default']
target.value_counts()

In [None]:
from sklearn import preprocessing
## Convert the categorical columns to integer codes.
## Each encoder is fitted on the whole dataset (before the train/test
## segregation), so every row is encoded with the same mapping.
columns = ['name_in_email', 'merchant_group', 'merchant_category']
for column in columns:
    le = preprocessing.LabelEncoder()
    # Assign through df[column] rather than df.loc[:, column]: on recent pandas
    # versions the .loc form writes into the existing column and can leave it
    # with its original object dtype, whereas this replaces the column with
    # the encoder's integer output.
    df[column] = le.fit_transform(df[column])

## Segregate train and test data

In [None]:
## Rows with a missing 'default' form the true test set; rows with a label
## form the training data.
## .copy() gives each part its own data, so later in-place edits (such as
## dropping columns) act on an independent frame and do not raise
## SettingWithCopyWarning.
is_unlabelled = df['default'].isnull()
test_df = df.loc[is_unlabelled].copy()
train_df = df.loc[~is_unlabelled].copy()
print(test_df.shape)
print(train_df.shape)

In [None]:
## Remove the uuid column from the training frame and show the resulting shape
train_df = train_df.drop(columns='uuid')
train_df.shape

## Missing value distribution and imputation
### Todo:
#### 1. Use a statistic other than the mean for imputation
#### 2. Use automated, model-based imputation
#### 3. Consider model-specific imputation

In [None]:
## Missing values per column, most affected columns first
missing_count = train_df.isnull().sum().sort_values(ascending=False)
## One row per column: absolute count plus the share of training rows (in percent)
missing_df = missing_count.to_frame(name='Frequency')
n_rows = train_df.shape[0]
missing_df['PrecentageMissing'] = missing_df['Frequency'].div(n_rows).mul(100)
missing_df

In [None]:
## Select the columns whose missing percentage is at least 40
## (the comparison is inclusive, so a column with exactly 40% missing is selected too).
## NOTE(review): this cutoff was previously described as 50 while the code used 40 -
## confirm the intended value.
MISSING_PCT_THRESHOLD = 40
drop_columns = missing_df.loc[missing_df['PrecentageMissing'] >= MISSING_PCT_THRESHOLD].index
drop_columns

In [None]:
## Remove the selected high-missingness columns from the training frame
train_df = train_df.drop(columns=drop_columns)
train_df.shape

In [None]:
### Missing value imputation using the column means of the training data
## The means are computed over the numeric feature columns only:
##   - numeric_only=True keeps .mean() from raising on non-numeric columns
##   - 'default' is left out so the (missing) label of the unlabelled rows
##     is never filled with an average
## NOTE(review): the means are computed before the train/test split of X and y,
## so the held-out rows contribute to the imputation values.
mean = train_df.drop(columns=['default']).mean(numeric_only=True)
train_df = train_df.fillna(mean)
test_df = test_df.fillna(mean)
## Separate the label from the features
y = train_df['default']
X = train_df.drop(columns=['default'])

## Divide dataset into train and test 

In [None]:
from sklearn.model_selection import train_test_split
## Hold out 33% of the labelled rows for evaluation (fixed seed for repeatability).
## stratify=y keeps the proportion of each 'default' class the same in both parts,
## so the minority class is not under-represented in either split by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

## Baseline model 

In [None]:
from sklearn.linear_model import LogisticRegression
## Baseline: logistic regression (up to 500 solver iterations), scored on both splits
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)
baseline_scores = (model.score(X_train, y_train), model.score(X_test, y_test))
print("training accuracy: {}".format(baseline_scores[0]))
print("Test accuracy: {}".format(baseline_scores[1]))

In [None]:
from sklearn.ensemble import RandomForestClassifier
## Baseline random forest with default hyper-parameters.
## random_state is fixed (same seed as the split and the grid searches) so the
## reported accuracies are the same on every run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
print("training accuracy: {}".format(clf.score(X_train, y_train)))
print("Test accuracy: {}".format(clf.score(X_test, y_test)))

## Prediction on True Test Data

In [None]:
## Do the same preprocessing that was done on train data
## 1. Missing value imputation - Done
## 2. categorical to numerical conversion - Done

## Oversampling using SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
## Oversample the training part with SMOTE (seeded); X_test / y_test are left untouched.
## Note that X_train and y_train are rebound to the resampled data from here on.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train, y_train = resampled

In [None]:
from sklearn.linear_model import LogisticRegression
## Logistic regression refitted on the resampled training data; accuracies shown in percent
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)
train_pct = model.score(X_train, y_train) * 100
test_pct = model.score(X_test, y_test) * 100
print("Training accuracy: {:.2f}%".format(train_pct))
print("Test accuracy    : {:.2f}%".format(test_pct))

In [None]:
from sklearn.ensemble import RandomForestClassifier
## Random forest (entropy criterion, 100 bootstrapped trees) fitted on the
## resampled training data.
## random_state is fixed so the fitted forest - and therefore the accuracies
## printed here - can be reproduced exactly.
clf = RandomForestClassifier(
    bootstrap=True,
    criterion='entropy',
    n_estimators=100,
    random_state=42,
)
clf.fit(X_train, y_train)
print("Training accuracy: {:.2f}%".format(clf.score(X_train, y_train) * 100))
print("Test accuracy    : {:.2f}%".format(clf.score(X_test, y_test) * 100))

In [None]:
## Score the unlabelled rows with the fitted classifier and export the result.
uuid = test_df['uuid']
# Build a separate feature frame instead of dropping columns from test_df in
# place: test_df keeps its 'uuid' column, so this cell can be re-run without a
# KeyError, and no SettingWithCopyWarning is raised for mutating a slice of df.
# The removed columns are the same ones removed from the training data.
test_features = test_df.drop(columns=['uuid', 'default'] + drop_columns.tolist())
p_default = clf.predict_proba(test_features)
# Column 1 of predict_proba belongs to clf.classes_[1]
# (presumably the "default" class - verify against clf.classes_).
pred_df = pd.DataFrame({'uuid': uuid, 'pd': p_default[:, 1]})
pred_df.to_csv(r'output.csv', sep=';', index=False)

In [None]:
import joblib
## Persist the current classifier to the file 'model' (relative to the working directory)
model_path = 'model'
joblib.dump(clf, model_path)

## Gridsearch using cross validation
### Find the best parameters for random forest, logistic regression and support vector machine

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [None]:
## Grid search for an RBF-kernel SVM with 3-fold cross-validation.
## A wider grid that is kept for reference only (not run):
#   kernel 'linear' with C in [1, 10, 100, 1000]
#   kernel 'rbf'    with C in [1, 10, 100, 1000] and gamma in [0.001, 0.0001]
## NOTE: this rebinds `clf`, which earlier held the fitted random forest.
## NOTE(review): X_train here is the SMOTE-resampled data, so the cross-validation
## folds contain synthetic rows - treat the CV scores with care.
param_grid = [
    {
        'C': [1, 100, 1000],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf'],
    }
]
model = SVC(random_state=42)
clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=5)
clf.fit(X_train, y_train)

In [None]:
## Grid search for the random forest with 3-fold cross-validation:
## three forest sizes x two split criteria, always with bootstrapping.
param_grid = [
    {
        'n_estimators': [10, 50, 100],
        'criterion': ['gini', 'entropy'],
        'bootstrap': [True],
    }
]
model = RandomForestClassifier(random_state=42)
clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=5)
clf.fit(X_train, y_train)

In [None]:
## Best mean cross-validated score of the latest grid search, and the parameters that reached it
best_score, best_params = clf.best_score_, clf.best_params_
print(best_score)
print(best_params)

In [1]:
from sklearn.pipeline import Pipeline

In [None]:
## Pipeline that standardises the features before handing them to the SVM.
## StandardScaler is imported here because no other cell imports it; without
## this line the cell fails with a NameError.
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])