 # Import Data and Packages

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.metrics import classification_report

# Read the three CSV inputs from ../input: training data, test data and the
# sample entry file.
df = pd.read_csv('../input/cs-training.csv')
test = pd.read_csv('../input/cs-test.csv')
sampleEntry = pd.read_csv('../input/sampleEntry.csv')

# Show the first rows of the test frame as the cell output.
test.head()

# Variable explanation

**SeriousDlqin2yrs**:

**RevolvingUtilizationOfUnsecuredLines**:

**age**:

**NumberOfTime30-59DaysPastDueNotWorse**:	

**DebtRatio**:

**MonthlyIncome**:

**NumberOfOpenCreditLinesAndLoans**:

**NumberOfTimes90DaysLate**:

**NumberRealEstateLoansOrLines**:

**NumberOfTime60-89DaysPastDueNotWorse**:	

**NumberOfDependents**:



# Data Cleaning

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the training frame.
df.describe()

In [None]:
# Rename the unnamed first column to 'Id' and the target column to 'Default'.
df = df.rename(
    columns={
        'Unnamed: 0': 'Id',
        'SeriousDlqin2yrs': 'Default',
    }
)

In [None]:
# Number of missing (null) values in each column of the training frame.
df.isnull().sum()

In [None]:
# Number of missing (null) values in each column of the test frame.
test.isnull().sum()

In [None]:
# Fill missing values using statistics computed from the training frame only.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on a column selection: that chained form is
# deprecated in newer pandas and does not update `df` under Copy-on-Write.
df['MonthlyIncome'] = df['MonthlyIncome'].fillna(df['MonthlyIncome'].mean())
# Use the training-set mode (not the test-set mode) so that no information
# from the test data leaks into the training data.
df['NumberOfDependents'] = df['NumberOfDependents'].fillna(df['NumberOfDependents'].mode()[0])

# check: the two imputed columns should now report zero missing values
df.isnull().sum()

# Data Exploration


In [None]:
# Pairwise correlation matrix of the training columns, drawn as an annotated
# heatmap on a 12x12 inch figure.
# TODO: maybe change color scheme
cor = df.corr()
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    data=cor,
    annot=True,
    xticklabels=cor.columns,
    yticklabels=cor.columns,
    ax=ax,
)

In [None]:
# Bar chart of the number of rows per 'Default' value.
sns.countplot(data=df, x='Default', palette='RdBu_r')
plt.title('Default Outcomes')

# Sum of the 'Default' column expressed as a percentage of the row count.
print(
    "Percentage of People Who Defaulted: {}%".format(
        df["Default"].sum() * 100 / len(df)
    )
)


In [None]:
# Kernel density estimate of age, drawn separately for rows with Default == 0
# and rows with Default == 1. A single .loc[rows, column] lookup is used
# instead of chained indexing.
sns.kdeplot(df.loc[df["Default"] == 0, "age"], label="Not in Default")
sns.kdeplot(df.loc[df["Default"] == 1, "age"], label="In Default")
# Draw the legend explicitly so the two labels are shown regardless of the
# seaborn version (newer releases do not add it automatically).
plt.legend()
plt.xlabel('Age')
plt.title('Distribution of Default Rate by Age')

In [None]:
# Distribution plot of the MonthlyIncome column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the seaborn version in use is confirmed.
sns.distplot(df.MonthlyIncome)

# Feature Selection

In [None]:
import statsmodels.formula.api as sm

def vif_cal(input_data, dependent_col):
    """Print the variance inflation factor (VIF) of every predictor column.

    Each column of ``input_data`` other than ``dependent_col`` is regressed
    (ordinary least squares) on all remaining predictor columns, and
    ``1 / (1 - R^2)`` of that fit, rounded to two decimals, is printed next
    to the column name.

    Parameters
    ----------
    input_data : DataFrame holding the predictors and the dependent column.
    dependent_col : name of the column to leave out of the calculation.

    Returns
    -------
    None. The results are only printed.
    """
    x_vars = input_data.drop([dependent_col], axis=1)
    predictor_names = x_vars.columns
    for current_name in predictor_names:
        # Keep the local names ``y`` and ``x``: the formula string below
        # refers to them by name (they are not columns of ``x_vars``).
        y = x_vars[current_name]
        x = x_vars[predictor_names.drop(current_name)]
        fitted = sm.ols(formula="y~x", data=x_vars).fit()
        rsq = fitted.rsquared
        vif = round(1 / (1 - rsq), 2)
        print(current_name, " VIF = ", vif)
        
# Print the VIF of every column of the training frame except the target
# column "Default".
vif_cal(input_data=df, dependent_col="Default")

We should consider selecting only one of the Days Past Due features as there is significant multicollinearity between the variables.

# Logistic Regression Model

In [None]:
# Feature selection with an L1-penalised logistic regression.
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

# Each candidate predictor is listed exactly once; repeating a name would give
# the model duplicated, perfectly collinear columns.
features=['RevolvingUtilizationOfUnsecuredLines','age','NumberOfTime30-59DaysPastDueNotWorse','DebtRatio','MonthlyIncome','NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents']
dep=['Default']
x=df[features]
y=df[dep]
# splitting data into train and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)


# Standardise the predictors with a scaler fitted on the training split only.
scaler = StandardScaler()
scaler.fit(x_train.fillna(0))
# The solver is named explicitly: 'liblinear' supports the L1 penalty, whereas
# the default solver of newer scikit-learn releases ('lbfgs') rejects it.
sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
# y_train is a one-column DataFrame; flatten it to the 1-D target the
# estimator expects.
sel_.fit(scaler.transform(x_train.fillna(0)), y_train.values.ravel())
# Boolean mask, in the order of `features`, of the predictors that were kept.
sel_.get_support()

In [None]:
# Fit and score a baseline logistic regression model.
from sklearn.feature_selection import SelectFromModel
import sklearn.metrics as metrics

# Each predictor is listed exactly once; the duplicated 'MonthlyIncome' entry
# would otherwise feed the same column to the model twice.
features=['RevolvingUtilizationOfUnsecuredLines','age','NumberOfTime30-59DaysPastDueNotWorse','MonthlyIncome','DebtRatio','NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents']
dep=['Default']
x=df[features]
y=df[dep]
# splitting data into train and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

logmodel = LogisticRegression()
# y_train is a one-column DataFrame; flatten it to the 1-D target the
# estimator expects.
logmodel.fit(x_train, y_train.values.ravel())

# implementing model and scoring on the held-out split
predictions = logmodel.predict(x_test)
print(classification_report(y_test, predictions))
print("Accuracy:",metrics.accuracy_score(y_test, predictions))


In [None]:
import scikitplot as skplt

# Confusion matrix of the held-out labels against the model's predictions.
skplt.metrics.plot_confusion_matrix(
    y_true=y_test,
    y_pred=predictions,
    figsize=(6, 6),
)

In [None]:
# ROC curve and AUC for a logistic regression on a reduced feature set
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from matplotlib import pyplot

# Predictors and target taken from the training frame, then a 70/30 split.
features=['RevolvingUtilizationOfUnsecuredLines','age','NumberOfTime30-59DaysPastDueNotWorse','DebtRatio','MonthlyIncome']
dep=['Default']
x=df[features]
y=df[dep]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

# Fit the model on the training split.
logmodel = LogisticRegression()
logmodel.fit(x_train, y_train)

# Baseline scores: a constant 0 for every held-out row.
ns_probs = [0] * len(y_test)
# Model scores: keep only the probability column of the positive outcome.
lr_probs = logmodel.predict_proba(x_test)[:, 1]

# AUC of the baseline and of the model, printed to three decimals.
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
print('Random Classifier: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))

# Points of both ROC curves.
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

# Draw both curves on one set of axes, label them, and show the figure.
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='Random Classifier')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

# Trying a different model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours classifier fitted on the current training split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X=x_train, y=y_train)

In [None]:
# Class probabilities for the training rows; only the second probability
# column (index 1) is kept.
y_pred2 = knn.predict_proba(x_train)[:, 1]


In [None]:
from sklearn.metrics import auc,roc_curve
# Score the KNN model on the held-out split. When predicting on the training
# rows each row is one of its own nearest neighbours, so a training-set ROC
# curve is inflated and cannot be compared with the logistic regression's
# held-out AUC.
knn_test_probs = knn.predict_proba(x_test)[:, 1]
fpr,tpr,_ = roc_curve(y_test, knn_test_probs)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(10,8))
plt.title('Receiver Operating Characteristic')
# plt.plot draws the ROC points exactly as computed. sns.lineplot would
# average the y values that share the same x (vertical ROC segments), and it
# no longer accepts the data vectors positionally in newer seaborn.
plt.plot(fpr, tpr, label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Apply and test our Logistic Regression model

In [None]:
# Reload the test set from the same CSV path used at the top of the notebook.
test=pd.read_csv('../input/cs-test.csv')

In [None]:
# Number of missing (null) values in each column of the test frame.
print(test.isnull().sum())

In [None]:
# Fill missing MonthlyIncome values in the test frame with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on a column selection: that chained form is
# deprecated in newer pandas and does not update `test` under Copy-on-Write.
test['MonthlyIncome'] = test['MonthlyIncome'].fillna(test['MonthlyIncome'].mean())

In [None]:
# Select the predictor columns named in `features` from the test frame.
# NOTE(review): this uses whichever `features` list was assigned most recently;
# it must match the columns `logmodel` was fitted on -- confirm cell order.
xtest=test[features]

In [None]:
# Preview the first rows of the selected test predictors.
xtest.head()

In [None]:
# Class probabilities predicted by the fitted logistic regression for the test
# rows (one column per class).
ytest=logmodel.predict_proba(xtest)

In [None]:
# Print the predicted class-probability array.
print(ytest)

In [None]:
# predict_proba returns one column per class (probability of class 0, then of
# class 1), not (Id, Probability). Pair a row identifier with the
# positive-class column only.
# NOTE(review): assumes the test CSV carries its row id in the unnamed first
# column, as the training CSV does; otherwise ids 1..n are generated -- confirm.
testing = pd.DataFrame({
    'Id': test['Unnamed: 0'].values if 'Unnamed: 0' in test.columns else np.arange(1, len(test) + 1),
    'Probability': ytest[:, 1],
})

In [None]:
# Preview the first rows of the prediction frame.
testing.head()

In [None]:
# predict_proba returns one column per class (probability of class 0, then of
# class 1), not (Id, Probability). Pair a row identifier with the
# positive-class column only.
# NOTE(review): assumes the test CSV carries its row id in the unnamed first
# column, as the training CSV does; otherwise ids 1..n are generated -- confirm.
dataf = pd.DataFrame({
    'Id': test['Unnamed: 0'].values if 'Unnamed: 0' in test.columns else np.arange(1, len(test) + 1),
    'Probability': ytest[:, 1],
})

dataf.head()

In [None]:
# Write the prediction frame (`dataf`) to disk, without the index column.
# The previous code exported `df`, i.e. the training data, instead of the
# predictions. to_csv returns None when given a path; the `export_csv` name is
# kept only so the cell still defines it.
export_csv = dataf.to_csv('export_dataframe.csv', index=False, header=True)

# Different feature selection techniques with Logistic Regression

In [None]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
# Data: predictors and target taken from the training frame
X = df[['RevolvingUtilizationOfUnsecuredLines','age','NumberOfTime30-59DaysPastDueNotWorse','DebtRatio','MonthlyIncome','NumberOfOpenCreditLinesAndLoans','NumberOfTimes90DaysLate','NumberRealEstateLoansOrLines','NumberOfTime60-89DaysPastDueNotWorse']]
y = df['Default']

# Train/test split (70/30)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Data standardization: fit the scaler on the training split only and reuse
# the same fitted scaler for the test split. Calling fit_transform on the test
# split would re-estimate the mean and variance from test data, putting the
# two splits on different scales.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Create SMOTE instance
smote = SMOTE(sampling_strategy="auto", random_state=10)

# Oversample the training split only. fit_resample replaces the deprecated
# fit_sample, which newer imbalanced-learn releases no longer provide.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_std, y_train)

In [None]:
# Create logistic regression instance.
# The solver is named explicitly: the grid below includes the 'l1' penalty,
# which 'liblinear' supports but the default solver of newer scikit-learn
# releases ('lbfgs') rejects.
lr = LogisticRegression(solver='liblinear')

from sklearn.model_selection import GridSearchCV
# Grid search over the regularisation strength C and the penalty type,
# scored by recall with 10-fold cross-validation on the resampled training data.
param_range = [0.001, 0.01, 0.1, 1.0, 10, 100]
penalty = ['l1', 'l2']
param_grid = [{"C":param_range, "penalty":penalty}]

gs = GridSearchCV(estimator=lr, param_grid=param_grid, scoring="recall", cv=10, n_jobs=-1)
gs = gs.fit(X_train_resampled, y_train_resampled)

# Best mean cross-validated recall and the parameters that achieved it.
print(gs.best_score_.round(3))
print(gs.best_params_)

In [None]:
# Take the best estimator found by the grid search and report its score on
# the standardised held-out split.
clf_lr = gs.best_estimator_
print('Test accuracy: %.3f' % clf_lr.score(X=X_test_std, y=y_test))

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Hard predictions for the held-out split and for the (non-resampled)
# standardised training split.
y_pred = clf_lr.predict(X_test_std)
y_pred_train = clf_lr.predict(X_train_std)

# Validation of model: confusion matrix on the held-out split first.
print("confusion_matrix = \n", confusion_matrix(y_true=y_test, y_pred=y_pred))

# Each metric is printed twice: the training value (marked with '*') followed
# by the held-out value.
_metric_rows = [
    ("*accuracy_train = %.3f", "accuracy = %.3f", accuracy_score),
    ("*precision_train = %.3f", "precision = %.3f", precision_score),
    ("*recall_train = %.3f", "recall = %.3f", recall_score),
    ("*f1_score_train = %.3f", "f1_score = %.3f", f1_score),
]
for _train_fmt, _test_fmt, _metric_fn in _metric_rows:
    print(_train_fmt % _metric_fn(y_true=y_train, y_pred=y_pred_train))
    print(_test_fmt % _metric_fn(y_true=y_test, y_pred=y_pred))

In [None]:
# ROC curve and AUC of the tuned model on the held-out split.
# Only the second probability column (index 1) is used as the score.
y_score = clf_lr.predict_proba(X_test_std)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)

# Visualization: the model's curve plus the diagonal of a random classifier
# and the corner path of an ideal one.
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label="roc curve (area = %.3f)" % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], label='random', linestyle='--')
plt.plot([0, 0, 1], [0, 1, 1], label="ideal", linestyle='--')
plt.legend()
plt.xlabel("false positive rate")
plt.ylabel("true positive rate")

In [None]:
import scikitplot as skplt

# Confusion matrix of the held-out labels against the tuned model's predictions.
skplt.metrics.plot_confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    figsize=(6, 6),
)

In [None]:
# Per-class precision/recall/F1 report, then the overall accuracy, for the
# tuned model on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

The grid search technique proved to be inferior in this case.