In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score # to split the data
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score #To evaluate our model
from sklearn import preprocessing
# from sklearn.metrics import accuracy_score

# Algorithmns models to be compared
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

In [0]:
# Run this cell to mount your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
org_df = pd.read_csv('german_credit_data.csv')

In [3]:
org_df.shape

(1000, 11)

In [4]:
df1 = org_df.drop(['Unnamed: 0'], axis=1)

In [5]:
print(df1.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
Age                 1000 non-null int64
Sex                 1000 non-null object
Job                 1000 non-null int64
Housing             1000 non-null object
Saving accounts     817 non-null object
Checking account    606 non-null object
Credit amount       1000 non-null int64
Duration            1000 non-null int64
Purpose             1000 non-null object
Risk                1000 non-null object
dtypes: int64(4), object(6)
memory usage: 78.2+ KB
None


In [6]:
df1.shape

(1000, 10)

In [7]:
df1.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account',
       'Credit amount', 'Duration', 'Purpose', 'Risk'],
      dtype='object')

In [8]:
df1.describe()

Unnamed: 0,Age,Job,Credit amount,Duration
count,1000.0,1000.0,1000.0,1000.0
mean,35.546,1.904,3271.258,20.903
std,11.375469,0.653614,2822.736876,12.058814
min,19.0,0.0,250.0,4.0
25%,27.0,2.0,1365.5,12.0
50%,33.0,2.0,2319.5,18.0
75%,42.0,2.0,3972.25,24.0
max,75.0,3.0,18424.0,72.0


In [9]:
df1.corr()

Unnamed: 0,Age,Job,Credit amount,Duration
Age,1.0,0.015673,0.032716,-0.036136
Job,0.015673,1.0,0.285385,0.21091
Credit amount,0.032716,0.285385,1.0,0.624984
Duration,-0.036136,0.21091,0.624984,1.0


In [10]:
df1.nunique()

Age                  53
Sex                   2
Job                   4
Housing               3
Saving accounts       4
Checking account      3
Credit amount       921
Duration             33
Purpose               8
Risk                  2
dtype: int64

In [11]:
df_good = df1.loc[df1["Risk"] == 'good']['Age'].values.tolist()
df_bad = df1.loc[df1["Risk"] == 'bad']['Age'].values.tolist()
df_age = df1['Age'].values.tolist()

In [12]:
interval = (18, 25, 35, 60, 120)

cats = ['Student', 'Young', 'Adult', 'Senior']
df1["Age_cat"] = pd.cut(df1.Age, interval, labels=cats)


df_good = df1[df1["Risk"] == 'good']
df_bad = df1[df1["Risk"] == 'bad']

In [13]:
def one_hot_encoder(df1, nan_as_category = False):
    original_columns = list(df1.columns)
    categorical_columns = [col for col in df.columns if df1[col].dtype == 'object']
    df1 = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category, drop_first=True)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df1, new_columns

In [14]:
df1['Saving accounts'] = df1['Saving accounts'].fillna('no_inf')
df1['Checking account'] = df1['Checking account'].fillna('no_inf')

#Purpose to Dummies Variable
df1 = df1.merge(pd.get_dummies(df1.Purpose, drop_first=True, prefix='Purpose'), left_index=True, right_index=True)
#Sex feature in dummies
df1 = df1.merge(pd.get_dummies(df1.Sex, drop_first=True, prefix='Sex'), left_index=True, right_index=True)
# Housing get dummies
df1 = df1.merge(pd.get_dummies(df1.Housing, drop_first=True, prefix='Housing'), left_index=True, right_index=True)
# Housing get Saving Accounts
df1 = df1.merge(pd.get_dummies(df1["Saving accounts"], drop_first=True, prefix='Savings'), left_index=True, right_index=True)
# Housing get Risk
df1 = df1.merge(pd.get_dummies(df1.Risk, prefix='Risk'), left_index=True, right_index=True)
# Housing get Checking Account
df1 = df1.merge(pd.get_dummies(df1["Checking account"], drop_first=True, prefix='Check'), left_index=True, right_index=True)
# Housing get Age categorical
df1 = df1.merge(pd.get_dummies(df1["Age_cat"], drop_first=True, prefix='Age_cat'), left_index=True, right_index=True)

In [15]:
#Excluding the missing columns
del df1["Saving accounts"]
del df1["Checking account"]
del df1["Purpose"]
del df1["Sex"]
del df1["Housing"]
del df1["Age_cat"]
del df1["Risk"]
del df1['Risk_good']

In [16]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
Age                            1000 non-null int64
Job                            1000 non-null int64
Credit amount                  1000 non-null int64
Duration                       1000 non-null int64
Purpose_car                    1000 non-null uint8
Purpose_domestic appliances    1000 non-null uint8
Purpose_education              1000 non-null uint8
Purpose_furniture/equipment    1000 non-null uint8
Purpose_radio/TV               1000 non-null uint8
Purpose_repairs                1000 non-null uint8
Purpose_vacation/others        1000 non-null uint8
Sex_male                       1000 non-null uint8
Housing_own                    1000 non-null uint8
Housing_rent                   1000 non-null uint8
Savings_moderate               1000 non-null uint8
Savings_no_inf                 1000 non-null uint8
Savings_quite rich             1000 non-null uint8
Savings_rich               

In [17]:
df1 = df1.sample(frac=1)

In [18]:
x = df1.drop('Risk_bad', 1).values

y = df1['Risk_bad'].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state=42)

In [19]:
# normalized_x = preprocessing.normalize(x)

In [20]:
#Bagging Classifier
bag = BaggingClassifier(n_estimators=2)
fitted_bag = bag.fit(x, y)
y_pred = bag.predict(x)
accuracy_score(y,y_pred)

0.903

In [21]:
prob_scoress = fitted_bag.predict_proba(x)

In [22]:
pred_prob_df = pd.DataFrame(prob_scoress.tolist())

In [23]:
org_df['Probability of predicting 1'] = pred_prob_df[1]

In [24]:
org_df

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,Probability of predicting 1
0,0,67,male,2,own,,little,1169,6,radio/TV,good,0.0
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,1.0
2,2,49,male,1,own,little,,2096,12,education,good,1.0
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good,1.0
4,4,53,male,2,free,little,little,4870,24,car,bad,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,31,female,1,own,little,,1736,12,furniture/equipment,good,0.0
996,996,40,male,3,own,little,little,3857,30,car,good,0.0
997,997,38,male,2,own,little,,804,12,radio/TV,good,0.5
998,998,23,male,2,free,little,little,1845,45,radio/TV,bad,0.0


In [26]:
org_df.random()

AttributeError: 'DataFrame' object has no attribute 'random'

In [0]:
df1['prediction'] = y_pred.tolist()

In [0]:
org_df['predicted'] = y_pred.tolist()

In [0]:
import pickle

with open('/content/drive/My Drive/Analytics Projects | POC/Risk Analytics/trained_model.model','wb') as f:
  pickle.dump(fitted_bag, f)

In [0]:
#Logistics Regression
lr = LogisticRegression(C = 3.0, solver = 'liblinear', multi_class = 'auto')
lr.fit(x,y)
y_predict = lr.predict(x_test)
accuracy_score(y_test,y_predict)

0.75

In [0]:
#Random Forest 
rf_model = RandomForestClassifier(n_estimators= 2)
rf_model.fit(x,y)
test_pred = rf_model.predict(x_test)
accuracy_score(y_test, test_pred)

0.845

In [0]:
#Naive bayes
NB = GaussianNB()
nb_model = NB.fit(x,y)
y_predi = nb_model.predict(x_test)
accuracy_score(y_test,y_predi)

0.73

In [0]:
#Decision Tree
DT = DecisionTreeClassifier()
dt_model = DT.fit(x,y)
test_predict = dt_model.predict(x_test)
accuracy_score(y_test,test_predict)

1.0

In [0]:
#SVC 
clf = SVC(C=7, gamma = 'auto', kernel = 'linear', class_weight='balanced')
svc_trained_model = clf.fit(x,y)
y_predt = clf.predict(x_test)
accuracy_score(y_test, y_predt)