# Default Data

In [1]:
import pandas as pd

In [3]:
# reading the data 
df_default = pd.read_csv('Default-1.csv')
df_default.head(2)

Unnamed: 0,default,student,balance,income
0,No,0,729.526495,44361.62507
1,No,1,817.180407,12106.1347


In [4]:
# group 1 of customers: default is Yes
df_default[df_default['default']=='Yes'].describe().round(2)

Unnamed: 0,student,balance,income
count,333.0,333.0,333.0
mean,0.38,1747.82,32089.15
std,0.49,341.27,13804.22
min,0.0,652.4,9663.79
25%,0.0,1511.61,19027.51
50%,0.0,1789.09,31515.34
75%,1.0,1988.87,43067.33
max,1.0,2654.32,66466.46


In [5]:
# group 2 of customers: default is No
df_default[df_default['default']=='No'].describe().round(2)

Unnamed: 0,student,balance,income
count,9667.0,9667.0,9667.0
mean,0.29,803.94,33566.17
std,0.45,456.48,13318.25
min,0.0,0.0,771.97
25%,0.0,465.71,21405.06
50%,0.0,802.86,34589.49
75%,1.0,1128.25,43823.76
max,1.0,2391.01,73554.23


In [7]:
# create features and target sets -- features, we only use 'balance'
x_default, y_default = df_default[['balance']], df_default['default']
display(x_default.head(3))
display(y_default.head(3))

Unnamed: 0,balance
0,729.526495
1,817.180407
2,1073.549164


0    No
1    No
2    No
Name: default, dtype: object

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [9]:
# Splitting to train and test
x_train, x_test, y_train, y_test = train_test_split(x_default, y_default, test_size=.2, random_state=0)

In [10]:
# Define the model
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)

LogisticRegression()

In [11]:
# Evaluate performance of the model
print('log reg acc on train data: {:.2%}'.format(log_reg.score(x_train, y_train)))
print('log reg acc on test data: {:.2%}'.format(log_reg.score(x_test, y_test)))

log reg acc on train data: 97.36%
log reg acc on test data: 96.95%


In [14]:
# Make predictions
p1 = [600]
p2 = [1700]
p3 = [2800]
log_reg.predict([p1, p2, p3])



array(['No', 'No', 'Yes'], dtype=object)

In [16]:
# change the scientific disply to float
import numpy as np
np.set_printoptions(suppress = True)

In [18]:
# looking at the probabilities of the target variable
log_reg.predict_proba([p1, p2, p3]).round(4)



array([[0.9993, 0.0007],
       [0.7923, 0.2077],
       [0.0096, 0.9904]])

In [19]:
# from this method (classes), we know the first col above contains the probablility of default being 'No', 
# and second the probablility of deafult being 'Yes'
log_reg.classes_

array(['No', 'Yes'], dtype=object)

In [21]:
# coefficients
log_reg.coef_.round(4)

array([[0.0054]])

In [22]:
# intercept
log_reg.intercept_.round(4)

array([-10.579])

<span class="mark">log-odds(default) = -10.579 + 0.0054 balance</span>

# Diabetes Data

In [25]:
# Reading the data
diabetes = pd.read_csv('diabetes-1.csv')
diabetes.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [None]:
# create features and target sets
# split to train and test
# define a Logistic Regression Model (max_iter = 1000)
# evaluate the performance
# make a prediction for this person (p1 = [3, 152, 72, 35, 100, 31.7, .7, 65])

In [29]:
# create features and target sets
x_diabetes, y_diabetes = diabetes.iloc[:, :-1], diabetes['Outcome']
display(x_diabetes.head(3))
display(y_diabetes.head(3))

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32


0    1
1    0
2    1
Name: Outcome, dtype: int64

In [42]:
# Splitting to train and test (default is 25% test)
x_train1, x_test1, y_train1, y_test1 = train_test_split(x_diabetes, y_diabetes, random_state=0)

In [43]:
# Define the model and fit it
log_reg1 = LogisticRegression(max_iter = 1000)
log_reg1.fit(x_train1, y_train1)

LogisticRegression(max_iter=1000)

In [44]:
# Evaluate performance of the model
print('log reg acc on train data: {:.2%}'.format(log_reg1.score(x_train1, y_train1)))
print('log reg acc on test data: {:.2%}'.format(log_reg1.score(x_test1, y_test1)))

log reg acc on train data: 76.04%
log reg acc on test data: 79.69%


In [53]:
# Make predictions
p4 = [3, 152, 72, 35, 100, 31.7, .7, 65]
p5 = [1, 180, 69, 31, 0, 28.1, .432, 50]
p6 = [0, 130, 60, 24, 0, 23.1, .310, 29]

log_reg1.predict([p4, p5, p6])



array([1, 1, 0])

In [54]:
# change the scientific disply to float
import numpy as np
np.set_printoptions(suppress = True)

In [59]:
# looking at the probabilities of the target variable
log_reg1.predict_proba([p4, p5, p6]).round(4)



array([[0.2659, 0.7341],
       [0.2658, 0.7342],
       [0.8419, 0.1581]])

In [60]:
# from this method (classes), we know the first col above contains the probablility of default being 'No', 
# and second the probablility of deafult being 'Yes'
log_reg1.classes_

array([0, 1])

In [61]:
# coefficients
log_reg1.coef_.round(4)

array([[ 0.0862,  0.0331, -0.0112,  0.0061, -0.001 ,  0.088 ,  0.8002,
         0.0222]])

In [62]:
# intercept
log_reg1.intercept_.round(4)

array([-8.3788])

# Contrilling the complexity of a logistic regression, using parameter 'C'

In [63]:
# Higher 'C' means training as good as possible, which means more complex model 
#  ----> higher chance of overfitting

# Lower 'C' means more regularization, less complex (simpler) model, 
#   ----> Lower chance of overfitting
#      -----> If we keep decreasing 'C', model will underfit

In [65]:
# reading breast cancer data
cancer = pd.read_csv('breast_cancer_data-1.csv', index_col = 0)
cancer.head(3)

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [66]:
# Create features and target sets
x_cancer, y_cancer = cancer.iloc[:,1:], cancer['diagnosis']

#check my work
display(x_cancer.head(3))
display(y_cancer.head(3))

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


id
842302      M
842517      M
84300903    M
Name: diagnosis, dtype: object

## Transform the features

In [67]:
from sklearn.preprocessing import MinMaxScaler

In [68]:
# Defining the scaler and fitting it
scaler1 = MinMaxScaler()
x_cancer_trns = scaler1.fit_transform(x_cancer)
x_cancer_trns

array([[0.52103744, 0.0226581 , 0.54598853, ..., 0.91202749, 0.59846245,
        0.41886396],
       [0.64314449, 0.27257355, 0.61578329, ..., 0.63917526, 0.23358959,
        0.22287813],
       [0.60149557, 0.3902604 , 0.59574321, ..., 0.83505155, 0.40370589,
        0.21343303],
       ...,
       [0.45525108, 0.62123774, 0.44578813, ..., 0.48728522, 0.12872068,
        0.1519087 ],
       [0.64456434, 0.66351031, 0.66553797, ..., 0.91065292, 0.49714173,
        0.45231536],
       [0.03686876, 0.50152181, 0.02853984, ..., 0.        , 0.25744136,
        0.10068215]])

## Validation Curve Functions

In [70]:
from sklearn.model_selection import validation_curve

In [71]:
C_range = [.1, .5, 1, 10, 100, 200]

In [72]:
# validation_curve has two outputs:
## 1- scores on train 2- scores on tests
### the order is important
train_scores, test_scores = validation_curve(LogisticRegression(solver='liblinear'), x_cancer, y_cancer, 
                                             param_name = 'C', param_range = C_range, cv = 4)

In [73]:
train_scores.round(4)

array([[0.9531, 0.9532, 0.9438, 0.9578],
       [0.9577, 0.9555, 0.9508, 0.9672],
       [0.9554, 0.9578, 0.9555, 0.9649],
       [0.9695, 0.9719, 0.9625, 0.9696],
       [0.9742, 0.9742, 0.9672, 0.9789],
       [0.9765, 0.9766, 0.9672, 0.9766]])

In [79]:
# axis = 1 calculates mean for each row
print('ave cross val scores on train:', train_scores.mean(axis = 1).round(4))

ave cross val scores on train: [0.952  0.9578 0.9584 0.9684 0.9736 0.9742]


In [80]:
test_scores.round(4)

array([[0.9021, 0.9648, 0.9577, 0.9085],
       [0.9301, 0.9366, 0.9718, 0.9296],
       [0.9301, 0.9366, 0.9718, 0.9366],
       [0.9371, 0.9507, 0.9718, 0.9437],
       [0.9371, 0.9437, 0.9718, 0.9507],
       [0.9301, 0.9437, 0.9718, 0.9507]])

In [81]:
# C=10 has the highest test score
print('ave cross val scores on test:', test_scores.mean(axis = 1).round(4))

ave cross val scores on test: [0.9333 0.942  0.9438 0.9508 0.9508 0.9491]


In [82]:
C_range

[0.1, 0.5, 1, 10, 100, 200]