In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import sklearn.model_selection as ms

In [2]:
df = pd.read_csv("haberman.data", header=None)

In [3]:
df

Unnamed: 0,0,1,2,3
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1
...,...,...,...,...
301,75,62,1,1
302,76,67,0,1
303,77,65,3,1
304,78,65,1,2


In [4]:
# checking for missing data
df.isnull().sum()

0    0
1    0
2    0
3    0
dtype: int64

In [5]:
# separating Y
df_y = df[3]
df_y-=1
# 0 if survived
# 1 if not survived

In [6]:
# dropping Y column
df.drop([3], axis=1, inplace=True)

In [7]:
# split into train and test sets
train_size=0.8
X_train, X_test, y_train, y_test = ms.train_test_split(df.to_numpy(), df_y.to_numpy(), train_size=train_size)

In [8]:
# fit normalizer on training set and use it to transform test set
normalizer = StandardScaler()  
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.transform(X_test)

In [9]:
import sklearn.linear_model as lm

In [10]:
log_reg = lm.LogisticRegression()
log_reg.fit(X_train_norm, y_train)
print("Coefficients:", log_reg.coef_)
print("Intercept:", log_reg.intercept_)

Coefficients: [[ 0.3934274  -0.09069496  0.59927071]]
Intercept: [-1.06273887]


In [11]:
y_pred = log_reg.predict(X_test_norm)
print(y_pred)
# 0 if survived
# 1 if not survived

[0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]


In [12]:
def classification_error(y_pred, y_actual):
    error = y_pred!=y_actual
    return np.sum(error)/len(error)

In [13]:
print("Classification Error:", classification_error(y_pred, y_test))

Classification Error: 0.24193548387096775


In [14]:
# combining into one function
def train_log_reg(df, df_y, train_size):
    X_train, X_test, y_train, y_test = ms.train_test_split(df.to_numpy(), df_y.to_numpy(), train_size=train_size)
    normalizer = StandardScaler()  
    X_train_norm = normalizer.fit_transform(X_train)
    X_test_norm = normalizer.transform(X_test)
    log_reg = lm.LogisticRegression()
    log_reg.fit(X_train_norm, y_train)
    y_pred = log_reg.predict(X_test_norm)
    print("Classification Error:", classification_error(y_pred, y_test))

In [15]:
train_log_reg(df, df_y, train_size=0.8)

Classification Error: 0.20967741935483872


In [16]:
train_sizes=[0.6, 0.7, 0.8, 0.9]
for ts in train_sizes:
    print(ts, ":")
    train_log_reg(df, df_y, train_size=ts)
    print()

0.6 :
Classification Error: 0.2682926829268293

0.7 :
Classification Error: 0.2608695652173913

0.8 :
Classification Error: 0.22580645161290322

0.9 :
Classification Error: 0.25806451612903225



In [17]:
# 0.8 gives best performance