In [1]:
def load_csv(filename):
    import pandas as pd
    df = pd.read_csv(filename,sep=",")
    df.rename(columns=lambda x: x.replace(" ", "_"),inplace=True)
    return df

In [2]:
class LogisticRegression:   
    def __init__(self, Predictors, Response, learning_rate=0.001, iterations=10000):
        import numpy as np
        """
        Initialize Parameters
        """
        # Predictors are the input variables from the data
        self.Predictors = np.concatenate((np.ones((np.shape(Predictors)[0],1),dtype=int),Predictors),axis=1)
        
        # Response is the output variable from the data
        self.Response = Response          
        self.lr = learning_rate
        self.iters = iterations
        
        # There is a weight associated with the intercept and a weight for each feature
        self.m,self.n = self.Predictors.shape
        self.weights = np.ones(self.n,dtype=int)
        self.weights[0] = 0
        
    def sigm(self,Z):
        """
        Sigmoid Function 
        sigm(Z) = exp(Z) / (1 + exp(Z)) = 1 / (1 + exp(-Z) 
        """
        import numpy as np
        return 1 / (1 + np.exp(-Z))
    
    def mu(self,X):
        """
        mu(X) = E[y | x] = p(y=1 | x) = sigm(w.T * X)
        """
        import numpy as np
        return self.sigm(np.dot(X,self.weights.T))
    
    def classification(self):
        import numpy as np
        """
        Steepest Descent
        Weights = Weights - Learning_Rate * Gradient
        """
        diff = np.inf
        iterations = self.iters
        
        while diff > 0.01 and iterations > 0:  
            p_hat = self.mu(self.Predictors)
            gradient = np.dot(self.Predictors.T,(p_hat - self.Response))
            self.weights = self.weights - np.dot(self.lr,gradient)
                        
            diff = np.abs(gradient).sum()
            iterations -= 1
        return self.weights
    
    def decision_rule(self,Y):
        """
        y_hat(x) = 1 if and only if p(y=1 | x) >= 0.5
        """
        for i in range(len(Y)):
            if Y[i] >= 0.5:
                Y[i] = 1
            else:
                Y[i] = 0
        return Y
            
    def prediction(self, Predictors = []):
        import numpy as np
        if not Predictors: Predictors = self.Predictors
        y_pred = self.mu(Predictors)
        y_pred = self.decision_rule(y_pred)
        return y_pred

In [3]:
data = load_csv('Breast_Cancer_Wisconsin.csv')
data.columns

Index(['ID_Number', 'Clump_Thickness', 'Unif_Cell_Size', 'Unif_Cell_Shape',
       'Marg_Adhesion', 'Single_Epith_Cell_Size', 'Bare_Nuclei',
       'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses', 'Class'],
      dtype='object')

In [4]:
for i in data.columns:
    print(data[i].value_counts())

1182404    6
1276091    5
1198641    3
897471     2
1116192    2
          ..
1232225    1
1236043    1
1241232    1
1241559    1
814265     1
Name: ID_Number, Length: 645, dtype: int64
1     145
5     130
3     108
4      80
10     69
2      50
8      46
6      34
7      23
9      14
Name: Clump_Thickness, dtype: int64
1     384
10     67
3      52
2      45
4      40
5      30
8      29
6      27
7      19
9       6
Name: Unif_Cell_Size, dtype: int64
1     353
2      59
10     58
3      56
4      44
5      34
6      30
7      30
8      28
9       7
Name: Unif_Cell_Shape, dtype: int64
1     407
3      58
2      58
10     55
4      33
8      25
5      23
6      22
7      13
9       5
Name: Marg_Adhesion, dtype: int64
2     386
3      72
4      48
1      47
6      41
5      39
10     31
8      21
7      12
9       2
Name: Single_Epith_Cell_Size, dtype: int64
1     402
10    132
2      30
5      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: Bare_Nuclei, d

In [5]:
import numpy as np
data.replace({'?': np.nan}, inplace = True)
data.dropna()
print(data['Bare_Nuclei'].value_counts())

1     402
10    132
2      30
5      30
3      28
8      21
4      19
9       9
7       8
6       4
Name: Bare_Nuclei, dtype: int64


In [6]:
import pandas as pd
drop_list = ['ID_Number','Bare_Nuclei','Class']
Predictors = data.drop(drop_list,axis=1)
Response = data['Class'].replace({2:0,4:1})
print(Response.value_counts())

0    458
1    241
Name: Class, dtype: int64


In [7]:
x = [[1,5,1,1,1,2,3,1,1],[1,7,3,2,10,5,5,4,4],[1,10,8,4,3,2,3,1,2]]
regressor = LogisticRegression(Predictors, Response)
regressor.classification()
pred = regressor.prediction(x)
print("Weights: ", regressor.weights)
print(); print()
print("Predictions")
print(pred)

Weights:  [-9.93790637  0.57702178 -0.01124845  0.5676396   0.31346293  0.13030858
  0.57943805  0.12321472  0.60675861]


Predictions
[0. 1. 1.]
