In [202]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import math
from scipy import linalg
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [203]:
data = pd.read_csv("breast-cancer-wisconsin.data", sep = ",", header = None)
data.columns = ["code number", "Clump Thickness ", "Uniform Cell Size", "Uniform Cell Shape ", "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses", "Class" ]
data.head()

Unnamed: 0,code number,Clump Thickness,Uniform Cell Size,Uniform Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [204]:
data.isnull().sum()
data["Bare Nuclei"].loc[data["Bare Nuclei"] == '?'] = np.nan
data.dropna(inplace=True)
data.drop(["code number"],axis = 1,inplace = True)
data["Bare Nuclei"] = data.astype({"Bare Nuclei": int})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [205]:
data[["Class"]] = data[["Class"]].replace(2, 0)
data[["Class"]] = data[["Class"]].replace(4, 1)

data[["Class"]].head()

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0


In [206]:
data.head()

Unnamed: 0,Clump Thickness,Uniform Cell Size,Uniform Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,5,3,1,1,0
1,5,4,4,5,7,5,3,2,1,0
2,3,1,1,1,2,3,3,1,1,0
3,6,8,8,1,3,6,3,7,1,0
4,4,1,1,3,2,4,3,1,1,0


**Using ScikitLearn** 

In [207]:
number = 10
total = 0 
for i in range(number):
    X=data.drop(['Class','Bare Nuclei'],axis=1)
    Y=data['Class'].values
    X_train, X_test, Y_train, Y_test = train_test_split(data.drop('Class',axis=1), data['Class'], test_size=0.20,random_state=0)
    classifier = LogisticRegression(max_iter = 200, solver = 'newton-cg')
    classifier.fit(X_train,Y_train)
    total =total+ classifier.score(X_test,Y_test)
ScikitAccuracy=total/number 

In [208]:
print("sklearn accuracy",ScikitAccuracy)

sklearn accuracy 0.9416058394160582


**Using Newtons Method**

In [209]:
class NewtonLogisticRegression:
    def __init__(self, X):
        self.beta_old_i = []
        self.beta_new_i = np.zeros(X.shape[1] + 1,dtype=int)
        
        
    def newtonRaphson(self, firstDer, secondDer):
        self.beta_new_i = self.beta_old_i - np.dot(linalg.inv(secondDer), firstDer)    
    
   
    def probabilityFun(self, X):
        z = np.dot(self.beta_new_i, X.T)
        p = math.e**z/(1 + math.e**z)
        return p

    
    def firstDerivative(self, X, Y, P):
        firstDer = np.dot((Y-P), X)
        return firstDer

    
    def secondDerivative(self, X, P):
        probMul = P*(1-P)
        xMulp = np.array([x*y for (x,y) in zip(X, probMul)])
        secondDer = -1*np.dot(xMulp.T,X)
        return secondDer+0.00001*np.random.rand(10, 10)

    
    
    
    #training the model
    def fit(self, X, Y, maxIteration=50, diffThreshHold=10**-5):
        #adding one additional column since we will have additional coefficient
        X = np.c_[X, np.array([1]*X.shape[0])]
        iteration = 0
        diffBetaList = []

        while(list(self.beta_new_i) != list(self.beta_old_i)):
            self.beta_old_i = self.beta_new_i
            P = self.probabilityFun(X)
            firstDer = self.firstDerivative(X, Y, P)
            secondDer = self.secondDerivative(X, P)
            self.newtonRaphson(firstDer, secondDer)
            #difference between last calcuated coefficients and current coefficients
            diff = linalg.norm(self.beta_new_i - self.beta_old_i)
            diffBetaList.append(diff)
            iteration += 1
            if(diff <= diffThreshHold or iteration > maxIteration):
                break
        
        return diffBetaList

    #predict probability any new data points
    def predict(self, X):
        X = np.c_[X, np.array([1]*X.shape[0])]
        probability = self.probabilityFun(X)
        return probability

    #classify based on provided classes
    def classification(self, X, dataClass):
        Y = self.predict(X)
        return [0 if item <= 0.05 else 1 for item in Y]
    
    

In [210]:
Acc=[]
for i in range(10):   
    X_train, X_test, y_train, y_test = train_test_split(data.drop('Class',axis=1), data['Class'],test_size=0.2,random_state = i)
    reg = NewtonLogisticRegression(X_train)
    x = reg.fit(X_train,y_train)
    pred = reg.classification(X_test, ["Benign","Malignant"])
    Acc.append(100*np.mean(pred == y_test))
print(" Classification accuracy: {:.2f}%".format(np.mean(Acc)))

 Classification accuracy: 94.09%
