In [1]:
import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split


# Logistic Regression from Scratch

In [2]:
class LogisticRegression:
    def __init__(self, learning_rate=0.001, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # init parameters
        self.weights = np.zeros(n_features)
        self.bias = 0

        # gradient descent
        for _ in range(self.n_iters):
            # approximate y with linear combination of weights and x, plus bias
            linear_model = np.dot(X, self.weights) + self.bias
            # apply sigmoid function
            y_predicted = self._sigmoid(linear_model)

            # compute gradients
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / n_samples) * np.sum(y_predicted - y)
            # update parameters
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self._sigmoid(linear_model)
        y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
        return np.array(y_predicted_cls)
    
    def predict_probas(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self._sigmoid(linear_model)
        return np.array(y_predicted)
    
    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))



In [3]:
file_path = r"C:\Users\gprak\Downloads\BITS Courses\CS F464 Machine Learning I\data\data4.xlsx"
df = pd.read_excel(file_path,header=None)

In [4]:
# !pip install openpyxl
print(df.shape)

(150, 8)


In [5]:
df.head(7)

Unnamed: 0,0,1,2,3,4,5,6,7
0,5.1,3.5,1.4,0.2,0.04,44.275,3455.15251,1
1,4.9,3.0,1.4,0.2,0.04,28.4,2829.15249,1
2,4.7,3.2,1.3,0.2,0.04,34.068,2297.95007,1
3,4.6,3.1,1.5,0.2,0.04,31.291,2064.22976,1
4,5.0,3.6,1.4,0.2,0.04,48.056,3130.0,1
5,5.4,3.9,1.7,0.4,0.16,61.019,4597.25024,1
6,4.6,3.4,1.4,0.3,0.09,40.704,2064.42976,1


In [6]:
list(df.columns)

[0, 1, 2, 3, 4, 5, 6, 7]

In [7]:
y=df[7].to_numpy()
X = df[[0,1,2,3,4,5,6]].to_numpy()



In [8]:
X.shape
y.shape

(150,)

# Holdout Cross Validation using 60-40 split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=1234
    )

In [10]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(90, 7)
(60, 7)
(90,)
(60,)


In [11]:
train_classes = np.unique(y_train)
test_classes = np.unique(y_test)
print(train_classes)
print(test_classes)

[1 2 3]
[1 2 3]


In [12]:
n_classes = len(train_classes)

In [13]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

## One vs all implementation

In [14]:
test_predictions = []

for i,class_val in enumerate(train_classes):
    class_1 = class_val
   
    
#     print(y_train_tmp)
    rest_classes=[]
    for j,class_val_rest in enumerate(train_classes):
        if class_1!=class_val_rest:
            rest_classes.append(class_val_rest)
            y_train_tmp = np.where(y_train == class_1, 1, 0)
            y_test_tmp = np.where(y_test == class_1, 1, 0)
    print(f"{class_1} vs {rest_classes}")
            
    regressor = LogisticRegression(learning_rate=0.001, n_iters=10000)
    regressor.fit(X_train, y_train_tmp)
    predictions = regressor.predict(X_test)
    print("LR one vs rest classification accuracy:", accuracy(y_test_tmp, predictions))
    y_test_probas = regressor.predict_probas(X_test)
    y_test_overall_tmp = [class_1 if y_test_probas[i]>=0.5 else y_test[i] for i in range(len(y_test))]
    test_predictions.append(y_test_overall_tmp)
    print("*"*100)
            

1 vs [2, 3]
LR one vs rest classification accuracy: 1.0
****************************************************************************************************
2 vs [1, 3]




LR one vs rest classification accuracy: 0.6166666666666667
****************************************************************************************************
3 vs [1, 2]
LR one vs rest classification accuracy: 0.7
****************************************************************************************************


In [15]:
postcompute_df=pd.DataFrame(test_predictions)
overall_test_predictions = postcompute_df.max().to_numpy()
print("LR one vs rest overall classification accuracy:", accuracy(overall_test_predictions, y_test))
    

LR one vs rest overall classification accuracy: 1.0


# One vs rest implementation

In [16]:
test_predictions = []
data = X_train
df_train = pd.DataFrame(data=data[0:,0:],
                        index=[i for i in range(data.shape[0])],
                        columns=['X'+str(i) for i in range(data.shape[1])])
df_train["class"]=y_train

data = X_test
df_test = pd.DataFrame(data=data[0:,0:],
                        index=[i for i in range(data.shape[0])],
                        columns=['X'+str(i) for i in range(data.shape[1])])
df_test['class']=y_test

            
            
for i,class_val in enumerate(train_classes):
    class_1 = class_val
    for j,class_val_other in enumerate(train_classes):
        if class_1!=class_val_other:
            
            df_train_tmp = df_train[(df_train["class"]==class_1) | (df_train["class"]==class_val_other)].copy()
#             print(df_train_tmp.head())
            df_train_tmp["class"] = np.where((df_train_tmp["class"]==class_1),1,0)
#             print(df_train_tmp.head())
            X_train_tmp = df_train_tmp[['X'+str(i) for i in range(data.shape[1])]].to_numpy()
            y_train_tmp = df_train_tmp["class"].to_numpy()
             
            df_test_tmp = df_test[(df_test["class"]==class_1) | (df_test["class"]==class_val_other)].copy()
            df_test_tmp["class"] = np.where((df_test_tmp["class"]==class_1),1,0)
            X_test_tmp = df_test_tmp[['X'+str(i) for i in range(data.shape[1])]].to_numpy()
            y_test_tmp = df_test_tmp["class"].to_numpy()
           
            
           
            regressor = LogisticRegression(learning_rate=0.1, n_iters=10000)
            regressor.fit(X_train_tmp, y_train_tmp)
            predictions = regressor.predict(X_test_tmp)
            test_prediction = regressor.predict_probas(X_test)
            test_prediction_class = [class_1 if test_prediction[i]>=0.5 else class_val_other for i in range(len(y_test))]
            print(f"For {class_1} vs {class_val_other}")
            print("LR one vs one classification accuracy:", accuracy(y_test_tmp, predictions))
            print(test_prediction_class)
            test_predictions.append(test_prediction_class)
            print("*"*100)
            


For 1 vs 2
LR one vs one classification accuracy: 1.0
[2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1]
****************************************************************************************************




For 1 vs 3
LR one vs one classification accuracy: 1.0
[3, 3, 3, 1, 3, 1, 1, 1, 3, 3, 3, 1, 3, 3, 1, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 1, 3, 3, 1, 1, 1, 1, 3, 1, 3, 1, 3, 3, 1, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 1]
****************************************************************************************************
For 2 vs 1
LR one vs one classification accuracy: 1.0
[2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1]
****************************************************************************************************
For 2 vs 3
LR one vs one classification accuracy: 0.5609756097560976
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
******************************************************************************

In [17]:
postcompute_df=pd.DataFrame(test_predictions)
    

In [18]:
postcompute_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,2,2,2,1,2,1,1,1,2,2,...,1,2,2,2,2,2,2,2,2,1
1,3,3,3,1,3,1,1,1,3,3,...,1,3,3,3,3,3,3,3,3,1
2,2,2,2,1,2,1,1,1,2,2,...,1,2,2,2,2,2,2,2,2,1
3,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
4,3,3,3,1,3,1,1,1,3,3,...,1,3,3,3,3,3,3,3,3,1
5,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


In [19]:
overall_test_predictions = postcompute_df.mode().to_numpy()

In [20]:
print("Overall LR one vs one classification accuracy:", accuracy(y_test, overall_test_predictions))

Overall LR one vs one classification accuracy: 0.7
