<a href="https://colab.research.google.com/github/shekharkoirala/machinelearning_algorithms_analysis/blob/master/bestfitline/bestfitline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Logistic Regression analysis**

**Using sklearn**

**step 1: Download the diabetes data from source**

In [0]:
# download the data
!wget https://raw.githubusercontent.com/susanli2016/Machine-Learning-with-Python/master/diabetes.csv

In [0]:
# import libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn import datasets
%matplotlib inline

# Load the downloaded CSV into a DataFrame and preview its first three rows
df = pd.read_csv("diabetes.csv")
df.head(3)

In [0]:
from sklearn.model_selection import train_test_split

# Every column except 'Outcome' is a predictor; 'Outcome' is the target.
features = df.loc[:, df.columns != 'Outcome']
target = df['Outcome']

# Stratify on the target and pin the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, stratify=target, random_state=66)


In [0]:
# Fit a logistic-regression classifier with scikit-learn's default settings
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Report mean accuracy on the training split and on the held-out split
train_score = logreg.score(X_train, y_train)
print("Training set score: {:.3f}".format(train_score))
test_score = logreg.score(X_test, y_test)
print("Test set score: {:.3f}".format(test_score))

In [0]:
# Predict a class label for every row of the held-out test features;
# `predict` is reused by the 3D scatter plot further down.
predict = logreg.predict(X_test)
predict

![alt text](https://scikit-learn.org/stable/_images/sphx_glr_plot_logistic_001.png)

**We would like to plot a graph like the one above, but our data is not in that format.**

**Our data has multiple features, and when any single feature is compared with the outcome, the classes are not as distinctly separated as in the plot above.**


In [0]:
# Scatter a single feature ('Insulin') against the binary outcome on the training split
plt.scatter(X_train['Insulin'], y_train, edgecolor='k')
plt.xlabel('Insulin')
plt.ylabel('Outcome')
plt.title('Insulin vs. Outcome (training set)')
plt.show()

**So let's try PCA to reduce the dimensionality of the dataset and see whether the features become more distinctive.**

In [0]:
from sklearn.decomposition import PCA

# Project the test features onto two principal components.
# Note: the PCA is fitted on X_test itself and is used for visualisation only.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X_test)
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['data1', 'data2'])

# Scatter the second component against the true test labels
plt.scatter(principalDf['data2'], y_test, edgecolor='k')
plt.xlabel('Principal component 2 (data2)')
plt.ylabel('Outcome')
plt.title('Second principal component vs. Outcome (test set)')
plt.show()

**So the traditional best-fit-line approach is not suitable for logistic regression; rather than plotting a line, we can use a scatter plot.
Red points are the actual test labels, and blue points are the predicted test labels.**

In [0]:
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
fig = plt.figure(figsize=(25, 18), dpi=80)
n = 100
ax = fig.add_subplot(111, projection='3d')
xs = principalDf['data1']
ys = principalDf['data2']
zs = y_test
ax.scatter(xs, ys, zs, c="r", alpha = 0.7, marker= "o")
xs = principalDf['data1']
ys = principalDf['data2']
zs = predict
ax.scatter(xs, ys, zs, c="b", alpha = 0.7, marker = '^')
plt.show()

# **Logistic regression in Tensorflow**

**Vanilla model**

In [0]:
import tensorflow as tf
tf.reset_default_graph()
from tensorflow.examples.tutorials.mnist import input_data
import numpy as np
import time


# Hyperparameters

learning_rate = 0.001
batch_size = 128
n_epochs = 30
n_train = 60000
n_test = 10000

# Step 1: read data (labels are loaded one-hot encoded)
mnist_folder = 'data/mnist'
mnist = input_data.read_data_sets(mnist_folder, one_hot=True)


# Step 2: create placeholders; the batch dimension is fixed to batch_size,
# so every fed batch must contain exactly batch_size examples
X = tf.placeholder(tf.float32, [batch_size, 784], name="image")
Y = tf.placeholder(tf.int32, [batch_size, 10], name="label")


# Step 3: initialize weight and bias
# weight: random normal, mean 0, std-dev 0.1
# bias: zeros
# image (batch_size, 784) x weight (784, 10) + bias (1, 10) -> (batch_size, 10)
w = tf.get_variable("weight", shape=(784, 10),
                    initializer=tf.random_normal_initializer(mean=0,
                                                             stddev=0.1))
b = tf.get_variable("bias", shape=(1, 10), initializer=
                    tf.zeros_initializer())


# Step 4: model that returns the logits
logits = tf.matmul(X, w) + b

# Step 5: softmax cross-entropy loss, averaged over the batch
entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y,
                                                  name="loss")
loss = tf.reduce_mean(entropy)


# Step 6: optimizer
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# Step 7: number of correct predictions in a batch (a count, not a ratio)
preds = tf.nn.softmax(logits)
correct_pred = tf.equal(tf.argmax(preds, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_sum(tf.cast(correct_pred, tf.float32))

start = time.time()

writer = tf.summary.FileWriter('./graphs/logreg_placeholder',
                               tf.get_default_graph())

# Merge each group explicitly. tf.summary.merge_all() would collect every
# summary defined so far, so the accuracy op would also re-log the loss
# summaries under the test-batch step numbers.
with tf.name_scope("summaries_loss"):
    loss_summaries = [tf.summary.scalar("loss", loss),
                      tf.summary.histogram("loss histogram", loss)]
    summary_op1 = tf.summary.merge(loss_summaries)

with tf.name_scope("summaries_accuracy"):
    accuracy_summaries = [tf.summary.scalar("accuracy", accuracy),
                          tf.summary.histogram("accuracy histogram", accuracy)]
    summary_op2 = tf.summary.merge(accuracy_summaries)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Training: only whole batches are used, because the placeholders have a
    # fixed batch dimension.
    n_batches = mnist.train.num_examples // batch_size
    for i in range(n_epochs):
        total_loss = 0
        for j in range(n_batches):
            X_batch, Y_batch = mnist.train.next_batch(batch_size)
            _, loss_batch, summary = sess.run([optimizer, loss, summary_op1],
                                      feed_dict={X: X_batch, Y: Y_batch})
            total_loss += loss_batch
        # Log the summary of the last batch of the epoch
        writer.add_summary(summary, global_step=i)
        print('Average loss epoch {0}: {1}'.format(i, total_loss/n_batches))
    print('Total time: {0} seconds'.format(time.time() -start))

    # Evaluation: again whole batches only, so the examples left over after
    # the last full batch are never scored.
    total_correct_preds = 0
    n_batches = mnist.test.num_examples // batch_size
    n_evaluated = n_batches * batch_size
    for i in range(n_batches):
        X_batch, Y_batch = mnist.test.next_batch(batch_size)
        accuracy_batch, summary = sess.run([accuracy, summary_op2], feed_dict={
            X: X_batch, Y: Y_batch})
        total_correct_preds += accuracy_batch
        writer.add_summary(summary, global_step=i)

    # Divide by the number of examples actually evaluated; dividing by
    # mnist.test.num_examples would under-report the accuracy.
    print('Accuracy {0}'.format(total_correct_preds/n_evaluated))

    writer.close()

The model's ***accuracy*** could be improved by adding one more **hidden layer** and **regularization**.

The model's ***time*** could be improved by using **dataset** instead of **placeholder.**

In [0]:
import numpy as np
import os
import urllib
import gzip
import shutil
import struct

os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

from matplotlib import pyplot as plt
import tensorflow as tf

def read_birth_life_data(file_name):
    """
    Read a tab-separated file, skipping its first (header) line.
    Columns 1 and 2 of each remaining line are parsed as floats.
    Return (data, n_samples): data is a float32 array holding one
    (column 1, column 2) pair per line, n_samples is the number of lines read.
    """
    with open(file_name) as handle:
        rows = handle.read().splitlines()[1:]

    fields = [row.split('\t') for row in rows]
    births = [float(cols[1]) for cols in fields]
    lifes = [float(cols[2]) for cols in fields]

    pairs = list(zip(births, lifes))
    n_samples = len(pairs)
    return np.asarray(pairs, dtype=np.float32), n_samples


def download_one_file(download_url,
                    local_dest,
                    expected_byte=None,
                    unzip_and_remove=False):
    """
    Download the file from download_url into local_dest
    if the file doesn't already exist.
    The download is skipped when local_dest, or local_dest without its last
    three characters (assumed to be a '.gz' suffix), is already present.
    If expected_byte is provided, check that the downloaded file has
    the same number of bytes; on a mismatch a message is printed and the
    file is left as downloaded, without unzipping.
    If unzip_and_remove is True, gunzip the file to local_dest[:-3] and
    remove the compressed file. This happens whether or not expected_byte
    is given.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    unzipped_dest = local_dest[:-3]
    if os.path.exists(local_dest) or os.path.exists(unzipped_dest):
        print('%s already exists' %local_dest)
        return

    print('Downloading %s' %download_url)
    urllib.request.urlretrieve(download_url, local_dest)

    if expected_byte:
        if os.stat(local_dest).st_size != expected_byte:
            print('The downloaded file has unexpected number of bytes')
            return
        print('Successfully downloaded %s' %local_dest)

    if unzip_and_remove:
        with gzip.open(local_dest, 'rb') as f_in, open(unzipped_dest, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.remove(local_dest)

def download_mnist(path):
    """
    Download and unzip the dataset mnist if it's not already downloaded
    Download from http://yann.lecun.com/exdb/mnist

    path: local directory that receives the four unzipped MNIST files;
    it is created first via safe_mkdir.
    """
    safe_mkdir(path)
    url = 'http://yann.lecun.com/exdb/mnist'
    filenames = ['train-images-idx3-ubyte.gz',
                'train-labels-idx1-ubyte.gz',
                't10k-images-idx3-ubyte.gz',
                't10k-labels-idx1-ubyte.gz']
    # Size in bytes of each compressed file, in the same order as filenames
    expected_bytes = [9912422, 28881, 1648877, 4542]

    for filename, byte in zip(filenames, expected_bytes):
        # URLs always use '/', so build them by concatenation; os.path.join
        # would insert the platform's path separator instead.
        download_url = url + '/' + filename
        local_dest = os.path.join(path, filename)
        download_one_file(download_url, local_dest, byte, True)

def safe_mkdir(path):
    """
    Create a directory, including any missing parent directories,
    if there isn't one already.

    os.mkdir cannot create a nested path such as 'data/mnist' when 'data'
    is missing, so os.makedirs is used. A failure is reported with a
    message rather than raised, so callers keep the original
    never-raises behaviour.
    """
    try:
        os.makedirs(path, exist_ok=True)
    except OSError as err:
        print('Could not create directory %s: %s' % (path, err))


def read_mnist(path, flatten=True, num_train=55000):
    """
    Read in the mnist dataset, given that the data is stored in path.
    The 'train' files are randomly permuted and split: the first num_train
    shuffled examples form the training set, the rest the validation set.
    Return three tuples of numpy arrays
    ((train_imgs, train_labels), (val_imgs, val_labels), (test_imgs, test_labels))
    """
    all_imgs, all_labels = parse_data(path, 'train', flatten)

    # Random (unseeded) permutation of the example indices
    shuffled = np.random.permutation(all_labels.shape[0])
    train_part = shuffled[:num_train]
    val_part = shuffled[num_train:]

    training = (all_imgs[train_part, :], all_labels[train_part, :])
    validation = (all_imgs[val_part, :], all_labels[val_part, :])
    testing = parse_data(path, 't10k', flatten)
    return training, validation, testing


def get_mnist_dataset(batch_size):
    """
    Download MNIST into 'data/mnist' if needed and return
    (train_data, test_data) as batched tf.data datasets.
    Images are read unflattened; only the training data is shuffled.
    The validation split returned by read_mnist is not used.
    """
    # Step 1: Read in data
    mnist_folder = 'data/mnist'
    download_mnist(mnist_folder)
    train, _, test = read_mnist(mnist_folder, flatten=False)

    # Step 2: Create the batched datasets
    train_data = (tf.data.Dataset.from_tensor_slices(train)
                  .shuffle(10000)  # shuffle buffer of 10000 examples
                  .batch(batch_size))
    test_data = tf.data.Dataset.from_tensor_slices(test).batch(batch_size)

    return train_data, test_data

def parse_data(path, dataset, flatten):
    """
    Parse the unzipped idx label and image files of one MNIST split.

    path: directory containing '<dataset>-labels-idx1-ubyte' and
          '<dataset>-images-idx3-ubyte'
    dataset: 'train' or 't10k'; anything else raises NameError
    flatten: if True, each image is reshaped to a single row

    Return (imgs, labels): imgs is float32 scaled by 1/255,
    labels is a (num, 10) one-hot array.
    """
    if dataset not in ('train', 't10k'):
        raise NameError('dataset must be train or t10k')

    labels_path = os.path.join(path, dataset + '-labels-idx1-ubyte')
    with open(labels_path, 'rb') as label_stream:
        # 8-byte big-endian header; the second field is the item count
        _, num = struct.unpack(">II", label_stream.read(8))
        raw_labels = np.fromfile(label_stream, dtype=np.int8)
        one_hot = np.zeros((num, 10))
        one_hot[np.arange(num), raw_labels] = 1

    images_path = os.path.join(path, dataset + '-images-idx3-ubyte')
    with open(images_path, 'rb') as image_stream:
        # 16-byte big-endian header: first field unused, then count, rows, cols
        _, num, rows, cols = struct.unpack(">IIII", image_stream.read(16))
        pixels = np.fromfile(image_stream, dtype=np.uint8)
        pixels = pixels.reshape(num, rows, cols)
        pixels = pixels.astype(np.float32) / 255.0
        if flatten:
            pixels = pixels.reshape([num, -1])

    return pixels, one_hot

def show(image):
    """
    Display the given image array in grayscale with matplotlib.
    The input type is not checked here; presumably a 2D array of pixel
    data, as produced by parse_data with flatten=False.
    """
    plt.imshow(image, cmap='gray')
    plt.show()

def huber_loss(labels, predictions, delta=14.0):
    """
    Huber loss between labels and predictions.
    Quadratic (0.5 * r^2) when the absolute residual r is below delta,
    linear (delta * r - 0.5 * delta^2) otherwise.
    The branch is selected with tf.cond on `residual < delta`.
    """
    residual = tf.abs(labels - predictions)
    return tf.cond(residual < delta,
                   lambda: 0.5 * tf.square(residual),
                   lambda: delta * residual - 0.5 * tf.square(delta))

In [0]:
import tensorflow as tf
tf.reset_default_graph()
import numpy as np
import time
import os

# import utils

# Hyperparameters

learning_rate = 0.001
batch_size = 128
n_epochs = 100
n_train = 60000
n_test = 10000
n_nodes = 1024  # width of the hidden layer

# Step 1: read data (images flattened to one row per example)
mnist_folder = 'data/mnist'
download_mnist(mnist_folder)
train, val, test = read_mnist(mnist_folder, flatten=True)


# Step 2: create datasets and iterator
train_data = tf.data.Dataset.from_tensor_slices(train)
train_data = train_data.shuffle(1000)
train_data = train_data.batch(batch_size)

test_data = tf.data.Dataset.from_tensor_slices(test)
test_data = test_data.batch(batch_size)

# One iterator shared by both datasets; running train_init / test_init
# selects which dataset it reads from.
iterator = tf.data.Iterator.from_structure(train_data.output_types,
                                           train_data.output_shapes)
img, label = iterator.get_next()

train_init = iterator.make_initializer(train_data)  # initializer for train data
test_init = iterator.make_initializer(test_data)    # initializer for test data

# Step 3: initialize weights and biases
# weights: random normal, mean 0, std-dev 0.01; biases: zeros
# layer 1: img (?, n_features) x w1 (n_features, n_nodes) + b1 (1, n_nodes)
# layer 2: hidden (?, n_nodes) x w2 (n_nodes, n_classes) + b2 (1, n_classes)
print(img.shape, label.shape)
w1 = tf.get_variable("weight1", shape=(int(img.shape[1]), n_nodes),
                     initializer=tf.random_normal_initializer(mean=0,
                                                              stddev=0.01))
b1 = tf.get_variable("bias1", shape=(1, n_nodes), initializer=
                     tf.zeros_initializer())

w2 = tf.get_variable("weight2", shape=(n_nodes, int(label.shape[1])),
                     initializer=tf.random_normal_initializer(mean=0,
                                                              stddev=0.01))
b2 = tf.get_variable("bias2", shape=(1, int(label.shape[1])), initializer=
                     tf.zeros_initializer())

# Step 4: model that returns the logits
logits1 = tf.add(tf.matmul(img, w1), b1)
relu_layer1 = tf.nn.relu(logits1)

# The output layer stays linear: softmax cross-entropy expects raw logits.
# Applying ReLU here would clamp every negative logit to zero.
logits2 = tf.add(tf.matmul(relu_layer1, w2), b2)


# Step 5: entropy and loss
entropy1 = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2,
                                                      labels=label,
                                                      name="loss")
loss = tf.reduce_mean(entropy1)


# Add L2 regularization on the output-layer weights
beta = 0.01
regularizer = tf.nn.l2_loss(w2)
loss = loss + beta * regularizer



# Step 6: optimizer
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)


# Step 7: number of correct predictions in a batch (a count, not a ratio)
correct_predictions = tf.equal(tf.argmax(logits2, 1), tf.argmax(label, 1))
accuracy = tf.reduce_sum(tf.cast(correct_predictions, tf.float32))


# Merge each group explicitly. tf.summary.merge_all() would collect every
# summary defined so far, so the accuracy op would also re-log the loss summaries.
with tf.name_scope("summary_loss"):
    loss_summaries = [tf.summary.scalar("loss", loss),
                      tf.summary.histogram("loss histogram", loss)]
    summary_op1 = tf.summary.merge(loss_summaries)

with tf.name_scope("summary_accuracy"):
    accuracy_summaries = [tf.summary.scalar("accuracy", accuracy),
                          tf.summary.histogram("accuracy histogram", accuracy)]
    summary_op2 = tf.summary.merge(accuracy_summaries)

writer = tf.summary.FileWriter("./graphs/logdatareg", tf.get_default_graph())

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    start_time = time.time()

    # Training: re-initialize the iterator on the training data each epoch
    # and consume batches until the dataset is exhausted.
    for i in range(n_epochs):
        sess.run(train_init)
        total_loss, n_batches = 0, 0
        while True:
            try:
                _, _loss, summary = sess.run([optimizer, loss, summary_op1])
            except tf.errors.OutOfRangeError:
                break
            total_loss += _loss
            n_batches += 1
        # Log the summary of the last batch of the epoch
        writer.add_summary(summary, global_step=i)
        print('Average loss epoch {0} : {1}'.format(i, total_loss/n_batches))
    print("total time: {0}".format(time.time() - start_time))

    # Evaluation: switch the iterator to the test data and sum the
    # per-batch counts of correct predictions.
    sess.run(test_init)
    total_correct_pred = 0
    i = 0
    while True:
        try:
            accuracy_batch, summary = sess.run([accuracy, summary_op2])
        except tf.errors.OutOfRangeError:
            break
        total_correct_pred += accuracy_batch
        i += 1
        writer.add_summary(summary, global_step=i)
    print("ACCURACY {0} ".format(total_correct_pred/n_test))
    writer.close()

