# Machine Learning


## Content:

### A. [Supervised Learning](#1)

  * [Logistic Regression Classification](#2)
  * [K-Nearest Neighbour (KNN) Classification](#3)
  * [Support Vector Machine (SVM) Classification](#4)
  * [Naive Bayes Classification](#5)
  * [Decision Tree Classification](#6)
  * [Random Forest Classification](#7)
  * [Evaluation Classification Models](#8)
  
### B. [Unsupervised Learning](#9)

  * [Kmeans Clustering](#10)
  * [Evaluation of Clustering](#11)
  * [Standardization](#12)
  * [Hierarchy](#13)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file found under the Kaggle input directory,
# then print the paths one per line.
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the orthopedic-patients CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/biomechanical-features-of-orthopedic-patients/column_2C_weka.csv"
# Load the whole file into a DataFrame used throughout the notebook.
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Show the class balance before encoding. It is printed explicitly because a
# notebook cell only auto-displays its LAST expression; a bare value_counts()
# followed by another statement is silently discarded.
print(data["class"].value_counts())

# Encode the label column: "Abnormal" -> 1, everything else -> 0.
# The guard keeps the cell idempotent: once the column is numeric, comparing it
# with "Abnormal" again would overwrite every label with 0 on a re-run.
if not pd.api.types.is_numeric_dtype(data["class"]):
    data["class"] = (data["class"] == "Abnormal").astype("int64")

* So, we encoded the "Abnormal" values as 1 and the "Normal" values as 0.

In [None]:
# Display the label column to inspect the result of the encoding cell above.
data["class"]

In [None]:
# Target vector: the label column as a NumPy array.
y = data["class"].values
# Feature frame: every column except the label.
x = data.drop(columns=["class"])

## Normalization

In [None]:
# Min-max scale every feature column to the [0, 1] range.
# DataFrame.min()/max() reduce column-wise by default. np.min/np.max on a
# DataFrame forward axis=None to pandas, which on pandas >= 2.0 reduces over
# BOTH axes and returns one scalar: the scaling would stop being per feature
# and calling `.values` on that scalar raises AttributeError.
x_min, x_max = x.min(), x.max()
x_ = (x - x_min) / (x_max - x_min)

<a id="1"></a> <br>
# A. Supervised Learning

<a id="2"></a> <br>
# Logistic Regression Classification

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the normalized samples for testing; the fixed seed keeps the
# partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_, y, test_size=0.2, random_state=40
)

# The hand-written logistic regression below treats axis 1 as the sample axis
# (it divides by x_train.shape[1]), so the feature frames are transposed.
x_train, x_test = x_train.T, x_test.T
# .T leaves the 1-D label arrays unchanged.
y_train, y_test = y_train.T, y_test.T

## Parameter Initialize and Sigmoid Function

In [None]:
def initialize_weights_bias(dimension):
    """Create the starting parameters for the logistic regression model.

    Parameters
    ----------
    dimension : int
        Number of rows of the weight column vector.

    Returns
    -------
    tuple
        ``(w, b)`` where ``w`` is a ``(dimension, 1)`` array filled with 0.01
        and ``b`` is the float ``0.0``.
    """
    weights = np.full(shape=(dimension, 1), fill_value=0.01)
    bias = 0.0
    return weights, bias

def sigmoid(z):
    """Apply the logistic function ``1 / (1 + exp(-z))`` element-wise.

    ``z`` may be a scalar or a NumPy array; an array result has the same
    shape as the input.
    """
    return 1 / (1 + np.exp(-z))

## Forward and Backward Propagation

In [None]:
def forward_backward_propagation(w,b,x_train,y_train):
    """Run one forward/backward pass of logistic regression.

    The third parameter was previously misspelled ``x_teain``, so the body
    ignored its argument and read the global ``x_train`` instead. It is now
    named ``x_train`` and the function only uses what it is given.

    Parameters
    ----------
    w : ndarray
        Weight column vector; ``w.T`` is multiplied with ``x_train``.
    b : float
        Bias term added to every sample.
    x_train : array-like
        Features with one sample per column (``x_train.shape[1]`` is used as
        the number of samples).
    y_train : ndarray
        Labels (0 or 1), one per sample.

    Returns
    -------
    cost : float
        Mean binary cross-entropy over the samples.
    gradients : dict
        ``"derivative_weight"`` and ``"derivative_bias"``: the derivatives of
        the cost with respect to ``w`` and ``b``.
    """
    n_samples = x_train.shape[1]  # used to average the loss and the gradients

    # forward propagation
    z = np.dot(w.T,x_train) + b
    y_head = sigmoid(z)
    loss = -y_train*np.log(y_head)-(1-y_train)*np.log(1-y_head)
    cost = (np.sum(loss))/n_samples

    # backward propagation
    error = y_head - y_train
    derivative_weight = (np.dot(x_train,error.T))/n_samples
    derivative_bias = np.sum(error)/n_samples
    gradients = {"derivative_weight": derivative_weight, "derivative_bias": derivative_bias}

    return cost,gradients

## Updating Parameters

In [None]:
def update(w, b, x_train, y_train, learning_rate,number_of_iterarion):
    """Fit ``w`` and ``b`` by batch gradient descent and plot the cost curve.

    Parameters
    ----------
    w, b
        Initial weights and bias.
    x_train, y_train
        Training features and labels, passed unchanged to
        ``forward_backward_propagation``.
    learning_rate : float
        Step size applied to both gradients.
    number_of_iterarion : int
        Number of gradient-descent steps to run.

    Returns
    -------
    parameters : dict
        ``{"weight": w, "bias": b}`` after the last step.
    gradients : dict
        Gradients from the last step; an empty dict when no step was run.
    cost_list : list
        Cost recorded at every iteration.
    """
    cost_list = []           # cost at every iteration (returned)
    sampled_costs = []       # cost at every 10th iteration (plotted)
    sampled_iterations = []  # matching iteration numbers for the x-axis
    # Bound before the loop so the return below is valid even when
    # number_of_iterarion is 0 (it used to raise UnboundLocalError).
    gradients = {}

    for i in range(number_of_iterarion):
        cost,gradients = forward_backward_propagation(w,b,x_train,y_train)
        cost_list.append(cost)
        # gradient-descent step
        w = w - learning_rate * gradients["derivative_weight"]
        b = b - learning_rate * gradients["derivative_bias"]
        if i % 10 == 0:
            sampled_costs.append(cost)
            sampled_iterations.append(i)
            print ("Cost after iteration %i: %f" %(i, cost))

    parameters = {"weight": w,"bias": b}
    plt.plot(sampled_iterations,sampled_costs)
    plt.xticks(sampled_iterations,rotation='vertical')
    plt.xlabel("Number of Iterarion")
    plt.ylabel("Cost")
    plt.show()
    return parameters, gradients, cost_list

## Prediction

In [None]:
def predict(w,b,x_test):
    """Predict 0/1 class labels with a fitted logistic regression model.

    Parameters
    ----------
    w : ndarray
        Weight column vector; ``w.T`` is multiplied with ``x_test``.
    b : float
        Bias term.
    x_test : array-like
        Features with one sample per column.

    Returns
    -------
    ndarray
        Float array of shape ``(1, n_samples)``: 1.0 where the sigmoid output
        is above 0.5, otherwise 0.0 (exactly 0.5 maps to class 0).
    """
    probabilities = sigmoid(np.dot(w.T,x_test)+b)
    # Vectorised threshold on the first output row; replaces the per-sample
    # Python loop with the same `<= 0.5 -> 0` rule and the same (1, n) shape.
    Y_prediction = np.where(probabilities[0:1, :] <= 0.5, 0.0, 1.0)

    return Y_prediction

In [None]:
def logistic_regression(x_train, y_train, x_test, y_test, learning_rate, num_iterations):
    """Train the hand-written logistic regression and print its test accuracy.

    Weights are initialised with one row per row of ``x_train``, fitted with
    ``update`` (which also plots the cost curve), and then used to label
    ``x_test``. The accuracy is printed as a percentage; nothing is returned.
    """
    # One weight per feature row of x_train.
    n_features = x_train.shape[0]
    initial_w, initial_b = initialize_weights_bias(n_features)

    parameters, gradients, cost_list = update(
        initial_w, initial_b, x_train, y_train, learning_rate, num_iterations
    )

    y_prediction_test = predict(parameters["weight"], parameters["bias"], x_test)

    # Labels are 0/1, so the mean absolute difference is the error rate.
    test_error_percent = np.mean(np.abs(y_prediction_test - y_test)) * 100
    print("test accuracy: {} %".format(100 - test_error_percent))
    
# Train for 300 gradient-descent steps with a learning rate of 1 and print the
# resulting test accuracy.
logistic_regression(
    x_train,
    y_train,
    x_test,
    y_test,
    learning_rate=1,
    num_iterations=300,
)

## Sklearn with Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# The split was transposed earlier for the hand-written model, so it is
# transposed back here before being passed to scikit-learn.
lr = LogisticRegression()
lr.fit(x_train.T, y_train.T)
sklearn_accuracy = lr.score(x_test.T, y_test.T)
print("accurarcy {}".format(sklearn_accuracy))

<a id="3"></a> <br>
# K-Nearest Neighbour (KNN) Classification

In [None]:
# Rebuild the label vector and the feature frame from `data` for this section.
y = data["class"].values
x = data.drop(columns=["class"])

In [None]:
# Normalization: min-max scale each feature column to [0, 1].
# DataFrame.min()/max() reduce column-wise; np.min/np.max on a DataFrame
# collapse to one scalar on pandas >= 2.0, which would scale every feature by
# a single global range.
x_ = (x - x.min()) / (x.max() - x.min())

# Train Test Split
# Split the NORMALIZED features. KNN is distance based, so unscaled columns
# with large ranges would dominate the neighbour search. (The raw `x` used to
# be split here, leaving `x_` unused.)
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_,y,test_size=0.3,random_state=21)

# Knn Model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(x_train,y_train)
prediction = knn.predict(x_test)
print(" {} nn score: {} ".format(5,knn.score(x_test,y_test)))

In [None]:
# Find k value: fit one KNN model per k from 1 to 19 and record its accuracy
# on the held-out split.
k_values = range(1, 20)
score_list = []
for i in k_values:
    knn2 = KNeighborsClassifier(n_neighbors=i).fit(x_train, y_train)
    score_list.append(knn2.score(x_test, y_test))

# Accuracy as a function of k.
plt.plot(k_values, score_list)
plt.xlabel("k values")
plt.ylabel("accuracy")
plt.show()
    

<a id="4"></a> <br>
# Support Vector Machine (SVM) Classification

In [None]:
# Labels and features for this section.
y = data["class"].values 
x = data.drop(["class"],axis=1)

# Normalization: min-max scale each feature column to [0, 1].
# DataFrame.min()/max() reduce column-wise; np.min/np.max on a DataFrame
# collapse to one scalar on pandas >= 2.0.
x_ = (x - x.min()) / (x.max() - x.min())

# Train Test Split
# Split the NORMALIZED features so the classifier fitted on this split
# actually sees the scaled data. (The raw `x` used to be split here, leaving
# `x_` unused.)
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_,y,test_size=0.3,random_state=21)

In [None]:
# SVM
from sklearn.svm import SVC
# Fit a support-vector classifier on the training split and report its
# accuracy on the held-out split.
svm = SVC(random_state=1)
svm.fit(x_train, y_train)
svm_accuracy = svm.score(x_test, y_test)
print("print accuracy of svm algo: ", svm_accuracy)



<a id="5"></a> <br>
# Naive Bayes Classification

In [None]:
# Labels and features for this section.
y = data["class"].values 
x = data.drop(["class"],axis=1)

# Normalization: min-max scale each feature column to [0, 1].
# DataFrame.min()/max() reduce column-wise; np.min/np.max on a DataFrame
# collapse to one scalar on pandas >= 2.0.
x_ = (x - x.min()) / (x.max() - x.min())

# Train Test Split
# Split the NORMALIZED features, matching the logistic-regression section.
# (The raw `x` used to be split here, leaving `x_` unused.)
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_,y,test_size=0.3,random_state=21)

In [None]:
# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
# Fit Gaussian Naive Bayes on the training split and report its accuracy on
# the held-out split.
nb = GaussianNB()
nb.fit(x_train, y_train)
nb_accuracy = nb.score(x_test, y_test)
print("score: ", nb_accuracy)

<a id="6"></a> <br>
# Decision Tree Classification

In [None]:
# Labels and features for this section.
y = data["class"].values 
x = data.drop(["class"],axis=1)

# Normalization: min-max scale each feature column to [0, 1].
# DataFrame.min()/max() reduce column-wise; np.min/np.max on a DataFrame
# collapse to one scalar on pandas >= 2.0.
x_ = (x - x.min()) / (x.max() - x.min())

# Train Test Split
# Split the NORMALIZED features, matching the logistic-regression section.
# (The raw `x` used to be split here, leaving `x_` unused.)
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_,y,test_size=0.3,random_state=21)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seed the tree so the reported score is reproducible. scikit-learn permutes
# the candidate features at every split, so an unseeded tree can change from
# run to run. The seed value matches the other estimators in this notebook.
dt = DecisionTreeClassifier(random_state = 1)
dt.fit(x_train,y_train)

# Accuracy on the held-out split.
print("score: ", dt.score(x_test,y_test))

<a id="7"></a> <br>
# Random Forest Classification

In [None]:
# Labels and features for this section.
y = data["class"].values
x = data.drop(columns=["class"])

# Normalization
# NOTE(review): `x_` is computed here but the split below uses the raw `x`, so
# the scaled frame is never used in this section. Confirm whether that is
# intended before changing it; the evaluation section re-splits raw `x` too.
x_ = (x - np.min(x)) / (np.max(x) - np.min(x))

# Train Test Split: 30% held out, fixed seed for a repeatable partition.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=21
)

In [None]:
#%%  random forest
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded forest of 100 trees and report its accuracy on the held-out split.
rf = RandomForestClassifier(n_estimators=100, random_state=1)
rf.fit(x_train, y_train)
rf_accuracy = rf.score(x_test, y_test)
print("random forest algo result: ", rf_accuracy)

<a id="8"></a> <br>
# Evaluation Classification Models

In [None]:
# Labels and features for the evaluation section.
y = data["class"].values
x = data.drop(columns=["class"])

# Normalization
# NOTE(review): `x_` is computed but not used below; the split takes the raw `x`.
x_ = (x - np.min(x)) / (np.max(x) - np.min(x))

# Train Test Split: same arguments as the split in the Random Forest section.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=21
)

In [None]:
#%% confusion matrix
from sklearn.metrics import confusion_matrix
# %% cm visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Predictions of the fitted random forest next to the held-out labels.
y_pred = rf.predict(x_test)
y_true = y_test

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true, y_pred)

# Draw the matrix as an annotated heatmap with integer cell labels.
f, ax = plt.subplots(figsize=(5, 5))
heatmap_options = dict(annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
sns.heatmap(cm, **heatmap_options)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()


<a id="9"></a> <br>
# B. Unsupervised Learning

<a id="10"></a> <br>
# Kmeans Clustering

In [None]:
# Reload the CSV, replacing the frame whose `class` column was re-encoded earlier.
data = pd.read_csv("/kaggle/input/biomechanical-features-of-orthopedic-patients/column_2C_weka.csv")

# Scatter the two features that are clustered in the next cell.
x_feature, y_feature = 'pelvic_radius', 'degree_spondylolisthesis'
plt.scatter(data[x_feature], data[y_feature])
plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.show()

In [None]:
# Cluster on two features only.
data2 = data.loc[:,['degree_spondylolisthesis','pelvic_radius']]
from sklearn.cluster import KMeans
# Seeded so the cluster ids (and therefore the plot colours and the crosstab
# in the evaluation cell) are the same on every run. KMeans starts from random
# centroids and is otherwise non-deterministic.
kmeans = KMeans(n_clusters = 4, random_state = 1)
kmeans.fit(data2)
labels = kmeans.predict(data2)
# Colour each point by its assigned cluster.
plt.scatter(data['pelvic_radius'],data['degree_spondylolisthesis'],c=labels)
plt.xlabel('pelvic_radius')
plt.ylabel('degree_spondylolisthesis')
plt.show()

<a id="11"></a> <br>
# Evaluation of Clustering

In [None]:
# Cross-tabulate the cluster assignments against the known class to see how
# well the clusters line up with the labels.
df = pd.DataFrame({'labels': labels, "class": data['class']})
ct = pd.crosstab(index=df['labels'], columns=df['class'])
print(ct)

In [None]:
# inertia: fit KMeans for k = 1..7 and record each model's inertia.
# One slot is allocated per k actually evaluated. The previous np.empty(8)
# never wrote slot 0, so an uninitialised value was plotted at a non-existent
# "0 clusters" point.
cluster_counts = range(1, 8)
inertia_list = np.empty(len(cluster_counts))
for position, i in enumerate(cluster_counts):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(data2)
    inertia_list[position] = kmeans.inertia_
# Inertia as a function of k.
plt.plot(cluster_counts,inertia_list,'-o')
plt.xlabel('Number of cluster')
plt.ylabel('Inertia')
plt.show()

<a id="12"></a> <br>
# Standardization

In [None]:
# Reload the CSV and keep only the feature columns for clustering.
data = pd.read_csv("/kaggle/input/biomechanical-features-of-orthopedic-patients/column_2C_weka.csv")
data3 = data.drop(columns='class')

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
# Standardize every feature, then cluster into two groups in one pipeline.
scalar = StandardScaler()
# Seeded so the cluster ids (rows of the crosstab below) do not swap between
# runs; KMeans starts from random centroids.
kmeans = KMeans(n_clusters=2, random_state=1)
pipe = make_pipeline(scalar,kmeans)
pipe.fit(data3)
labels = pipe.predict(data3)
# Compare the cluster assignments with the known class.
df = pd.DataFrame({'labels':labels,"class":data["class"]})
ct = pd.crosstab(df['labels'],df['class'])
print(ct)

<a id="13"></a> <br>
# Hierarchy

In [None]:
from scipy.cluster.hierarchy import linkage,dendrogram
# Single-linkage hierarchical clustering on rows 200-219 of the feature frame;
# a small slice keeps the dendrogram readable.
sample_rows = data3.iloc[200:220]
merg = linkage(sample_rows, method='single')
dendrogram(merg, leaf_rotation=90, leaf_font_size=6)
plt.show()