In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# TOC

[PCA using numpy](#tag1)

[Gradient descent](#tag2)

[K-means](#tag3)

[Naive Bayes](#tag4)

[KNN](#tag5)

In [None]:
# Load the house-sales CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv('/kaggle/input/housesalesprediction/kc_house_data.csv')
df.head()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Inspect the dtype of every column (the PCA section below filters on these).
df.dtypes

In [None]:
import matplotlib.pyplot as plt
# One histogram per plottable column; the trailing ';' suppresses the Axes-array repr.
df.hist(figsize=(8,6));

<a id='tag1'></a>
## PCA from Scratch

* Scale the matrix
* Get the covariance matrix
* Eigendecomposition of the covariance matrix (`cov_matrix`)
* Sort and find the n largest eigenvalues and their corresponding eigenvectors
* Get the dot product of the scaled matrix with the eigenvectors

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
## scale: standardise every non-object column except the target `price`
num_cols = [c for c in df.columns if df[c].dtype != 'object' and c != 'price']
sc = StandardScaler()
num_col_scale = sc.fit_transform(df[num_cols])

## get covariance matrix (np.cov expects variables in rows, hence the transpose)
cov_matrix = np.cov(num_col_scale.T)

## eigen value decomposition
## The covariance matrix is symmetric, so use `eigh`: it guarantees real
## eigenvalues/eigenvectors, whereas `eig` may return complex values and
## makes no promise about ordering.
eigval, eigvec = np.linalg.eigh(cov_matrix)

## `eigh` returns ascending eigenvalues; reorder to descending so that the
## cumulative curve below reads "variance explained by the first k components"
order = np.argsort(eigval)[::-1]
eigval, eigvec = eigval[order], eigvec[:, order]

## calculate explained variance
explained_var = eigval / np.sum(eigval)

## plot cumulative explained variance against the number of components kept
plt.plot(np.arange(1, len(explained_var) + 1), np.cumsum(explained_var), 'bo')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

In [None]:
eigval

In [None]:
## sort the eigen values from the largest to the smallest and keep the top n_components
n_components = 2
idx = eigval.argsort()[::-1]
## slice with n_components (previously a hard-coded 2 that ignored the setting above)
eigval_sorted = eigval[idx][:n_components]
## columns of eigvec are the eigenvectors, so reorder and truncate along axis 1
eigvec_sorted = eigvec[:, idx][:, :n_components]

In [None]:
## transform all the num_col: project onto the retained eigenvectors
## (the eigenvectors, not the eigenvalues); expected shape is (n_samples, n_components)
num_col_scale.dot(eigvec_sorted).shape

In [None]:
## get the dot product: scores of every sample on the first two principal components
pc1 = num_col_scale.dot(eigvec_sorted.T[0])
pc2 = num_col_scale.dot(eigvec_sorted.T[1])

import seaborn as sns
## x and y must be passed by keyword: seaborn >= 0.12 rejects positional data vectors
ax = sns.scatterplot(x=pc1, y=pc2, hue=df.price)
ax.set(xlabel='PC1', ylabel='PC2');

<a id='tag2'></a>
## Gradient descent 

Reference: https://towardsdatascience.com/gradient-descent-from-scratch-e8b75fa986cc

Taking the partial derivatives of the OLS loss function with respect to m and b:

$$\frac{\partial f}{\partial b} = \frac{1}{n}\Sigma^n_{i=1}[-2(y_i-(mx_i+b))]$$
$$\frac{\partial f}{\partial m} = \frac{1}{n}\Sigma^n_{i=1}[-2x_i(y_i-(mx_i+b))]$$

These are the gradients

and to update m and b

$m := m - \lambda \frac{ \partial{f}}{\partial{m}}$

$b := b- \lambda \frac{\partial{f}}{\partial{b}}$

In [None]:
from sklearn.metrics import mean_squared_error


def gradient_descent(X, y, lr=0.001, epoch=20):
    """Fit ``y ~ m*X + b`` by batch gradient descent on the mean squared error.

    Parameters
    ----------
    X, y : array-like of shape (n,)
        Single feature and target. Lists are accepted and converted to float
        arrays; both are flattened to 1-D.
    lr : float
        Learning rate applied to both gradients.
    epoch : int
        Number of full-batch update steps.

    Returns
    -------
    m, b : float
        Slope and intercept after the last step.
    log : list of (m, b)
        Parameter values after each step.
    mse : list of float
        Mean squared error after each step.

    Raises
    ------
    ValueError
        If the input is empty or ``X`` and ``y`` differ in length.
    """
    X = np.asarray(X, dtype=float).reshape(-1)
    y = np.asarray(y, dtype=float).reshape(-1)
    if X.size == 0:
        raise ValueError("gradient_descent needs at least one sample")
    if X.shape != y.shape:
        raise ValueError("X and y must have the same number of samples")

    # Fixed starting point for slope and intercept.
    m, b = 0.3, 0.4
    log, mse = [], []

    N = X.size

    for _ in range(epoch):
        # Residuals under the current parameters; both gradients use this
        # same snapshot, so the two updates are effectively simultaneous.
        residual = y - (m * X + b)

        # df/db = -2/N * sum(residual)
        b -= lr * (-2 * residual.sum() / N)

        # df/dm = -2/N * sum(x_i * residual_i)
        m -= lr * (-2 * X.dot(residual) / N)

        log.append((m, b))

        # MSE with the freshly updated parameters (plain numpy, no extra dependency).
        mse.append(np.mean((y - (m * X + b)) ** 2))

    return m, b, log, mse

In [None]:
# Seeded generator so the synthetic data (and every plot below) is reproducible.
rng = np.random.default_rng(42)
# Ten integer x-values drawn uniformly from 1..9 (upper bound is exclusive).
X = rng.integers(1, 10, size=10)
# Noise-free line; gradient descent should move towards m=0.44, b=0.6.
y = 0.44*X + 0.6

In [None]:
plt.plot(X, y, 'ro')

In [None]:
m, b, log, mse = gradient_descent(X, y, epoch=30)

In [None]:
# x-axis length follows the log, so it stays correct if the epoch count changes.
plt.plot(np.arange(len(log)), [step[0] for step in log])
plt.xlabel('epoch')
plt.ylabel('M value');

In [None]:
# x-axis length follows the log, so it stays correct if the epoch count changes.
plt.plot(np.arange(len(log)), [step[1] for step in log])
plt.xlabel('epoch')
plt.ylabel('b value');

In [None]:
# One MSE value per epoch; derive the x-axis from the list instead of hard-coding 30.
plt.plot(np.arange(len(mse)), mse)
plt.xlabel('epoch')
plt.ylabel('MSE');

<a id='tag3'></a>
## K means

Reference https://medium.com/@rishit.dagli/build-k-means-from-scratch-in-python-e46bf68aa875

In [None]:
df.columns

In [None]:
# Longitude on x and latitude on y so the scatter is oriented like a map.
plt.plot(df.long, df.lat, 'ro')
plt.xlabel('longitude')
plt.ylabel('latitude');

In [None]:
class KMeans:
    """Minimal k-means clustering (Lloyd's algorithm) on Euclidean distance.

    Parameters
    ----------
    k : int
        Number of clusters.
    tol : float
        Convergence threshold: fitting stops once every centroid's summed
        absolute percentage change between two iterations is <= ``tol``.
    max_iter : int
        Upper bound on the number of assignment/update iterations.
    random_state : int or None
        Seed for choosing the initial centroids; ``None`` gives a fresh,
        non-reproducible initialisation.

    Attributes set by ``fit``
    -------------------------
    centroids : dict[int, np.ndarray]
        Cluster index -> centroid.
    classes : dict[int, list[np.ndarray]]
        Cluster index -> samples assigned in the last iteration.
    """

    def __init__(self, k, tol=0.0001, max_iter=200, random_state=None):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter
        self.random_state = random_state

    @staticmethod
    def _percent_shift(previous, current):
        """Summed absolute percentage change between two centroid positions."""
        previous = np.asarray(previous, dtype=float)
        current = np.asarray(current, dtype=float)
        delta = np.abs(current - previous)
        scale = np.abs(previous)
        # Where the old coordinate is exactly 0 a relative change is undefined;
        # fall back to the absolute change there instead of dividing by zero.
        safe_scale = np.where(scale > 0, scale, 1.0)
        return float(np.sum(delta / safe_scale * 100.0))

    def fit(self, X):
        """Cluster the rows of ``X`` and store ``centroids`` / ``classes``.

        Raises ``ValueError`` if ``X`` has fewer rows than ``k``.
        Returns ``self``.
        """
        X = np.asarray(X, dtype=float)
        if len(X) < self.k:
            raise ValueError("need at least k samples to initialise k centroids")

        # Initialise with k distinct rows so no two clusters start identical.
        rng = np.random.default_rng(self.random_state)
        start_rows = rng.choice(len(X), size=self.k, replace=False)
        self.centroids = {i: X[row] for i, row in enumerate(start_rows)}

        for _ in range(self.max_iter):
            # Assignment step: attach every sample to its nearest centroid.
            self.classes = {j: [] for j in range(self.k)}
            for feature in X:
                self.classes[self.predict(feature)].append(feature)

            prev_centroids = dict(self.centroids)

            # Update step: move each centroid to the mean of its members.
            # A cluster that received no samples keeps its previous centroid
            # rather than becoming NaN.
            for class_, members in self.classes.items():
                if members:
                    self.centroids[class_] = np.average(members, axis=0)

            # Converged only if *no* centroid moved by more than tol percent.
            optimized = all(
                self._percent_shift(prev_centroids[c], self.centroids[c]) <= self.tol
                for c in self.centroids
            )
            if optimized:
                break

        return self

    def predict(self, test_X):
        """Return the index of the centroid closest to the single sample ``test_X``."""
        sample = np.asarray(test_X, dtype=float)
        distances = [np.linalg.norm(sample - self.centroids[c]) for c in self.centroids]
        return distances.index(min(distances))

<a id='tag4'></a>

## Naive Bayes Classifier

[ref1](https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/supervised_learning/naive_bayes.py)

[ref2](https://github.com/tigju/Naive-Bayes-Classifier-from-scratch/blob/main/naive_bayes.ipynb)

[ref3](https://chrisalbon.com/code/machine_learning/naive_bayes/naive_bayes_classifier_from_scratch/)

In [None]:
## this example is from : https://chrisalbon.com/code/machine_learning/naive_bayes/naive_bayes_classifier_from_scratch/
import pandas as pd

df = pd.read_csv('/kaggle/input/housesalesprediction/kc_house_data.csv')

In [None]:
# Take an explicit copy: the next cells modify `df_nb`, and writing to a bare
# slice of `df` triggers pandas' SettingWithCopyWarning.
df_nb = df[['bedrooms', 'floors', 'sqft_living', 'grade']].copy()

In [None]:
## for simplicity we use two classes: grade > 8 (True) vs. everything else (False)
## Built from the untouched `df.grade` and assigned into a new frame, so
## re-running this cell yields the same labels instead of re-thresholding booleans.
df_nb = df_nb.assign(grade=df['grade'] > 8)

In [None]:
df_nb.dtypes



$$ p (grade| condition) = \frac{p(grade)p(condition|grade)}{p(condition)}$$

### calculate prior

$p(grade)$ is prior 

In [None]:

# Feature ("condition") columns are everything except the target `grade`;
# the former `[:5]` slice on this 4-column frame included the target as well.
condition_cols = df_nb.columns.drop('grade')
condition_cols


In [None]:
# Prior p(grade): share of rows in each class, kept as a one-column DataFrame.
class_counts = df_nb.groupby('grade')['bedrooms'].count()
priors = (class_counts / len(df_nb)).to_frame()

In [None]:
priors

### Likelihood

To get the likelihood $p(condition|grade)$, we first assume that each feature is normally distributed within each class, so that the likelihood of a value is given by the normal distribution pdf

https://en.wikipedia.org/wiki/Normal_distribution

$$f(x) = \frac{1}{\sigma \sqrt{2\pi}} e^{-\frac{1}{2}(\frac{x-\mu}{\sigma})^2}$$

In [None]:
## to get the mean
# Per-class mean of every feature (one row per `grade` value).
df_mean = df_nb.groupby('grade').mean()
df_mean

In [None]:
# Per-class variance of every feature (pandas' default sample variance, ddof=1).
df_variance = df_nb.groupby('grade').var()
df_variance

In [None]:
## now we calculate the mean and variance for each condition

## One row of statistics per class, selected once. Each frame is masked with
## its own index (the mean look-ups previously borrowed df_variance's index).
mean_true = df_mean[df_mean.index == True].iloc[0]
mean_false = df_mean[df_mean.index == False].iloc[0]
var_true = df_variance[df_variance.index == True].iloc[0]
var_false = df_variance[df_variance.index == False].iloc[0]

## mean for grade=True
bed_true_mean = mean_true['bedrooms']
floor_true_mean = mean_true['floors']
sqft_true_mean = mean_true['sqft_living']

## mean for grade=False
bed_false_mean = mean_false['bedrooms']
floor_false_mean = mean_false['floors']
sqft_false_mean = mean_false['sqft_living']

## var for grade=True
bed_true_var = var_true['bedrooms']
floor_true_var = var_true['floors']
sqft_true_var = var_true['sqft_living']

## var for grade = False
bed_false_var = var_false['bedrooms']
floor_false_var = var_false['floors']
sqft_false_var = var_false['sqft_living']


In [None]:
import numpy as np
# Create a function that calculates p(x | y):
def p_x_given_y(x, mean_y, variance_y):
    """Gaussian density of ``x`` for a class with the given mean and variance.

    Evaluates ``1 / sqrt(2*pi*variance_y) * exp(-(x - mean_y)**2 / (2*variance_y))``.
    """
    # Normalising constant of the normal pdf.
    coefficient = 1/(np.sqrt(2*np.pi*variance_y))
    # Exponent: negative squared deviation scaled by twice the variance.
    exponent = (-(x-mean_y)**2)/(2*variance_y)
    return coefficient * np.exp(exponent)

### Putting together 

Since the denominator of the equation, $p(condition)$, is the marginal probability and is the same for every class, we can ignore it and treat the posterior as proportional to the numerator, thus

$p(grade|condition) \propto p(grade) p(condition|grade)$

In [None]:
df_nb.head(1)

In [None]:
df_nb.head(1)['bedrooms'].values[0]

In [None]:
## numerator of posterior if classified as True

# Feature values of the first house in df_nb.
bedrooms_0 = df_nb['bedrooms'].iloc[0]
floors_0 = df_nb['floors'].iloc[0]
sqft_0 = df_nb['sqft_living'].iloc[0]

prior_true = priors[priors.index==True].values[0][0]

# prior * product of the per-feature Gaussian likelihoods (naive independence)
(
    prior_true
    * p_x_given_y(bedrooms_0, bed_true_mean, bed_true_var)
    * p_x_given_y(floors_0, floor_true_mean, floor_true_var)
    * p_x_given_y(sqft_0, sqft_true_mean, sqft_true_var)
)

In [None]:
## numerator of posterior if classified as false

# Feature values of the first house in df_nb.
bedrooms_0 = df_nb['bedrooms'].iloc[0]
floors_0 = df_nb['floors'].iloc[0]
sqft_0 = df_nb['sqft_living'].iloc[0]

prior_false = priors[priors.index==False].values[0][0]

# prior * product of the per-feature Gaussian likelihoods (naive independence)
(
    prior_false
    * p_x_given_y(bedrooms_0, bed_false_mean, bed_false_var)
    * p_x_given_y(floors_0, floor_false_mean, floor_false_var)
    * p_x_given_y(sqft_0, sqft_false_mean, sqft_false_var)
)

Therefore, the unnormalized posterior $p(grade=F)\,p(condition|grade=F)$ is higher than $p(grade=T)\,p(condition|grade=T)$, so this house is classified as grade = False.

<a id='tag5'></a>

## KNN


* For each data point, find the classes of the closest N neighbours
* Use a majority vote to determine the class

In [None]:
  
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    Parameters
    ----------
    k : int
        Number of nearest training samples that vote on each prediction.
    """

    def __init__(self, k):
        self.k = k

    def _vote(self, neighbour_labels):
        """Return the most common label among ``neighbour_labels``.

        Labels are cast to int for ``np.bincount``, so they must be
        non-negative integer class ids; ties go to the smallest label.
        """
        counts = np.bincount(np.asarray(neighbour_labels).astype('int'))
        return counts.argmax()

    def predict(self, X_train, X_test, y_train):
        """Predict a label for every row of ``X_test``.

        ``X_train`` and ``X_test`` are 2-D (samples x features); ``y_train``
        holds one label per training row. Returns a float array with one
        predicted label per test row.
        """
        X_train = np.asarray(X_train, dtype=float)
        X_test = np.asarray(X_test, dtype=float)
        y_train = np.asarray(y_train)

        ## initialize
        y_pred = np.empty(X_test.shape[0])

        for i, sample in enumerate(X_test):
            ## Euclidean distance from this test sample to every training sample
            distances = np.linalg.norm(X_train - sample, axis=1)

            ## indices of the k *closest* training samples (the first k after an
            ## ascending sort); a stable sort keeps tie-breaking deterministic
            nearest_idx = np.argsort(distances, kind='stable')[:self.k]

            ## majority vote over the neighbours' labels
            y_pred[i] = self._vote(y_train[nearest_idx])

        return y_pred
            
            