##Linear Regression *(via Sum and Mean)*

In [0]:
import numpy as np
import pandas as pd

In [0]:
def estimate_coeff(x,y):
  n = np.size(x)
  m_x,m_y = np.mean(x),np.mean(y)

  SS_xy = np.sum(y*x) - n*m_y*m_x
  SS_xx = np.sum(x*x) - n*m_x*m_x

  b_1 = SS_xy/SS_xx
  b_0 = m_y - b_1 * m_x

  return(b_0,b_1)

In [0]:
if __name__=='__main__':
  x = np.array([0,1,2,3,4,5,6,7,8,9])
  y = np.array([1,3,2,5,7,8,8,9,10,12])

  b = estimate_coeff(x,y)
  print("Estimated coefficient:\nb_0={} \nb_1 = {}".format(b[0],b[1]))

Estimated coefficient:
b_0=1.2363636363636363 
b_1 = 1.1696969696969697


# Lin Reg *(via Matrix)*


In [0]:
import pandas as pd
import numpy as np

from google.colab import drive
drive.mount('/content/drive')

In [0]:
def reshape_data(x):
  x.reshape(-1,1)

In [0]:
def concat_ones(x):
  ones_data = np.ones(shape=x.shape[0]).reshape(-1,1)
  return np.concatenate((ones_data,x),1)

In [0]:
def fit_data(coeff):
  coeff = np.linalg.inv(x.transpose().dot(x)).dot(x.transpose()).dot(y)
  return coeff

In [10]:
if __name__ == "__main__":
  coeff = []
  data = pd.read_csv("/content/sample_data/california_housing_train.csv")
  print(data.head())
  
  x = data.drop(columns=['median_house_value']).values
  y = data['median_house_value'].values

  reshape_data(x)
  print("After Reshaping \n: ",x)

  x = concat_ones(x)
  print("After Concatenation \n: ",x,"\n")
  
  coeff = fit_data(coeff)
  var_count = 0
  for val in coeff:
    print("Coeff ",var_count,": ",val)
    var_count+=1

   longitude  latitude  ...  median_income  median_house_value
0    -114.31     34.19  ...         1.4936             66900.0
1    -114.47     34.40  ...         1.8200             80100.0
2    -114.56     33.69  ...         1.6509             85700.0
3    -114.57     33.64  ...         3.1917             73400.0
4    -114.57     33.57  ...         1.9250             65500.0

[5 rows x 9 columns]
After Reshaping 
:  [[-114.31     34.19     15.     ... 1015.      472.        1.4936]
 [-114.47     34.4      19.     ... 1129.      463.        1.82  ]
 [-114.56     33.69     17.     ...  333.      117.        1.6509]
 ...
 [-124.3      41.84     17.     ... 1244.      456.        3.0313]
 [-124.3      41.8      19.     ... 1298.      478.        1.9797]
 [-124.35     40.54     52.     ...  806.      270.        3.0147]]
After Concatenation 
:  [[ 1.0000e+00 -1.1431e+02  3.4190e+01 ...  1.0150e+03  4.7200e+02
   1.4936e+00]
 [ 1.0000e+00 -1.1447e+02  3.4400e+01 ...  1.1290e+03  4.6300e+02
 

# Linear Regression *(via Numpy Package)*

In [0]:
import pandas as pd
import numpy as np

from google.colab import drive
drive.mount('/content/drive')

In [9]:
if __name__ == "__main__":
  x1= data.drop(columns=['median_house_value']).values
  x1 = np.array(x).T
  y1 = data['median_house_value'].values  
  
  est = np.linalg.lstsq(x,y, rcond=None)[0]
  
  var_count = 0
  for val in est:
    print("EST ",var_count,": ",val)
    var_count+=1

EST  0 :  -3620600.892974251
EST  1 :  -43139.63725758856
EST  2 :  -42925.67308831491
EST  3 :  1150.6949324743741
EST  4 :  -8.378251213194744
EST  5 :  117.64854284208832
EST  6 :  -38.48877209597049
EST  7 :  45.43600255224828
EST  8 :  40507.06835939737


# KNN

In [0]:
import pandas as pd
import numpy as np
import operator

from google.colab import drive
drive.mount('/content/drive')

In [0]:
def euc_distance(x1, x2, length): 
    distance = 0
    for x in range(length):
        distance += np.square(x1[x] - x2[x])
    return np.sqrt(distance)

In [0]:
def knn (train,test,k):
  distance = {}

  #find no. of col
  length = test.shape[1]

  for x in range(len(train)):
    dist = euc_distance(test,train.iloc[x],length)
    distance[x] = dist[0]
  sort_dist = sorted(distance.items(),key=operator.itemgetter(1))

  #Place index of col to start w/
  neighbors = []
  for x in range(k):
    neighbors.append(sort_dist[x][0])
  
  #Calc freq class of rows
  Votes = {}
  for x in range(len(neighbors)):
    #get last col
    resp = train.iloc[neighbors[x]][-1]

    if resp in Votes:
      Votes[resp] +=1
    else:
      Votes[resp] = 1
  
  #Append variety to dict w/count
  sort_votes = sorted(Votes.items(),key=operator.itemgetter(1),reverse=True)
  
  return(sort_votes[0][0],neighbors)

In [194]:
if __name__ == "__main__":
  data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/iris.csv")
  
  test = [[5.1,3.5,1.4,.4]]
  test = pd.DataFrame(test)

  res,n = knn(data,test,6)
  print(res)
  print(n)
  

Setosa
[17, 40, 0, 21, 4, 26]


# NB Classifier (via sklearn)

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB,GaussianNB
from sklearn.metrics import classification_report,confusion_matrix

In [0]:
data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/iris.csv")

#Save class type and drop
result = data['variety']
data=data.drop(columns=['variety'])  

In [0]:
x_train,x_test,y_train,y_test = train_test_split(data,result,train_size=0.7)

In [128]:
x_train.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
84,5.4,3.0,4.5,1.5
81,5.5,2.4,3.7,1.0
110,6.5,3.2,5.1,2.0
107,7.3,2.9,6.3,1.8
140,6.7,3.1,5.6,2.4


In [129]:
y_train.head()

84     Versicolor
81     Versicolor
110     Virginica
107     Virginica
140     Virginica
Name: variety, dtype: object

In [0]:
bnb = BernoulliNB()
bnb.fit(x_train,y_train)
y_pred = bnb.predict(x_test)

In [150]:
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

      Setosa       0.00      0.00      0.00        19
  Versicolor       0.24      1.00      0.39        11
   Virginica       0.00      0.00      0.00        15

    accuracy                           0.24        45
   macro avg       0.08      0.33      0.13        45
weighted avg       0.06      0.24      0.10        45



  _warn_prf(average, modifier, msg_start, len(result))


In [151]:
print(confusion_matrix(y_test,y_pred))

[[ 0 19  0]
 [ 0 11  0]
 [ 0 15  0]]


In [153]:
#print(y_test)
#print(y_pred)
bnb.predict_proba(x_test)

array([[0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.29125667, 0.37543882, 0.33330451],
       [0.

#NB Classifier

In [0]:
import pandas as pd
import random as rd
import math
import csv

In [0]:
def split_test(data,split_ratio):
  train_size = int(len(data)* split_ratio)
  train_set = []
  tmp = list(data)

  while len(train_set) < train_size:
    index = random.randrange(len(tmp))
    train_set.append(tmp.pop(index))
  return[train_set,copy]


In [0]:
def sep_class(data):
  sep = []

  for i in range(len(data)):
    vec = data.iloc[i].values

In [188]:
data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/iris.csv")
dataset = csv.reader(open(r"/content/drive/My Drive/Colab Notebooks/iris.csv"))
dataset = list(dataset)
print(tmp)
vec = data.iloc[0].values
vec1 = dataset[1]
print(vec)
print(vec1)

['sepal.length', 'sepal.width', 'petal.length', 'petal.width', 'variety']
[5.1 3.5 1.4 0.2 'Setosa']
['5.1', '3.5', '1.4', '.2', 'Setosa']
