In [1]:
# load libraries
import numpy as np
from pandas import read_csv
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import PolynomialFeatures

# ML Stuff

### Supervised Machine Learning
- Training, Validation, Testing datasets
    - Training: used to train the model
    - Validation: used to tune the hyperparameters
        - Hyperparameters: parameters that are not learned by the model
        - modern models often handle this automatically
        - terminology has evolved so older sources may say validation but mean testing data
    - Testing: used to evaluate the model
- Cross Validation
    - iteratively train and test the model on different subsets of the data
    - allows you to use all of the data for training and testing without overfitting (hopefully)
    - Leave-One-Out Cross Validation (LOOCV)
        - train on all but one data point
        - test on the one data point
        - repeat for all data points
        - pros: uses all data for training and testing
        - cons: computationally expensive and can lead to overfitting
        - **Should not be used**
    - k-fold Cross Validation
        - split data into k subsets
        - train on k-1 subsets
        - test on the remaining subset
        - repeat for all subsets
        - pros: computationally efficient
        - cons: uses less data for training and testing
### We will use 2-fold cross validation for this course

### Useful Python Libraries
- NumPy
    - good for linear algebra
- scikit-learn
    - good for machine learning
- pandas
    - good for data manipulation

In [2]:
# Read the iris CSV into a DataFrame. The column names are supplied explicitly
# through `names`, so the first row of the file is read as data, not as a header.
url = "files/iris.csv"
names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'class',
]
dataset = read_csv(url, names=names)

FileNotFoundError: [Errno 2] No such file or directory: 'files/iris.csv'

In [None]:
# print(dataset.head(20))

Most scikit-learn library functions use the following convention:
- X is an array containing only the features (one column per feature, one row per sample).
- y is an array containing only the classes (one label per sample).
- Note: `test_size` must be set to 0.50 for 2-fold cross-validation, which we will be using in this class.

In [None]:
# Separate the raw table into a feature matrix and a label vector.
array = dataset.values
X = array[:, :4]  # first four columns: the flower measurements (petal length, etc.)
y = array[:, 4]   # fifth column: the flower name for each row
# Halve the data into two folds; each fold takes a turn as training set and test set.
# The fixed random_state makes the split repeatable between runs.
X_Fold1, X_Fold2, y_Fold1, y_Fold2 = train_test_split(
    X, y, test_size=0.50, random_state=1
)

In [None]:
# Gaussian Naive Bayes classifier, evaluated with 2-fold cross-validation.
model = GaussianNB()
# Round 1: train on Fold1, then predict the labels of the held-out Fold2.
pred1 = model.fit(X_Fold1, y_Fold1).predict(X_Fold2)
# Round 2: swap the roles -- the same estimator object is re-fitted on Fold2
# and then predicts the labels of Fold1.
pred2 = model.fit(X_Fold2, y_Fold2).predict(X_Fold1)

### Evaluating the Model
- used to quantify
    - desired performance vs actual performance
    - desired vs baseline performance
    - progress over time
- Accuracy
    - number of correct predictions / total number of predictions
    - good for balanced datasets
    - bad for unbalanced datasets
- Confusion Matrix
    - shows the number of correct and incorrect predictions
    - good for unbalanced datasets
    - at its most basic, made up of 4 values
        - true positives (TP)
        - true negatives (TN)
        - false positives (FP)
        - false negatives (FN)
        - FP and FN are often called Type I and Type II errors
        - <img src="images/FP_and_FN.png" alt="drawing" width="500"/>
    - accuracy, precision, recall, and F1 score can be calculated from the confusion matrix
        - accuracy = (TP + TN) / (TP + TN + FP + FN)
                - how often the model is correct
        - precision = TP / (TP + FP)
                - how often the model is correct when it predicts positive
        - recall = TP / (TP + FN)
                - how often the model predicts positive when the actual class is positive
        - F1 score = 2 * (precision * recall) / (precision + recall)
                - harmonic mean of precision and recall
                - good for unbalanced datasets
    - F-Score
        - F-Score or F-measure is used in statistical analysis of binary classification
        - F-Score is the harmonic mean of precision and recall
        - highest possible value is 1.0
        - lowest possible value is 0.0

### Multiclass Confusion Matrices
- confusion matrices can be extended to multiclass problems
    - e.g.
        - <img src="images/multiclass.png" alt="drawing" width="500"/>
        - precision of cat is from the horizontal cat row, 4/13
        - recall of cat is from the vertical cat column, 4/6
- there is no standard orientation of the matrix
    - i.e. the predicted and true labels can be on the rows or columns
    - so always read the labels
    - the diagonal is always the true positives
-


In [None]:
# Stitch the two held-out folds back together so that every sample is scored once.
# pred1 holds the predictions for Fold2 and pred2 those for Fold1, so the true
# labels must be joined in that same order (Fold2 first, then Fold1).
actual = np.concatenate((y_Fold2, y_Fold1))
predicted = np.concatenate((pred1, pred2))

# Overall fraction of correct predictions
print(f"Accuracy: {accuracy_score(actual, predicted)}")

# Counts of actual vs. predicted class for every pair of classes
print("Confusion Matrix:")
print(confusion_matrix(actual, predicted))

# Per-class text summary produced by classification_report
print("Classification Report:")
print(classification_report(actual, predicted))

### Regression Classifiers
- linear regression
    - single input variable
    - $y = b_0 + b_1x$
        - $y$ is the response
        - $b_0$ is the bias coefficient
        - $b_1$ is the coefficient for the input variable
    - training data is used to find the values of the coefficients
        - finding the best fit line
        - many different algorithms can be used to find the best fit line
            - ordinary least squares
            - gradient descent
            - stochastic gradient descent
            - etc...
    - once the coefficients are found, the model can be used to make predictions
        - $y = 0.5 + 0.8x$
        - $y = 0.5 + 0.8(5)$
        - $y = 4.5$ 
- polynomial regression
    - nonlinear relationship between the input and response
        - $y = b_0 + b_1x + b_2x^2 +$ ...    
    - formulas are typically represented as matrices
- multiple linear regression
    - multiple input variables
    - $y = b_0 + b_1x_1 + b_2x_2 +$ ...
    - formulas are typically represented as matrices 

In [None]:
# Toy data set: eight samples with two input features each, and one response per sample.
# NOTE: this cell rebinds `y` (the iris class labels) and `model` (the GaussianNB
# classifier) created in earlier cells; re-run those cells before reusing them.
x = np.array([[0, 1], [5, 1], [15, 2], [25, 5], [35, 11], [45, 15], [55, 34], [60, 35]])
y = np.array([4, 5, 20, 14, 32, 22, 38, 43])

# LinearRegression is imported in the notebook's first (imports) cell.
model = LinearRegression() #create model of type Linear Regression
model.fit(x, y)  #train model on data
# model = LinearRegression().fit(x, y) # oneliner for the above 2 lines

### Evaluating Regression Models
- $R^2$ is a measure of the fit
- can be obtained with `.score()`

In [None]:
# R^2 of the fitted model, measured on the same data it was trained on.
# It has to be printed explicitly: a notebook only echoes a cell's *last* bare
# expression, so a bare `model.score(x, y)` on the first line is computed and discarded.
print('R^2:', model.score(x, y))
# Fitted coefficients: the intercept and one weight per input column of x.
print('B_0:', model.intercept_)
print('[B_1 B_2]:', model.coef_)

### Regression
- Strengths
    - straightforward to understand and explain
    - can be regularized to avoid overfitting
    - easily updated with new data via gradient descent
- Weaknesses
    - assumes a linear relationship between the input and response
        - performs poorly with nonlinear relationships
    - not flexible enough to capture more complex relationships
        - e.g. polynomial regression 

### In Class 29Aug23

1. a) Polynomial regression, the data clearly does not follow a straight line
1. b) Linear regression, the data follows a straight line
2.  
- $y = 0.2 + 0.1x_1 + 0.05x_2$
- $x_1 = 5.1$
- $x_2 = 1.8$
- $y = 0.2 + 0.1(5.1) + 0.05(1.8)$
- $y = 0.2 + 0.51 + 0.09$
- $y = 0.8$
- The model predicts Iris-setosa

#### scikit-learn Algorithm for Regression
- needed for assignment 2
- doesn't work without other code (as of 29Aug23)

In [None]:
def regModel(name, model):
    """Expand both cross-validation folds into polynomial features for a named regression.

    Reads the module-level ``X_Fold1`` and ``X_Fold2`` arrays created by the
    train/test split cell.

    Parameters
    ----------
    name : str
        One of "Linear Regression", "2 Degree Polynomial Regression" or
        "3 Degree Polynomial Regression"; selects polynomial degree 1, 2 or 3.
    model
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    tuple
        ``(X_Poly1, X_Poly2)``: the polynomial-feature versions of ``X_Fold1``
        and ``X_Fold2``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported model names.
    """
    #Map each supported model name to the polynomial degree it stands for
    degree_by_name = {
        "Linear Regression": 1,
        "2 Degree Polynomial Regression": 2,
        "3 Degree Polynomial Regression": 3,
    }
    # Fail with a clear message instead of calling fit_transform on None
    if name not in degree_by_name:
        raise ValueError(
            f"Unknown regression model name {name!r}; expected one of {sorted(degree_by_name)}"
        )
    #Fit and transform data sets according to the regression degree
    poly_reg = PolynomialFeatures(degree=degree_by_name[name])
    #create 2 folds
    X_Poly1 = poly_reg.fit_transform(X_Fold1)
    X_Poly2 = poly_reg.fit_transform(X_Fold2)
    # Hand the transformed folds back so the work is not thrown away on return
    return X_Poly1, X_Poly2

In [None]:
def _predict_fold_labels(model, X_train, y_train, X_test):
    """Train ``model`` on one fold and return rounded, range-limited predictions for the other."""
    model.fit(X_train, y_train)
    fold_pred = model.predict(X_test).round()
    # Regression output is unbounded, so a rounded prediction can fall outside the label range.
    # Presumably the classes are encoded as 0, 1, 2 -- TODO confirm; the iris labels
    # loaded earlier in this notebook are flower-name strings, not numbers.
    fold_pred = np.where(fold_pred >= 3.0, 2.0, fold_pred)   # rounded values of 3 or more become 2
    fold_pred = np.where(fold_pred <= -1.0, 0.0, fold_pred)  # rounded values of -1 or less become 0
    return fold_pred


# NOTE(review): X_Poly1 / X_Poly2 are only assigned as local variables inside regModel(),
# so they are not defined at notebook level and this cell raises NameError as written.
pred1 = _predict_fold_labels(model, X_Poly1, y_Fold1, X_Poly2)  # train on fold 1, test on fold 2
pred2 = _predict_fold_labels(model, X_Poly2, y_Fold2, X_Poly1)  # train on fold 2, test on fold 1
# pred1 scores Fold2 and pred2 scores Fold1, so the true labels are joined in that order.
actual = np.concatenate([y_Fold2, y_Fold1])
predicted = np.concatenate([pred1, pred2])

### Naive Bayesian Classifiers
- simplest ML classifier
- gold standard for comparing other classifiers
    - if a new classifier is not better than a naive bayesian classifier, it is not worth using 
- based on Bayes' Theorem of conditional probability
    - $P(A|B) = \frac{P(B|A)P(A)}{P(B)}$
    - $P(A|B)$ is the probability of A given B
    - $P(B|A)$ is the probability of B given A
    - $P(A)$ is the probability of A
    - $P(B)$ is the probability of B
- mean and variance are used to summarize the data
    - mean  $\mu$
         - the average
        - $\mu = \frac{1}{n}\sum_{i=1}^{n}x_i $
    - variance $\sigma^2$
        - how much the data varies from the mean
        - $\sigma^2 = \frac{1}{n}\sum_{i=1}^{n}(x_i - \mu)^2 $
- NB Classifiers are conditional probability models
    - a sample to be classified is represented as a vector of features
        - $x = (x_1, x_2, x_3, ..., x_n)$
    - calculates the conditional probability of each class given the features
        - $P(C_k|x_1, x_2, x_3, ..., x_n)$
    - the class with the highest probability is the predicted class
- problem
    - if the number of features is large, classification by conditional probability is infeasible
    - thus the model is reformulated to be more tractable
        - the denominator is removed because it is effectively a constant
- reduced form
    - posterior numerator
        - posterior numerator = prior * likelihood
        - can estimate $p(x_k|C_i)$ from the training data
            - $p(x_k|C_i) = \frac{1}{\sqrt{2\pi\sigma_{ik}^2}}e^{-\frac{(x_k - \mu_{ik})^2}{2\sigma_{ik}^2}}$
            - $x_k$ is the value of feature k in the sample
            - $\mu_{ik}$ is the mean of feature k over the training samples of class $C_i$
            - $\sigma_{ik}^2$ is the variance of feature k over the training samples of class $C_i$
            - $C_i$ is the class
            - $e$ is Euler's number (2.71828...)

### Summary of Naive Bayes
- Strengths
    - simple and easy to implement
    - fast
    - good for high dimensional data
    - good for categorical data
    - good for text classification
- Weaknesses
    - assumes independence of features
    - assumes a gaussian distribution of features
    - based on probability theory
        - real world data is often more complex
    - can be outperformed by other classifiers
- Training
    - calculate one probability for each class
    - calculate n * m conditional probabilities
        - n is the number of classes
        - m is the number of features 

### In Class 31Aug23
- Given:
    - The iris data set contains 150 samples of data, 50 for each variety of iris: Iris-setosa, Iris-versicolor & Iris-virginica
    - We will use 149 samples of the data to train the classifier, and test it with one sample of Iris-virginica which has the following features:
        - sepal-length = 5.9
        - sepal-width = 3
        - petal-length = 5.1
        - petal-width = 1.8

1. posterior numerator each class
    - posterior numerator(Iris-setosa) = $p(Iris-setosa) \cdot p(sepal-length|Iris-setosa) \cdot p(sepal-width|Iris-setosa) \cdot p(petal-length|Iris-setosa) \cdot p(petal-width|Iris-setosa)$
    - posterior numerator(Iris-versicolor) = $p(Iris-versicolor) \cdot p(sepal-length|Iris-versicolor) \cdot p(sepal-width|Iris-versicolor) \cdot p(petal-length|Iris-versicolor) \cdot p(petal-width|Iris-versicolor)$
    - posterior numerator(Iris-virginica) = $p(Iris-virginica) \cdot p(sepal-length|Iris-virginica) \cdot p(sepal-width|Iris-virginica) \cdot p(petal-length|Iris-virginica) \cdot p(petal-width|Iris-virginica)$
2. P for each class
    - P(Iris-setosa) = 50/150 = 0.333
    - P(Iris-versicolor) = 50/150 = 0.333
    - P(Iris-virginica) = 50/150 = 0.333
3. given mean 5.0 and variance 0.12 for p(sepal-length | Iris-setosa) fill in the formula
    - $p(x_k|C_i) = \frac{1}{\sqrt{2\pi\sigma_{ik}^2}}e^{-\frac{(x_k - \mu_{ik})^2}{2\sigma_{ik}^2}}$ 
    - $p(sepal-length|Iris-setosa) = \frac{1}{\sqrt{2\pi(0.12)}}e^{-\frac{(5.9 - 5.0)^2}{2(0.12)}}$
    - $p(sepal-length|Iris-setosa) \approx 0.0394$ (0.12 is already the variance $\sigma^2$, so it is not squared again)
4. how many conditional probabilities are there?
    - 4 features * 3 classes = 12 conditional probabilities
5. $posterior numerator(Iris-setosa) = 0.005, posterior numerator(Iris-versicolor) = 0.002, posterior numerator(Iris-virginica) = 0.003$, which variety of iris is the sample most likely to be?
    - Iris-setosa because it has the highest posterior numerator

### Discriminant Analysis Classifiers
- <img src="images/discrimination_analysis.png" alt="drawing" width="500"/>
- linear and quadratic discriminant analysis
    - two classic classifiers
    - provide closed-form solutions which are easy to compute
    - inherently multiclass
    - proven to work well in practice
    - do not have hyperparameters to tune
- Variance 
    - how far a set of random numbers are spread from their average value
- Covariance
    - joint variability of two random variables (two features)
    - measures interdependence between two features
    - if the covariance is positive, the two features increase together
    - if the covariance is negative, the two features move in opposite directions
        - e.g. as one feature increases, the other decreases
    - magnitude of the covariance is not normalized so it is hard to interpret
        - the normalized version is called the correlation coefficient
            - correlation coefficient is always between -1 and 1

#### shrinkage function
- from scikit-learn
- a tool to improve the covariance matrix estimate when the number of samples is small compared to the number of features
- set shrinkage parameter of discriminant_analysis.LinearDiscriminantAnalysis to 'auto'
    - automatically determines the optimal shrinkage parameter
- 

#### LDA and QDA
- predictions can be derived from Bayes' Theorem
- LDA
    - assumes the covariance of the features is the same for each class
    - assumes the distribution of each class is normal
- QDA
    - assumes the covariance of the features is different for each class
    - assumes the distribution of each class is normal
- Both
    - model the covariance between features, so they do not assume the features are statistically independent of each other
    - use Bayes' Theorem to calculate the probability of each class
    - use the class with the highest probability as the predicted class
- Naive Bayes is a simplified version of the LDA formula
    - covariance matrix is assumed to be diagonal
        - i.e. the features are independent of each other
#### Model Comparison
- LDA
    - advantages
        - closed-form solution
        - inherently multiclass
        - proven to work well in practice
        - no hyperparameters to tune
        - use covariance to account for correlation between features
    - disadvantages
        - assumes the distribution of each class is gaussian
        - assumes the covariance of the features is the same for each class
        - can only learn linear boundaries
- QDA
    - advantages
        - closed-form solution
        - inherently multiclass
        - proven to work well in practice
        - no hyperparameters to tune
        - use covariance to account for correlation between features
    - disadvantages
        - assumes the distribution of each class is gaussian
        - assumes the covariance of the features is different for each class
        - can only learn quadratic boundaries

### k-Nearest Neighbors (kNN) Classifiers
- model representation is the entire training dataset on a graph
- no  model required, just store the training data
- easy to update by adding new training data or removing old bad data
- lazy learner
    - does not learn a discriminative function from the training data
    - makes predictions based on the entire training dataset
    - computationally expensive
- works by finding the k closest training samples in the feature space
    - the predicted class is the most common class among the k nearest neighbors
        - e.g. if k = 3 and the 3 nearest neighbors are 2 Iris-setosa and 1 Iris-versicolor, the predicted class is Iris-setosa
    - the distance between samples is calculated using a distance metric
        - e.g. Euclidean distance, Hamming, Manhattan distance, Minkowski distance, etc...
- Euclidean distance
    - distance between two points
    - good if the input variables are similar in type
        - e.g. all measured widths and heights
- Hamming distance
    - distance between binary vectors
- Manhattan distance
    - distance between two points measured along axes at right angles
    - good if the input variables are not similar in type
        - e.g. age, gender, height, etc...
- Minkowski distance
    - generalization of Euclidean and Manhattan distance
- varying the k value can affect the performance of the model
    - small k values can lead to overfitting
    - large k values can lead to underfitting
    - k should be odd if there are an even number of classes or even if there are an odd number of classes
        - e.g. if there are 3 classes, k should be even
        - e.g. if there are 4 classes, k should be odd
        - ties can be broken by expanding k by 1 and looking at the class of the next nearest neighbor

#### Curse of Dimensionality
- kNN works well for small numbers of features (dimensions) but not for large numbers of features
    - each feature adds a dimension to the feature space
    - as the number of features increases, the number of samples required to maintain accuracy increases exponentially
    - in high dimensions, points that may be similar may have very large distances between them
#### General
- rescaling data
    - kNN works well for data that is in the same scale
- address missing data 
    - missing data increases the distance between samples
- lower dimensionality
    - remove features that are not important
    - reduce the number of features using PCA
- advantages
    - simple and easy to implement
    - easy to update
    - makes no assumptions about the distribution or independence of the data
- disadvantages
    - memory intensive
    - performs poorly on high dimensional data
    - requires a meaningful distance function to calculate similarity 

### In Class 05Sep23
1. Matrix B because the covariances of all features are 0 except with themselves, and C because the covariances are all nearly 0 except with themselves
2. Matrix A because the covariance of feature c with a and b are nearly 0 but the covariance of a and b are slightly higher
3. Matrix A because the covariance of feature a and b are the highest of the 3 matrices
4. $d = \sqrt{(3-1)^2+(8-4)^2+(2-8)^2} \approx 7.48$
5. cat
6. because the 3 closest training data points are two cats and a dog

### Support Vector Machines (SVM) Classifiers
- SVMs are a set of supervised learning methods used for classification, regression, and outliers detection
- plots data and draws a line between classes
    - the line is called a hyperplane
    - the hyperplane is the decision boundary
- assumes that like things are in the same class
    - i.e. they clump together
- scikit-learn
    - svm.LinearSVC
        - linear SVM
        - uses one-vs-rest
        - can be used for multiclass classification
    - svm.NuSVC
        - non-linear SVM
        - uses one-vs-one
        - can be used for multiclass classification
    - svm.SVC
        - non-linear SVM
        - uses one-vs-one
        - can be used for multiclass classification
        - uses a kernel trick to transform the data into a higher dimension
            - e.g. from 2D to 3D
            - allows the data to be separated by a hyperplane
            - the kernel trick is computationally expensive
    - SVC vs NuSVC
        - different kernels
        - different regularization parameters
        - if the classes can be divided by a straight line
            - use LinearSVC or SVC/NuSVC with a linear kernel
        - if the classes cannot be divided by a straight line
            - use SVC/NuSVC with a non-linear kernel
- strengths
    - non-probabilistic
        - i.e. no assumption of gaussian distribution or feature independence
    - effective when features > samples
    - use subset of training points in decision function
        - support vectors
    - versatile
        - different kernel functions can be specified for the decision function
- weaknesses
    - over-fitting when features >> samples
    - kernel selection is critical


### Decision Trees
- good when boundaries between classes are non-linear
- non probabilistic because they don't assume specific feature dependence or independence
- you can make multivariate decision trees
    - they don't really improve accuracy and can reduce it
- weaknesses
    - tend to overfit
    - can be unstable
        - small changes in data may generate a completely different tree
    - impossible to determine the globally "optimal" tree structure
### Random Forests
-  created to mitigate the weaknesses of decision trees
- ensemble learning method
    - combines multiple decision trees to produce a better classifier
    - each tree is trained on a random subset of the training data
    - majority vote from the forest is used to determine the predicted class
- extremely randomized trees
    - similar to random forests
    - splits are chosen at random
    - splits are chosen from the entire feature set
    - splits are chosen to be the best possible split
    - splits are chosen to be the best possible split from a random subset of the feature set
- random forests and extremely randomized trees are effective because
    - they reduce overfitting
    - they reduce variance
    - they improve accuracy
    - they are computationally efficient

### In Class 07Sep23
- 1. a) LinearSVC because the classes can be easily split by a straight line
- 1. b) SVC/NuSVC because the classes cannot be easily split by a straight line
- 2. R1: IF (age<30) AND (student = no) THEN no
- 2. R2: IF (age<30) AND (student = yes) THEN yes
- 2. R3: IF (30<age<40) THEN yes
- 2. R4: IF (40<age) AND (credit_rating = excellent) THEN yes
- 2. R5: IF (40<age) AND (credit_rating = fair) THEN no
- 3.                 | Taxable Income |
                    /         |        \
                  <85k      85k-95k   >95k
                   |          |         |
                  (no)      (yes)      (no)