# Dataset

1. Load the CSV file (pandas, numpy)
2. Split the dataset. Example code:
   ```
   random.shuffle(data) # change if you are using pandas dataframe
   training = data[:int(len(data)*0.8)]
   test = data[int(len(data)*0.8):]

   fold5 = KFold(5) # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
   for train_idx, val_idx in fold5.split(training):
      sub_val = training[val_idx]
      sub_train = training[train_idx]
      clf = model(sub_train, sub_val, ...) # training the model, and evaluate it on validation dataset
      performance(clf, test) # test the model on test dataset
   ```

# Naive Bayes

1. model learning:

   Note:

   features: remove the attributes that are not related to words (the last four attributes)

   labels: the last column

   count P(c) -> how many samples are positive, and how many are negative

   if freq_word>0, then this word exists. You could use this to calculate P(a|c) -> for each class, what is the prob of each word

   remember to use laplace smoothing.

2. model evaluation (on val dataset -> performance(model, val)):
   
   for each new sample and each class, compute $\prod{P(a|c)}P(c)$ over the words that are in the email (freq_word > 0); then predict the class with the maximum value
   

   

# KNN
1. model learning: None

2. model evaluation (on val dataset): You could use each row (excluding the last column) as the feature vector of the email. You do not have to recalculate the frequency.

   ```
   Note:
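   parallel programming
   cosine similarity to calculate the similarity: numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b)) -- note that numpy.cos() is the element-wise trigonometric cosine, not cosine similarity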
   ```

# LR

1. model learning: You could use each row (excluding the last column) as the feature vector of the email. You do not have to recalculate the frequency.
    
    $y = sigmoid(MX)$

step 1: add one more column (all value is 1) in X -> X' = np.c_[np.ones((len(X), 1)), X]

step 2:vector M = np.random.randn(len(X[0])+1, 1);

key formulas for step 3 (Note: n is the size of the TRAINING dataset; $\cdot$ is the dot product):

1. $pred_y = sigmoid(X'\cdot M)$

2. $loss = -\sum(y\cdot log(pred_y)+(1-y)\cdot log(1-pred_y))/n$

3. $gm = X'^{T}\cdot (pred_y - y)/n$

Step 3 example code:
   ```
   #Step 3: performing gradient descent on whole dataset:
   best_model = M
   best_performance = 0
   for i in range(epoch):
     pred_y = ...
     gm = ...
     _p = performance(model, val)
     if _p > best_performance:
        best_model = M
        best_performance = _p
     M = M - learning_rate*gm
   ```

2. model evaluation(on val dataset):
  
   calculate pred_y, if more than 0.5, then the predicted label is 1.

# Model Evaluation

https://scikit-learn.org/stable/modules/model_evaluation.html

In [4]:
# Importing necessary libraries
# For data manipulation and analysis
import pandas as pd
 # For numerical computing
import numpy as np
# For handling file input/output
import io
 # For uploading files in Google Colab
from google.colab import files

# Function to split data into k folds for cross-validation
def k_fold_split(data, k):
    """Split ``data`` into ``k`` contiguous (train, validation) folds.

    The rows are cut by position into ``k`` consecutive validation slices of
    ``len(data) // k`` rows each; the last slice also takes any remainder.
    For every slice, the training part is all remaining rows (those before
    and after the slice) concatenated with ``pd.concat``. No shuffling is
    done here, so shuffle ``data`` beforehand if a random split is wanted.

    Args:
        data: A pandas DataFrame or Series (it must support ``.iloc`` and
            ``pd.concat``).
        k: Number of folds; must satisfy ``1 <= k <= len(data)``.

    Returns:
        A list of ``k`` tuples ``(train_fold, val_fold)``.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than ``len(data)``
            (which would otherwise produce empty validation folds).
    """
    n_samples = len(data)
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    if k > n_samples:
        raise ValueError(
            f"cannot split {n_samples} samples into {k} folds: "
            "k must not exceed the number of samples"
        )

    fold_size = n_samples // k
    folds = []
    for i in range(k):
        start = i * fold_size
        # The last fold absorbs the remainder when n_samples is not divisible by k
        end = (i + 1) * fold_size if i < k - 1 else n_samples
        # .iloc makes the slicing explicitly positional, whatever the index holds
        val_fold = data.iloc[start:end]
        train_fold = pd.concat([data.iloc[:start], data.iloc[end:]])
        folds.append((train_fold, val_fold))
    return folds

# Function to calculate accuracy
def accuracy_score(true_labels, predictions):
    """Return the fraction of predictions that equal the true labels.

    Args:
        true_labels: Sized iterable of ground-truth labels.
        predictions: Sized iterable of predicted labels, in the same order.

    Returns:
        The number of positions where both labels compare equal, divided by
        the number of labels (a float between 0 and 1).

    Raises:
        ValueError: If the two inputs differ in length, or if they are empty.
    """
    n_labels = len(true_labels)
    # zip() would silently truncate to the shorter input and skew the ratio
    if n_labels != len(predictions):
        raise ValueError(
            f"length mismatch: {n_labels} true labels vs "
            f"{len(predictions)} predictions"
        )
    if n_labels == 0:
        raise ValueError("cannot compute accuracy for an empty set of labels")

    n_correct = sum(1 for true, pred in zip(true_labels, predictions) if true == pred)
    return n_correct / n_labels

# Naive Bayes model class
class NaiveBayes:
    """Naive Bayes classifier over per-feature frequency values.

    ``train`` estimates a prior per class and, per class, a probability for
    every feature column from the add-one-smoothed column sums. ``predict``
    scores each sample with the class log-prior plus the log feature
    probabilities weighted by the sample's positive feature values.
    """

    def __init__(self):
        # class label -> prior probability P(c)
        self.class_probs = {}
        # class label -> pandas Series of P(feature | c), indexed by column name
        self.word_given_class_probs = {}

    def train(self, X, y):
        """Estimate class priors and per-class feature probabilities.

        Any state from a previous call is discarded, so the same instance can
        be retrained (for example once per cross-validation fold).

        Args:
            X: pandas DataFrame of feature values, one row per sample.
            y: pandas Series of class labels aligned with the rows of ``X``.
        """
        class_counts = y.value_counts().to_dict()
        total_samples = len(y)
        self.class_probs = {cls: count / total_samples for cls, count in class_counts.items()}

        # Build into a fresh dict: updating the old one in place would keep
        # entries for classes that are absent from this training set, and
        # predict() would then fail looking up their (missing) prior.
        word_given_class_probs = {}
        for cls in class_counts:
            sub_X = X[y == cls]
            # Laplace (add-one) smoothing keeps every probability non-zero
            word_counts = sub_X.sum(axis=0) + 1
            word_given_class_probs[cls] = word_counts / word_counts.sum()
        self.word_given_class_probs = word_given_class_probs

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Args:
            X: pandas DataFrame with the same feature columns used in training.

        Returns:
            A list with one predicted class label per row, in row order.
        """
        # Logs depend only on the trained model, so take them once up front
        log_priors = {cls: np.log(prior) for cls, prior in self.class_probs.items()}
        log_word_probs = {
            cls: np.log(probs) for cls, probs in self.word_given_class_probs.items()
        }

        results = []
        for _, sample in X.iterrows():
            # Only features with a positive value contribute to the score
            present = sample[sample > 0]
            class_scores = {}
            for cls, log_probs in log_word_probs.items():
                weighted = log_probs[present.index] * present
                class_scores[cls] = np.sum(weighted) + log_priors[cls]
            # Pick the class with the highest log-score
            results.append(max(class_scores, key=class_scores.get))
        return results

# Logistic Regression model class
class LogisticRegression:
    """Binary logistic-regression classifier fitted with batch gradient descent."""

    def __init__(self, learning_rate=0.01, num_iterations=1000):
        # Parameters are unset until fit() has been called
        self.weights = None
        self.bias = None
        # Gradient-descent step size and number of full passes over the data
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations

    def sigmoid(self, z):
        """Return 1 / (1 + exp(-z)), with z limited to [-500, 500] first."""
        bounded = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-bounded))

    def fit(self, X, y):
        """Learn weights and bias from features ``X`` and labels ``y``.

        ``X`` must expose ``.shape`` and ``.T``; training starts from
        all-zero weights and a zero bias every time this is called.
        """
        num_samples, num_features = X.shape
        self.weights = np.zeros(num_features)
        self.bias = 0
        inv_samples = 1 / num_samples

        for _step in range(self.num_iterations):
            # Forward pass: predicted probabilities for every sample
            probabilities = self.sigmoid(np.dot(X, self.weights) + self.bias)

            # Difference between predicted probability and label drives both gradients
            residual = probabilities - y
            grad_weights = inv_samples * np.dot(X.T, residual)
            grad_bias = inv_samples * np.sum(residual)

            # Move against the gradient
            self.weights -= self.learning_rate * grad_weights
            self.bias -= self.learning_rate * grad_bias

    def predict(self, X):
        """Return a list of 0/1 labels: 1 where the probability exceeds 0.5."""
        probabilities = self.sigmoid(np.dot(X, self.weights) + self.bias)
        labels = []
        for probability in probabilities:
            if probability > 0.5:
                labels.append(1)
            else:
                labels.append(0)
        return labels

# KNN model class
class KNN:
    """k-nearest-neighbours classifier using cosine similarity and majority vote."""

    def __init__(self, k):
        """Store the number of neighbours to vote with.

        Raises:
            ValueError: If ``k`` is smaller than 1 (a slice ``[-0:]`` would
                otherwise select every training row).
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.k = k

    def fit(self, X, y):
        """Memorise the training features ``X`` and labels ``y`` (no learning step)."""
        self.X_train = X
        self.y_train = y

    def predict(self, X_val):
        """Predict a label for every row of ``X_val``.

        Args:
            X_val: 2-D table of samples (e.g. a pandas DataFrame) with the
                same columns as the training features.

        Returns:
            A list with one predicted label per row, in row order.
        """
        # Convert once; doing it per query would repeat the same conversion
        train_matrix = np.asarray(self.X_train, dtype=float)
        train_labels = np.asarray(self.y_train)

        predictions = []
        for x in np.asarray(X_val, dtype=float):
            similarities = self.cosine_similarity(x, train_matrix)
            # argsort is ascending, so the last k entries are the most similar rows
            k_nearest_indices = np.argsort(similarities)[-self.k:]
            k_nearest_labels = list(train_labels[k_nearest_indices])
            # Majority vote among the neighbours
            predictions.append(max(set(k_nearest_labels), key=k_nearest_labels.count))
        return predictions

    def cosine_similarity(self, x, X_train):
        """Return the cosine similarity between ``x`` and every row of ``X_train``.

        Pairs where either vector has zero norm get similarity 0.0 instead of
        NaN; NaN would be sorted last by ``np.argsort`` and therefore be
        mistaken for the nearest neighbour.

        Args:
            x: 1-D feature vector.
            X_train: 2-D table with one training sample per row.

        Returns:
            1-D numpy array with one similarity per row of ``X_train``.
        """
        x = np.asarray(x, dtype=float)
        X_train = np.asarray(X_train, dtype=float)

        dot_product = np.dot(X_train, x)
        denominator = np.linalg.norm(x) * np.linalg.norm(X_train, axis=1)

        similarities = np.zeros_like(denominator)
        # Divide only where the denominator is positive; the rest stays 0.0
        np.divide(dot_product, denominator, out=similarities, where=denominator > 0)
        return similarities

# Function to evaluate models and print results
def evaluate_model(model, X_val, y_val, model_name):
    """Score ``model`` on a validation set, print the accuracy and return it.

    Args:
        model: Object with a ``predict(X_val)`` method returning one label
            per validation sample.
        X_val: Validation features passed to ``model.predict``.
        y_val: True labels for the validation samples.
        model_name: Label used as the prefix of the printed line.

    Returns:
        The accuracy computed by ``accuracy_score(y_val, predictions)``, so
        callers can collect or average the per-fold results.
    """
    predictions = model.predict(X_val)
    accuracy = accuracy_score(y_val, predictions)
    print(f"{model_name} Validation Accuracy: {accuracy}")
    return accuracy

# --- Load the data -------------------------------------------------------
# Ask the user for a file through the Colab upload widget
uploaded = files.upload()

# The upload result maps file names to contents; take the first name
filename = next(iter(uploaded))

# Parse the uploaded bytes as CSV
data = pd.read_csv(io.BytesIO(uploaded[filename]))

# Quick visual check that the file was parsed as expected
print(data.head())

# --- Prepare the data ----------------------------------------------------
# Shuffle all rows; the fixed seed makes the order reproducible
data = data.sample(frac=1, random_state=42)

# Every column but the last is a feature; the last column is the label
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Build the 5 (train, validation) folds from the shuffled data
k = 5
folds = k_fold_split(data, k)

# --- Cross-validation ----------------------------------------------------
# One instance per model, reused (and retrained) on every fold;
# the KNN model votes among its 5 nearest neighbours
nb = NaiveBayes()
lr = LogisticRegression()
knn = KNN(k=5)

for train_fold, val_fold in folds:
    # Separate features (all but last column) from labels (last column)
    X_train = train_fold.iloc[:, :-1]
    y_train = train_fold.iloc[:, -1]
    X_val = val_fold.iloc[:, :-1]
    y_val = val_fold.iloc[:, -1]

    # Naive Bayes: fit on the training part, report on the validation part
    nb.train(X_train, y_train)
    evaluate_model(nb, X_val, y_val, "Naive Bayes")

    # Logistic Regression: same procedure
    lr.fit(X_train, y_train)
    evaluate_model(lr, X_val, y_val, "Logistic Regression")

    # KNN: same procedure
    knn.fit(X_train, y_train)
    evaluate_model(knn, X_val, y_val, "KNN")


Saving spambase.csv to spambase (3).csv
   word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
0            0.00               0.64           0.64           0.0   
1            0.21               0.28           0.50           0.0   
2            0.06               0.00           0.71           0.0   
3            0.00               0.00           0.00           0.0   
4            0.00               0.00           0.00           0.0   

   word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \
0           0.32            0.00              0.00                0.00   
1           0.14            0.28              0.21                0.07   
2           1.23            0.19              0.19                0.12   
3           0.63            0.00              0.31                0.63   
4           0.63            0.00              0.31                0.63   

   word_freq_order  word_freq_mail  ...  char_freq_;  char_freq_(  \
0             0.00            0