<a href="https://colab.research.google.com/github/reagenhuskey/cs290/blob/main/partnerProject2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

In [None]:
# Penguins dataset, downloaded as CSV from GitHub.
PENGUINS_CSV_URL = "https://github.com/mbrudd/csci290/raw/refs/heads/main/data/penguins.csv"
penguins = pd.read_csv(PENGUINS_CSV_URL)

In [None]:
# List the column names to see which features and target are available.
penguins.columns

Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex', 'year'],
      dtype='object')

In [None]:
# Preview the first rows to inspect value formats and spot missing data.
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [None]:
# Drop every row that has a missing value in ANY column (not only the selected features).
penguins = penguins.dropna()

# Feature matrix built from the selected columns. The explicit .copy() makes X an
# independent DataFrame, so assigning to its columns later does not trigger pandas'
# SettingWithCopyWarning (X would otherwise be a slice of `penguins`).
X = penguins[['island','bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']].copy()
y = penguins['species']  # target

In [None]:
# Per-species counts followed by the total number of labels
# (the two quantities the prior probabilities are computed from).
print(y.value_counts(), y.shape[0], sep="\n")

species
Adelie       146
Gentoo       119
Chinstrap     68
Name: count, dtype: int64
333


In [None]:
# Encoder that maps the categorical 'island' names to integer codes.
encoder = LabelEncoder()

In [None]:
# Encode the island names as integer codes.
# - The encoder is always fitted on the original string labels stored in `penguins`
#   (same rows as X), so re-running this cell does not refit it on integer codes.
# - `assign` returns a new DataFrame instead of writing into a slice of `penguins`,
#   which avoids pandas' SettingWithCopyWarning.
X = X.assign(island=encoder.fit_transform(penguins.loc[X.index, 'island']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['island'] = encoder.fit_transform(X['island'])


In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
# Prior probability of each species: normalize=True divides every class count by the
# total number of rows, turning the counts into proportions that sum to 1.
priors = penguins["species"].value_counts( normalize=True )
priors

Unnamed: 0_level_0,proportion
species,Unnamed: 1_level_1
Adelie,0.438438
Gentoo,0.357357
Chinstrap,0.204204


In [None]:
# Single observation to classify; keys follow the same order as the columns of X.
new_penguin = dict(
    island='Torgersen',
    bill_length_mm=39.1,
    bill_depth_mm=18.7,
    flipper_length_mm=181.0,
    body_mass_g=3750.0,
)

# New instance to classify (its island name is label-encoded in the next cell)

In [None]:
# Replace the island name with its integer code; [0] extracts the scalar from the
# one-element array returned by transform.
# The isinstance guard keeps the cell re-runnable: once the value has been encoded it
# is no longer a string, and passing it back to the encoder would raise an error.
if isinstance(new_penguin['island'], str):
    new_penguin['island'] = encoder.transform([new_penguin['island']])[0]

# Implemented Naive Bayes

In [None]:
def find_priors(y):
    """Return the prior probability of every class found in ``y``.

    ``value_counts(normalize=True)`` divides each class count by the total number of
    labels, so each entry is the relative frequency of that class.
    """
    class_proportions = y.value_counts(normalize=True)
    return class_proportions

In [None]:
def find_likelihoods(X, y, categorical_features=('island',)):
    """Estimate the class-conditional likelihood model of every feature in X.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one column per feature.
    y : pandas.Series
        Class label of every row of X (used as a boolean row filter on X).
    categorical_features : str or collection of str, default ('island',)
        Columns modelled by relative frequencies. Every other column is modelled
        by a Gaussian fitted per class.

    Returns
    -------
    dict
        ``likelihood[feature][cls]`` is
        - for a numeric feature: a callable ``pdf(x)`` giving the Gaussian density
          with that class's mean and standard deviation;
        - for a categorical feature: a dict mapping each observed value to its
          relative frequency within the class.

    Notes
    -----
    The standard deviation comes from pandas' ``Series.std`` (sample standard
    deviation). A class with a single row or a constant feature gives a NaN or zero
    standard deviation, so the density is undefined there; this is not handled.
    """
    # A bare string would otherwise be tested by substring matching.
    if isinstance(categorical_features, str):
        categorical_features = (categorical_features,)
    categorical = set(categorical_features)

    # Pre-create the outer dict so features keep the column order of X.
    likelihood = {f: {} for f in X.columns}

    for c in y.unique():  # loop through each unique class/species
        class_rows = X[y == c]  # filter once per class instead of once per (feature, class)
        for f in X.columns:
            if f in categorical:
                # Relative frequency of each observed value within this class.
                likelihood[f][c] = class_rows[f].value_counts(normalize=True).to_dict()
            else:
                mean = class_rows[f].mean()
                std = class_rows[f].std()
                # mean/std are bound as default arguments so that each lambda keeps
                # its own values instead of the last ones assigned in the loop.
                likelihood[f][c] = lambda x, mean=mean, std=std: (1 / (np.sqrt(2 * np.pi) * std)) * np.exp(-((x - mean)**2 / (2 * std**2)))
    return likelihood



In [None]:
def nbc_predict(instance, priors, likelihoods):
    """Return the most probable class for ``instance`` under naive Bayes.

    Parameters
    ----------
    instance : dict
        Maps feature name -> observed value.
    priors : mapping
        Maps class -> prior probability (a dict or a pandas Series both work).
    likelihoods : dict
        ``likelihoods[feature][cls]`` is either a callable density (numeric feature)
        or a dict of value -> probability (categorical feature).

    Returns
    -------
    The class whose unnormalised posterior (prior times the product of the feature
    likelihoods) is largest. Ties go to the class that comes first in ``priors``.
    """
    posteriors = {}

    for c in priors.keys():
        score = priors[c]  # start from the prior probability of the class
        for f, value in instance.items():
            model = likelihoods[f][c]
            if callable(model):
                # Numeric feature: evaluate the class-conditional density.
                score *= model(value)
            else:
                # Categorical feature: a value never observed for this class has
                # likelihood 0. Skipping the factor would act as likelihood 1 and
                # unfairly favour classes that never produced this value.
                score *= model.get(value, 0.0)
        posteriors[c] = score

    return max(posteriors, key=posteriors.get)  # class with the highest posterior


In [None]:
def naive_bayes(X, y, new_instance):
    """Fit naive Bayes on (X, y) and classify ``new_instance``.

    The class priors and the per-feature likelihoods are estimated from the data,
    then the class with the highest posterior for the observed features is returned.
    """
    return nbc_predict(new_instance, find_priors(y), find_likelihoods(X, y))

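*Study notes: know Bayes' theorem and how it works, $P(c \mid x) = \dfrac{P(x \mid c)\,P(c)}{P(x)}$; know what the priors $P(c)$ and the likelihoods $P(x_i \mid c)$ are and how to calculate them; and know that naive Bayes assumes the features are independent given the class, so $P(c \mid x) \propto P(c)\prod_i P(x_i \mid c)$.*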

# Predictions

In [None]:
# Classify the new penguin with the from-scratch implementation, fitted on all of X and y.
prediction = naive_bayes(X=X, y=y, new_instance=new_penguin)
print(f"Predicted species: {prediction}")

Predicted species: Adelie


In [None]:
# scikit-learn's Gaussian naive Bayes, fitted on the training split only.
gnb = GaussianNB()
gnb.fit(X=X_train, y=y_train)

In [None]:
# Wrap the single instance in a one-row DataFrame so its columns match the fitted features,
# then take the only element of the prediction array.
new_penguin_frame = pd.DataFrame([new_penguin])
predicted_species = gnb.predict(new_penguin_frame)[0]
print(f"Predicted species: {predicted_species}")

Predicted species: Adelie
