# In [None]:
# 1st cell: Import necessary libraries
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For plotting and visualizations
import seaborn as sns  # For statistical data visualization
from sklearn.model_selection import StratifiedKFold, cross_val_score  # For stratified K-fold cross-validation
from sklearn.linear_model import LogisticRegression  # Logistic Regression model
from sklearn.tree import DecisionTreeClassifier  # Decision Tree model
from sklearn.ensemble import RandomForestClassifier  # Random Forest model
from sklearn.neighbors import KNeighborsClassifier  # K-Nearest Neighbors model
from sklearn.naive_bayes import GaussianNB  # Gaussian Naive Bayes model

# 2nd cell
# Load the assignment dataset; the path is relative to the working directory.
penguins_data = pd.read_csv('Assignment 8.csv')
penguins_data.head()

# 3rd cell
# Quick overview of the raw data. info() prints by itself; the other two
# summaries are wrapped in print() because a bare expression is only displayed
# when it is the last statement of a notebook cell (and never in a script).
penguins_data.info()
print(penguins_data.species.value_counts())
print(penguins_data.isnull().sum())


# Replace missing values in every float column with that column's mean.
for column_name in penguins_data.select_dtypes(include='float').columns:
    penguins_data[column_name] = penguins_data[column_name].fillna(penguins_data[column_name].mean())

# Forward-fill missing 'sex' values. ffill() cannot fill a gap at the very
# start of the column (there is no previous value), so bfill() covers that case.
penguins_data['sex'] = penguins_data['sex'].ffill().bfill()


# Bill length against bill depth; species is encoded by both colour (hue)
# and marker shape (style).
sns.scatterplot(
    data=penguins_data,
    x='bill_length_mm',
    y='bill_depth_mm',
    hue='species',
    style='species',
)


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Integer-encode the categorical feature columns. Each column gets its own
# encoder, stored in `label_encoders`, so every fitted mapping stays available
# (a single shared encoder would only remember the last column it was fit on).
label_encoders = {}
for col in ['sex', 'island']:
    label_encoder = LabelEncoder()
    penguins_data[col] = label_encoder.fit_transform(penguins_data[col])
    label_encoders[col] = label_encoder


# Standardise the numeric measurements. StandardScaler treats each column
# independently, so fitting once on all four columns yields the same values as
# fitting column by column, while leaving `scaler` fitted on the full set.
numeric_columns = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
scaler = StandardScaler()
penguins_data[numeric_columns] = scaler.fit_transform(penguins_data[numeric_columns])

# 5th cell
from sklearn.decomposition import PCA


# Separate the target column from the feature columns.
Y = penguins_data['species']  # Target variable
X = penguins_data.drop(columns='species')  # Features


# Fit a 3-component PCA on the features and keep the projected data.
PCA_TR = PCA(n_components=3)
X_train = PCA_TR.fit_transform(X)


# Fraction of the total variance attributed to each retained component.
PCA_TR.explained_variance_ratio_


# Component weights laid out as a table: one row per component,
# one column per original feature.
component_labels = [f'PC-{number}' for number in range(1, 4)]
pd.DataFrame(data=PCA_TR.components_, index=component_labels, columns=X.columns)


# Candidate classifiers to compare. The tree-based models are seeded so that
# repeated runs report the same scores, in line with the seeded CV splitter.
LRM = LogisticRegression()
DTC = DecisionTreeClassifier(random_state=10)
RFC = RandomForestClassifier(random_state=10)
KNC = KNeighborsClassifier()
NBC = GaussianNB()


# 10-fold stratified split, shuffled with a fixed seed; the same splitter is
# passed to every model so all of them are scored on identical folds.
SKF = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)


# NOTE(review): X_train comes from a scaler and a PCA that were fit on every
# row before the folds are split, so held-out rows influenced preprocessing
# and these accuracies may be optimistic. Fitting the preprocessing inside
# each fold (e.g. via sklearn.pipeline.Pipeline) would avoid that.
models_to_compare = [
    ('LogisticRegression', LRM),
    ('DecisionTreeClassifier', DTC),
    ('RandomForestClassifier', RFC),
    ('KNeighborsClassifier', KNC),
    ('GaussianNB', NBC),
]

# Report the mean cross-validated accuracy of each model as a percentage.
for model_name, model in models_to_compare:
    fold_scores = cross_val_score(model, X_train, Y, cv=SKF, scoring="accuracy")
    print(f'{model_name} : {round(fold_scores.mean() * 100, 2)}%')
