In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Location of the penguins CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/clustering-penguins-species/penguins.csv'

df = pd.read_csv(DATA_PATH)
df

In [None]:
# Print pandas' concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Show the summary statistics produced by DataFrame.describe().
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Count the rows that pandas flags as duplicates of another row.
df.duplicated().sum()

In [None]:
# Discard every row that contains a missing value, then confirm none remain.
df = df.dropna()
df.isnull().sum()

In [None]:
# Keep a single copy of any repeated row.
df = df.drop_duplicates()

In [None]:
def countplot(data, hue=None, column=None, order=None, order_hue=None, saturation=1.0, label_dict=None):
    """
    Draw a seaborn count plot for one categorical column and annotate each
    bar with its count, then show the figure.

    Parameters:
    - data: DataFrame
        The input DataFrame.
    - hue: str, optional
        Variable in data to map plot aspects to different colors.
    - column: str
        Categorical variable plotted on the x-axis. Required, even though the
        signature defaults it to None.
    - order: list, optional
        Order to plot the categorical levels on the x-axis.
    - order_hue: list, optional
        Order to plot the categorical levels in the hue variable.
    - saturation: float, optional
        Proportion to desaturate the plot (forwarded to seaborn.countplot).
    - label_dict: dict, optional
        Dictionary to map values in ``order`` to custom tick labels. Only used
        when ``order`` is also given; values missing from the dictionary fall
        back to the raw value.

    Raises:
    - ValueError
        If ``column`` is not provided.
    """
    import math

    # Validate before touching matplotlib so a bad call does not leave an
    # empty figure behind.
    if not column:
        raise ValueError("Specify the column to plot via the 'column' argument")

    plt.figure(figsize=(10, 6))
    plt.title(f'Countplot graph for {column} ')
    ax = sns.countplot(
        x=column,
        hue=hue,
        data=data,
        order=order,
        hue_order=order_hue,
        saturation=saturation,
    )

    if order is not None and label_dict is not None:
        # Tick i corresponds to the i-th entry of ``order``.
        tick_labels = [f' {i}: {label_dict.get(val, val)}' for i, val in enumerate(order)]
        plt.xticks(ticks=range(len(order)), labels=tick_labels)

    # Annotate each bar with its count.
    for p in ax.patches:
        height = p.get_height()
        if not math.isfinite(height):
            # int() would raise on NaN/inf; there is nothing meaningful to label.
            continue
        # Bar heights are floats; counts are whole numbers, so print them as ints.
        ax.text(
            p.get_x() + p.get_width() / 2.,
            height,
            f'{int(round(height))}',
            ha="center",
            va="bottom",
        )

    plt.show()

In [None]:
# Frequency of each recorded value in the 'sex' column.
df['sex'].value_counts()

In [None]:
# Remove the rows whose 'sex' value is the placeholder '.'.
invalid_sex_rows = df.index[df['sex'] == '.']
df = df.drop(index=invalid_sex_rows)
df['sex'].value_counts()

In [None]:
# Bar chart of how many rows fall into each 'sex' category.
countplot(df, column='sex')

In [None]:
# Names of all columns whose dtype pandas considers numeric.
numeric_col = [
    name for name in df.columns
    if pd.api.types.is_numeric_dtype(df[name])
]
numeric_col

In [None]:
# One histogram (with a KDE overlay) per numeric column, each in its own figure.
for col in numeric_col:
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.histplot(df[col], kde=True, bins=10, color='skyblue', ax=ax)
    ax.set_title(f'Histogram Plot: {col} ')

    fig.tight_layout()
    plt.show()

In [None]:
# Pairwise relationships, colored by 'sex':
# histograms on the diagonal, scatter plots everywhere else.
g = sns.PairGrid(data=df, hue="sex")
g.map_diag(plt.hist).map_offdiag(plt.scatter)
g.add_legend()

In [None]:
# Box plots of the frame's columns on one shared axes, to spot extreme values.
fig, ax = plt.subplots(figsize=(12, 10))
df.boxplot(ax=ax)

In [None]:
# Show the rows whose flipper length is implausibly large or negative.
flipper = df["flipper_length_mm"]
print(df.loc[flipper > 4000])
print(df.loc[flipper < 0])

In [None]:
# Remove the implausible flipper measurements (> 4000 mm or negative), using
# the same conditions as the inspection cell above.
# Selecting by condition instead of the hard-coded index labels [9, 14] keeps
# this cell re-runnable (dropping already-removed labels raises KeyError) and
# independent of row order in the CSV.
# NOTE(review): presumably labels 9 and 14 are exactly the rows these
# conditions match - confirm against the printed output of the previous cell.
flipper_outliers = df.index[
    (df["flipper_length_mm"] > 4000) | (df["flipper_length_mm"] < 0)
]
df = df.drop(index=flipper_outliers)

In [None]:
# Display the frame as it stands after the row removals above.
df

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
# Encoder instance; fitted on the 'sex' column in the next cell.
le=LabelEncoder()

In [None]:
# Replace the 'sex' labels with the integer codes produced by the encoder.
sex_codes = le.fit_transform(df['sex'])
df = df.assign(sex=sex_codes)

In [None]:
# Standardize every column, then wrap the resulting array back into a
# DataFrame that carries the original column names.
scaler = StandardScaler()
scaler.fit(df)
X = scaler.transform(df)

new_df = pd.DataFrame(X, columns=df.columns)
new_df.head(10)

In [None]:
# Fit a PCA with all components to see how much variance each one explains.
df_pca = PCA(n_components=None).fit(new_df)

# Keep only the components that individually explain more than 10% of the variance.
n_components = (df_pca.explained_variance_ratio_ > 0.1).sum()

# Refit with the reduced component count and project the data.
pca = PCA(n_components=n_components)
new_df_pca = pca.fit_transform(new_df)
print(n_components)

In [None]:
# Elbow method: record the K-means inertia for k = 1..9 on the PCA projection.
cluster_counts = range(1, 10)
sse = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(new_df_pca)
    sse.append(kmeans.inertia_)

plt.plot(cluster_counts, sse, marker="x")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.title("Elbow Method")
plt.show()

In [None]:
# Cluster count chosen for the final model.
print('Number of cluster would be 4 ')

In [None]:
# Inertia values collected by the elbow loop, one per k in range(1, 10).
sse

In [None]:
# Final model: 4 clusters, visualized on the first two principal components.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(new_df_pca)

first_pc = new_df_pca[:, 0]
second_pc = new_df_pca[:, 1]
plt.scatter(first_pc, second_pc, c=kmeans.labels_, cmap="viridis")
plt.xlabel("First Principal Component")
plt.ylabel("Second Principal Component")
plt.title("K-means Clustering with 4 clusters")
plt.show()