In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from ChernoffFace import *
import numpy
import matplotlib.cm
import seaborn as sns
from matplotlib.lines import Line2D
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
!python -m pip install ChernoffFace

In [None]:
# Download the Palmer penguins CSV to /tmp (shell escape: needs the `wget` tool).
!wget -q https://storage.googleapis.com/download.tensorflow.org/data/palmer_penguins/penguins.csv -O /tmp/penguins.csv
# Load the downloaded file into a DataFrame and preview its first rows.
df = pd.read_csv("/tmp/penguins.csv")
df.head()

In [None]:
# Keep only complete observations: every row holding at least one NaN is removed.
df = df.dropna(axis=0, how="any")
df.head()

In [None]:
# Replace the categorical `island` column with integer codes.
label_encoder = LabelEncoder()
df = df.assign(island=label_encoder.fit_transform(df["island"]))
df.head()

In [None]:
# Feature matrix: columns 1..5 of the frame (the same columns that later cells
# label with `df.columns[1:6]`).
# `df.values` on a frame with mixed column types returns an object-dtype array,
# so the numeric block is selected first and cast to float explicitly.
X = df.iloc[:, 1:6].to_numpy(dtype=float)

# Target: the first column, encoded as integers.
# LabelEncoder assigns codes following the sorted order of the labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df.iloc[:, 0])

In [None]:
def rescale(x):
    """Min-max scale a 1-D collection of numbers to the [0, 1] interval.

    Parameters
    ----------
    x : array-like
        Numeric values (for example a pandas Series or a NumPy array).

    Returns
    -------
    array-like
        ``(x - min(x)) / (max(x) - min(x))``, in whatever container the
        subtraction ``x - np.min(x)`` produces. When every value is identical
        the range is zero; the result is then all zeros rather than the NaN
        values a 0/0 division would give.
    """
    shifted = x - np.min(x)
    span = np.ptp(x)
    if span == 0:
        # Constant input: `shifted` is already all zeros, so divide by one
        # instead of by the zero range.
        span = 1
    return shifted / span

# Per-species mean of each body measurement, then min-max scaled column by
# column so that every variable spans [0, 1] across the species.
# The aggregation is given as the string "mean" instead of `np.mean`: recent
# pandas versions emit a FutureWarning when a NumPy callable is passed to
# groupby.agg, and recommend the string to keep the current behaviour.
penguins_radar = (
    df.groupby("species")
    .agg(
        avg_bill_length=("bill_length_mm", "mean"),
        avg_bill_depth=("bill_depth_mm", "mean"),
        avg_flipper_length=("flipper_length_mm", "mean"),
        avg_body_mass=("body_mass_g", "mean"),
    )
    .apply(rescale)
    .reset_index()
)

# Colour palette ---------------------------------------------------
BG_WHITE = "#FFFFFF"
BLUE = "#2a475e"
GREY70 = "#b3b3b3"
GREY_LIGHT = "#f2efe8"
COLORS = ["#FF5A5F", "#FFB400", "#007A87"]

# Species names, in the row order of the aggregated frame
SPECIES = penguins_radar["species"].tolist()

# Every column after the leading "species" one is a plotted variable
VARIABLES = list(penguins_radar.columns[1:])
VARIABLES_N = len(VARIABLES)

# One angle per variable, evenly spread over the full circle;
# the first angle is repeated at the end so each polygon closes on itself.
ANGLES = [n / VARIABLES_N * 2 * np.pi for n in range(VARIABLES_N)]
ANGLES.append(ANGLES[0])

# Padding (in points) used to position the angular tick labels
X_VERTICAL_TICK_PADDING = 5
X_HORIZONTAL_TICK_PADDING = 50

# Angles sweeping the whole circle, from 0 to 2*pi
HANGLES = np.linspace(0, 2 * np.pi)

# Constant-radius rings (0, 0.5 and 1): the polar counterpart of horizontal
# grid lines. The outermost ring is also filled to act as a background.
H0 = np.zeros_like(HANGLES)
H1 = np.full_like(HANGLES, 0.5)
H2 = np.ones_like(HANGLES)

# Initialize layout ----------------------------------------------
fig, ax = plt.subplots(figsize=(14, 10), subplot_kw={"polar": True})

fig.patch.set_facecolor(BG_WHITE)
ax.set_facecolor(BG_WHITE)

# Put angle 0 at the top of the plot and run angles clockwise,
# so the first variable (avg_bill_length) sits at the top.
ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)

# A slightly negative lower radial limit keeps values equal to 0
# (the per-variable minimums) from collapsing onto the centre.
ax.set_ylim(-0.1, 1.05)

# Plot lines and dots --------------------------------------------
for position, name in enumerate(SPECIES):
    color = COLORS[position]
    radii = penguins_radar.iloc[position].drop("species").tolist()
    radii.append(radii[0])  # repeat the first value to close the polygon
    ax.plot(ANGLES, radii, c=color, linewidth=4, label=name)
    ax.scatter(ANGLES, radii, s=160, c=color, zorder=10)

# Angular axis (x): one tick per variable, labelled with the variable name
ax.set_xticks(ANGLES[:-1])
ax.set_xticklabels(VARIABLES, size=14)

# Radial axis (y): no ticks, and no default grid on either axis
ax.set_yticks([])
for axis in (ax.yaxis, ax.xaxis):
    axis.grid(False)

# Hide the spines
for spine_name in ("start", "polar"):
    ax.spines[spine_name].set_color("none")

# Dashed rings standing in for the radial grid at 0, 0.5 and 1.
for ring, ring_color in ((H0, GREY70), (H1, COLORS[2]), (H2, GREY70)):
    ax.plot(HANGLES, ring, ls=(0, (6, 6)), c=ring_color)

# Fill the disc of radius 1 to get a light grey background.
ax.fill(HANGLES, H2, GREY_LIGHT)

# Straight guides for the angular axis, drawn from the centre (radius 0)
# out to the ring of radius 1, one per plotted direction.
for theta in (0, np.pi, np.pi / 2, -np.pi / 2):
    ax.plot([theta, theta], [0, 1], lw=2, c=GREY70)

# Add levels -----------------------------------------------------
# Text labels naming the three radial levels.
PAD = 0.05
for level, caption in ((0, "0%"), (0.5, "50%"), (1, "100%")):
    ax.text(-0.4, level + PAD, caption, size=16, fontname="Roboto")

# Create and add legends -----------------------------------------
# The legend entries are built by hand: one proxy artist per species,
# showing both the line and the marker in that species' colour.
handles = []
for legend_label, legend_color in zip(SPECIES, COLORS):
    handles.append(
        Line2D([], [], c=legend_color, lw=3, marker="o", markersize=8, label=legend_label)
    )

legend = ax.legend(
    handles=handles,
    loc=(1, 0),        # bottom-right corner of the axes
    labelspacing=1.5,  # extra vertical space between entries
    frameon=False,     # no surrounding box
)

# Restyle every legend label
for legend_text in legend.get_texts():
    legend_text.set_fontname("Roboto")
    legend_text.set_fontsize(16)

# Adjust tick label positions ------------------------------------
# Even-numbered ticks get the small padding, odd-numbered ticks the large one.
XTICKS = ax.xaxis.get_major_ticks()
for position, tick in enumerate(XTICKS):
    if position % 2 == 0:
        tick.set_pad(X_VERTICAL_TICK_PADDING)
    else:
        tick.set_pad(X_HORIZONTAL_TICK_PADDING)

# Add title ------------------------------------------------------
fig.suptitle(
    "Radar Plot of Penguin Species",
    x=0.1,
    y=1,
    ha="left",
    fontsize=32,
    fontname="Lobster Two",
    color=BLUE,
    weight="bold",
)

In [None]:
# Pairwise relationships between the columns of the frame, coloured by species.
sns.set_theme(style="ticks")
sns.pairplot(data=df, hue="species")

In [None]:
# Chernoff-face visualisation of the feature matrix.
# NOTE(review): `variables_rescale` and `chernoff_face` are not defined in this
# notebook; presumably they come from `from ChernoffFace import *` - confirm.
# Rescale the features before drawing (exact scaling is defined by the helper).
dfData2 = variables_rescale(X)

# Build the figure from the rescaled data, arranged in 5 columns and coloured
# with matplotlib's `tab20b` colormap.
fig = chernoff_face(data=dfData2,
                    n_columns=5,
                    long_face=False,
                    color_mapper=matplotlib.cm.tab20b,
                    figsize=(8, 8), dpi=200)

# Display
fig.tight_layout()
matplotlib.pyplot.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Agglomerative clustering of the feature matrix with Ward linkage.
# NOTE(review): X is clustered as-is here; standardisation only happens in the
# PCA cell further down.
linkage_matrix = linkage(X, method="ward")

# Dendrogram with every leaf labelled by the species of its observation.
plt.figure(figsize=(15, 10))
dendrogram(
    linkage_matrix,
    labels=df["species"].to_numpy(),
    leaf_rotation=90,
)
plt.gca().set_title("Hierarchical clustering Dendogram")
plt.gca().set_xlabel("Species")
plt.gca().set_ylabel("Distance")
plt.show()

In [None]:
# Standardise the features, then project them onto 4 principal components.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X)

pca = PCA(n_components=4)
x_pca = pca.fit_transform(scaled_data)

In [None]:
# Share of the total variance explained by each principal component.
print('\nOpisywana zmienność: ', pca.explained_variance_ratio_)

# Number the bars from the fitted result instead of a hard-coded 1..4 range,
# so the plot stays correct if `n_components` is changed.
component_numbers = np.arange(1, len(pca.explained_variance_ratio_) + 1)
sns.barplot(x=component_numbers, y=pca.explained_variance_ratio_, color='blue')
plt.xlabel('Principal component')
plt.ylabel('Explained variance ratio')
plt.show()

In [None]:
# Results - projection onto PC1/PC2, coloured by species
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=x_pca[:, 0],
    y=x_pca[:, 1],
    hue=df["species"],
    palette="pastel",
)
plt.gca().set_xlabel("PC1")
plt.gca().set_ylabel("PC2")
plt.legend()
plt.show()

In [None]:
# Results - projection onto PC2/PC3, coloured by species
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=x_pca[:, 1],
    y=x_pca[:, 2],
    hue=df["species"],
    palette="pastel",
)
plt.gca().set_xlabel("PC2")
plt.gca().set_ylabel("PC3")
plt.legend()
plt.show()

In [None]:
# Results - projection onto PC1/PC3, coloured by species
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=x_pca[:, 0],
    y=x_pca[:, 2],
    hue=df["species"],
    palette="pastel",
)
plt.gca().set_xlabel("PC1")
plt.gca().set_ylabel("PC3")
plt.legend()
plt.show()

In [None]:
# Classifier definition.
# A fixed `random_state` makes the tree reproducible: scikit-learn permutes the
# candidate features at every split, so without a seed the scores (and the tree
# fitted in the next cell) can change between runs.
clf = tree.DecisionTreeClassifier(random_state=0)

# 5-fold cross-validation on the full feature matrix.
scores = cross_val_score(clf, X, y, cv=5)
print('Accuracy (mean): ', scores.mean())

In [None]:
# Train the classifier on the whole data set.
clf = clf.fit(X, y)

# Adjust figsize / fontsize as needed for readability.
plt.figure(figsize=(16, 7))
tree.plot_tree(
    clf,
    fontsize=7,
    filled=True,
    # Plain lists: some scikit-learn releases reject a pandas Index here.
    feature_names=list(df.columns[1:6]),
    # plot_tree expects class names in ascending order of the class codes.
    # `y` was produced by LabelEncoder, which numbers labels in sorted order,
    # so the names must be sorted too; `unique()` alone returns them in order
    # of first appearance and can mislabel the leaves.
    class_names=sorted(df["species"].unique()),
)
plt.savefig('DTiris.png')  # save the figure to a file
plt.show()

In [None]:
# Feature importance analysis
print('\nFeature importances')
importances = pd.Series(clf.feature_importances_, index=df.columns[1:6])
print(importances)

# Horizontal bar chart of the (up to) five most important features
importances.nlargest(5).plot.barh()
plt.xlabel('Relative importance')
plt.show()

In [None]:
# Hold out 30% of the observations as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Classifier object - the number of trees and the other hyperparameters are
# candidates for tuning. `fit` returns the estimator itself, so construction
# and training are chained.
clf = RandomForestClassifier(n_estimators=10, random_state=10).fit(X_train, y_train)

# Accuracy on the test set
y_test_pred = clf.predict(X_test)
print('\nAccuracy (test):', accuracy_score(y_test, y_test_pred))

In [None]:
# Feature importance analysis
print('\nFeature importances')
importances = pd.Series(clf.feature_importances_, index=df.columns[1:6])
print(importances)

# Horizontal bar chart of the (up to) five most important features
importances.nlargest(5).plot.barh()
plt.xlabel('Relative importance')
plt.show()

In [None]:
# Confusion matrix followed by the classification report for the test set
for make_report in (confusion_matrix, classification_report):
    print(make_report(y_test, y_test_pred))

In [None]:
import matplotlib.pyplot as plt

from sklearn import datasets, svm
from sklearn.inspection import DecisionBoundaryDisplay

# Two features for the 2-D decision boundaries: df.columns[2] and
# df.columns[3], i.e. the columns the slice X[:, 1:3] refers to.
# They are stored under a new name instead of overwriting `X`: reassigning `X`
# made this cell non-idempotent (a second run sliced the already-sliced
# matrix) and broke re-running the earlier cells that need the full matrix.
svm_feature_names = list(df.columns[2:4])
X_svm = df[svm_feature_names].to_numpy(dtype=float)

# Create the SVM instances and fit them. The data is deliberately left
# unscaled so the boundaries are drawn in the original feature units.
C = 1.0  # SVM regularization parameter
models = (
    svm.SVC(kernel="linear", C=C),
    svm.LinearSVC(C=C, max_iter=10000),
    svm.SVC(kernel="rbf", gamma=0.7, C=C),
    svm.SVC(kernel="poly", degree=3, gamma="auto", C=C),
)
models = [model.fit(X_svm, y) for model in models]

# Titles for the plots, in the same order as `models`
titles = (
    "SVC with linear kernel",
    "LinearSVC (linear kernel)",
    "SVC with RBF kernel",
    "SVC with polynomial (degree 3) kernel",
)

# Set up a 2x2 grid for plotting.
fig, sub = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

X0, X1 = X_svm[:, 0], X_svm[:, 1]

# Dedicated loop names (`model`, `panel`) keep the loop from rebinding `clf`
# (the random forest fitted above) and `ax`.
for model, title, panel in zip(models, titles, sub.flatten()):
    disp = DecisionBoundaryDisplay.from_estimator(
        model,
        X_svm,
        response_method="predict",
        cmap=plt.cm.coolwarm,
        alpha=0.8,
        ax=panel,
        xlabel=svm_feature_names[0],
        ylabel=svm_feature_names[1],
    )
    # Overlay the observations, coloured by their encoded class.
    panel.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors="k")
    panel.set_xticks(())
    panel.set_yticks(())
    panel.set_title(title)

plt.show()