# Imports

In [None]:
import numpy as np 
import pandas as pd 
from PIL import Image
from random import randint
from zipfile import ZipFile
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans 
from math import sqrt, ceil, floor, log
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Specify File Paths

In [None]:
# Directory holding the competition inputs and the notebook's writable output directory
data_dir = "/kaggle/input/leaf-classification"
output_dir = "/kaggle/working"
# Archive names; each is joined onto data_dir when the file is opened
images_file = "images.zip"
train_file = "train.csv.zip"
test_file = "test.csv.zip"

# Read Train and Test Data

In [None]:
# Load the training table. The path ends in ".zip"; with read_csv's default
# compression="infer" pandas reads the CSV from inside the archive.
train_data = pd.read_csv(f"{data_dir}/{train_file}")
# Load the test table the same way.
test_data = pd.read_csv(f"{data_dir}/{test_file}")

# Data Description

## 1. Train Data

In [None]:
# Summary statistics of the training columns
train_data.describe()

In [None]:
# Preview the first rows of the training table
train_data.head()

In [None]:
# List the training column labels
train_data.columns

In [None]:
# Per-column standard deviations of the training features, grouped by feature
# family. describe() is evaluated once and its "std" row is sliced three times
# instead of rebuilding the whole summary for every family.
train_stds = train_data.describe().loc["std"]
margin_stds = train_stds.loc["margin1":"margin64"]
shape_stds = train_stds.loc["shape1":"shape64"]
texture_stds = train_stds.loc["texture1":"texture64"]

In [None]:
# Print the mean standard deviation of each feature family,
# in the order margin, shape, texture
for family_stds in (margin_stds, shape_stds, texture_stds):
    print(family_stds.mean())

In [None]:
# Mean standard deviation per feature family, in the order margin, shape, texture
standard_deviations_means = [margin_stds.mean(), shape_stds.mean(), texture_stds.mean()]
# Family names in the same order as the values above
features = ["margin", "shape", "texture"]

In [None]:
# Line plot of the average standard deviation of each feature family
axis_label_font = {"fontsize": 14}
plt.plot(features, standard_deviations_means, marker="o")
plt.xlabel('feature', axis_label_font)
plt.ylabel('average standard deviation', axis_label_font)

## 2. Test Data

In [None]:
# Summary statistics of the test columns
test_data.describe()

In [None]:
# Preview the first rows of the test table
test_data.head()

In [None]:
# List the test column labels
test_data.columns

# Data Preprocessing

**Utility Functions**

In [None]:
def check_nulls(data):
    """Count the missing values in every column of ``data``.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        list: one null count per column, in the order of ``data.columns``.
    """
    return [data[name].isnull().values.sum() for name in data.columns]

def check_duplicates(data):
    """Count the rows of ``data`` that repeat an earlier row exactly.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        Number of rows flagged by ``DataFrame.duplicated()``, i.e. every
        occurrence after the first of an identical row.
    """
    # Inspect the frame that was passed in, not a module-level table, so the
    # count is correct for the training and the test data alike.
    return data.duplicated().sum()

def check_data_values(data):
    """Build a per-column data-quality report for ``data``.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        DataFrame with one row per column of ``data`` and the columns
        "Column", "Null value" (result of ``check_nulls``) and
        "Duplicated rows" (result of ``check_duplicates``, repeated on
        every row).
    """
    column_names = data.columns
    null_counts = check_nulls(data)
    duplicate_count = check_duplicates(data)

    report = {
        "Column": column_names,
        "Null value": null_counts,
        "Duplicated rows": [duplicate_count for _ in column_names],
    }
    return pd.DataFrame(report)

In [None]:
# Null and duplicate report for the training table
check_data_values(train_data)

In [None]:
# Null and duplicate report for the test table
check_data_values(test_data)

# Data Visualization

**Utility Functions**

In [None]:
def plot_histogram_with_leaf_id(data, title, random_id):
    """Show a histogram of one leaf's values for a single feature family.

    Args:
        data: sequence of numeric values to bin.
        title: family name; used in the figure title and, lower-cased,
            in the x-axis label.
        random_id: leaf id shown in the figure title.
    """
    # Square-root rule: the bin count is the integer part of sqrt(len(data)).
    bin_count = int(sqrt(len(data)))

    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    axes.set_title(f"{title} Histogram of Leaf with ID {random_id}", {"fontsize": 12})
    axes.set_xlabel(f"{title.lower()} value")
    axes.set_ylabel('count')
    axes.hist(data, bins=bin_count, label="count")
    plt.show()
    
def plot_histogram_for_feature(data, title, feature):
    """Show a histogram of the values of a single named feature.

    Args:
        data: sequence of numeric values to bin.
        title: family name; used in the figure title and, lower-cased,
            in the x-axis label.
        feature: feature name shown in the figure title.
    """
    # Square-root rule: the bin count is the integer part of sqrt(len(data)).
    bin_count = int(sqrt(len(data)))

    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    axes.set_title(f"{title} Histogram of {feature}", {"fontsize": 12})
    axes.set_xlabel(f"{title.lower()} value")
    axes.set_ylabel('count')
    axes.hist(data, bins=bin_count, label="count")
    plt.show()
        
def get_leaf_columns_values(data, column_name):
    """Collect the first row's values of one feature family.

    Args:
        data: DataFrame containing the columns ``<column_name>1`` through
            ``<column_name>64``.
        column_name: family prefix, e.g. "margin".

    Returns:
        list: the first-row value of every column in the label slice from
        ``<column_name>1`` to ``<column_name>64``, in column order.
    """
    family = data.loc[:, f"{column_name}1":f"{column_name}64"]
    return [family[name].values[0] for name in family.columns]

In [None]:
# Unpack the leaf images. The handle is named `archive` so the builtin `zip`
# is not shadowed for the rest of the notebook, and the files are extracted
# into output_dir because the images are later read from
# f"{output_dir}/images/...".
with ZipFile(f"{data_dir}/{images_file}", 'r') as archive:
    archive.extractall(output_dir)

In [None]:
# Show two randomly chosen leaf images; when the id also occurs in the
# training table, follow the image with margin, texture and shape histograms.
for i in range(2):
    random_id = randint(1, 990)
    img = plt.imread(f"{output_dir}/images/{random_id}.jpg")

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # Unpacking into two names requires a 2-D image array
    x, y = img.shape
    ax.set_title(f"Leaf Image\nID:{random_id} - size:{x}x{y}\n", {"fontsize": 12})
    imgplot = plt.imshow(img)

    image_data_in_train = train_data[train_data['id'] == random_id]

    # Ids without a training row get no histograms
    if len(image_data_in_train) == 0:
        continue

    for column_prefix, family_title in (("margin", "Margin"),
                                        ("texture", "Texture"),
                                        ("shape", "Shape")):
        family_values = get_leaf_columns_values(image_data_in_train, column_prefix)
        plot_histogram_with_leaf_id(family_values, family_title, random_id)

### Relationships between First 3 Margin Values

In [None]:
# Pairwise scatter plots of the three columns at positions 2, 3 and 4
import seaborn as sns

pairplot_columns = train_data.iloc[:, 2:5]
sns.pairplot(pairplot_columns)
plt.show()

# Data Preparation

**Utility Function**

In [None]:
def split_data(train_data, labels, test_size=0.2, random_state=None):
    """Split features and targets into a training part and a hold-out part.

    Args:
        train_data: feature table to split.
        labels: targets aligned row-for-row with ``train_data``.
        test_size: forwarded to ``train_test_split``; defaults to 0.2.
        random_state: seed forwarded to ``train_test_split``. The default
            ``None`` leaves the split unseeded; pass an integer to make the
            split reproducible.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        train_data, labels, test_size=test_size, random_state=random_state
    )
    # Echo the four shapes so every split can be sanity-checked in the output
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    return X_train, X_test, y_train, y_test

In [None]:
# Encode the species names as integer class ids. One fitted encoder provides
# both the targets and the class names, so the column is encoded only once.
label_encoder = LabelEncoder().fit(train_data.species)
labels = label_encoder.transform(train_data.species)      # y (targets)
classes = list(label_encoder.classes_)                    # species names, used as submission column names

print(len(classes), classes)

In [None]:
# Keep the test ids before the "id" column is dropped from test_data
test_y = test_data["id"]

In [None]:
# Remove the target and identifier columns so only features remain
train_data = train_data.drop(columns=['species', 'id'])  # X (features)
test_data = test_data.drop(columns=['id'])

In [None]:
# Hold out 20% of the training rows. split_data performs the same
# train_test_split call and prints the four shapes, so the helper is reused
# here instead of repeating its body.
X_train, X_test, y_train, y_test = split_data(train_data, labels)

# Classification

Classifiers that will be used are:
1. Random Forests
2. Decision Tree
3. Linear Discriminant Analysis

**Utility Function**

In [None]:
def classify(classifier, train_data, test_data):
    """Fit ``classifier`` on a fresh split of ``train_data`` and score it.

    The targets are read from the module-level ``labels``; ``split_data``
    draws a new train/hold-out split on every call.

    Args:
        classifier: estimator providing ``fit``, ``predict`` and
            ``predict_proba``.
        train_data: feature table that is split, fitted on and scored.
        test_data: feature table for which class probabilities are predicted.

    Returns:
        tuple: ``(y_pred, test_pred, acc_score)`` - the hold-out predictions,
        the class probabilities for ``test_data`` and the hold-out accuracy.
    """
    fit_features, holdout_features, fit_targets, holdout_targets = split_data(train_data, labels)
    classifier.fit(fit_features, fit_targets)

    holdout_predictions = classifier.predict(holdout_features)
    holdout_accuracy = accuracy_score(holdout_targets, holdout_predictions)
    print("Accuracy >> ", holdout_accuracy)

    # Class probabilities for the competition test set
    test_probabilities = classifier.predict_proba(test_data)

    return holdout_predictions, test_probabilities, holdout_accuracy

In [None]:
# Result holders read later by the performance evaluation
# One classify() result tuple is appended per classifier
non_discretized_results = []
# Each of these is assigned a results dict once its discretizer section has run
equal_width_discretized_results = None
equal_freq_discretized_results = None
k_means_discretized_results = None

## 1. Random Forests

In [None]:
# Random forest on the non-discretized features
classifier = RandomForestClassifier(random_state=0)
pred = classify(classifier, train_data, test_data)
non_discretized_results.append(pred)

# Largest and smallest feature importance of the fitted forest
importances = classifier.feature_importances_
print("Max feature importance >>", np.amax(importances))
print("Min feature importance >>", np.amin(importances))

## 2. Decision Tree

In [None]:
# Decision tree on the non-discretized features; stored as the second entry
pred = classify(DecisionTreeClassifier(), train_data, test_data)
non_discretized_results.append(pred)

## 3. Linear Discriminant Analysis

In [None]:
# LDA on the non-discretized features; stored as the third entry
pred = classify(LinearDiscriminantAnalysis(), train_data, test_data)
non_discretized_results.append(pred)

# Feature Engineering with Discretization

Discretization is a technique for transforming numerical input or output variables into discrete ordinal labels. All feature values in this dataset are real-valued. To try to improve the accuracy of the classifiers, the columns will be discretized. The discretizers that will be used are:
1. Equal Width Discretizer
2. Equal Frequency Discretizer
3. K Means Discretizer

**Utility Functions**

In [None]:
# run every classifier on the same pair of tables
def classify_all(train_data, test_data):
    """Run the random forest, decision tree and LDA classifiers in turn.

    Args:
        train_data: feature table passed to ``classify`` for fitting/scoring.
        test_data: feature table passed to ``classify`` for prediction.

    Returns:
        tuple: three ``classify`` results, in the order random forest,
        decision tree, LDA.
    """
    models = (
        RandomForestClassifier(random_state=0),
        DecisionTreeClassifier(),
        LinearDiscriminantAnalysis(),
    )
    return tuple(classify(model, train_data, test_data) for model in models)

In [None]:
def discretize(data, strategy="uniform", bin_number=25):
    """Replace every column of ``data`` by ordinal bin codes.

    The bin edges are always learned from ``data`` itself. For the first
    three columns a before/after pair of histograms is shown.

    Args:
        data: DataFrame of numeric columns.
        strategy: ``KBinsDiscretizer`` strategy ("uniform", "quantile" or
            "kmeans").
        bin_number: number of bins requested per column; also used as the
            bin count of the histograms.

    Returns:
        DataFrame with one ``<column>_trans`` column per input column, in
        the input column order, holding the ordinal codes.
    """
    discretizer = KBinsDiscretizer(n_bins=bin_number, encode='ordinal', strategy=strategy)

    # KBinsDiscretizer derives the bin edges of each feature independently,
    # so a single fit over the whole table yields the same codes as
    # refitting the discretizer once per column.
    transformed = discretizer.fit_transform(data.to_numpy())

    # Before/after histograms for the first three columns only
    for i, column in enumerate(data.columns[:3]):
        plt.subplot(1, 2, 1)
        plt.title(f"{column}")
        plt.hist(data[column].to_numpy(), bins=bin_number)

        plt.subplot(1, 2, 2)
        plt.title(f"{column}_trans")
        # [i] (a list) keeps the (n_rows, 1) shape of a single transformed column
        plt.hist(transformed[:, [i]], bins=bin_number)
        plt.show()

    return pd.DataFrame(transformed, columns=[f"{column}_trans" for column in data.columns])

## 1. Equal Width Discretizer

In [None]:
# Candidate bin counts derived from the number of training rows
n_train_rows = len(train_data)
sqrt_bin_number_train = ceil(sqrt(n_train_rows))            # square-root choice
sturges_bin_number_train = 1 + ceil(log(n_train_rows, 2))   # Sturges' formula
avg_bin_number_train = (sqrt_bin_number_train + sturges_bin_number_train) // 2

# The same three rules applied to the number of test rows
n_test_rows = len(test_data)
sqrt_bin_number_test = ceil(sqrt(n_test_rows))              # square-root choice
sturges_bin_number_test = 1 + ceil(log(n_test_rows, 2))     # Sturges' formula
avg_bin_number_test = (sqrt_bin_number_test + sturges_bin_number_test) // 2

print(sqrt_bin_number_train, sturges_bin_number_train, avg_bin_number_train)
print(sqrt_bin_number_test, sturges_bin_number_test, avg_bin_number_test)

In [None]:
# Equal-width ("uniform") binning of the training features, one table per bin rule
discretized_train_data_sqrt_bin = discretize(
    train_data, strategy="uniform", bin_number=sqrt_bin_number_train)
discretized_train_data_sturges_bin = discretize(
    train_data, strategy="uniform", bin_number=sturges_bin_number_train)
discretized_train_data_avg_bin = discretize(
    train_data, strategy="uniform", bin_number=avg_bin_number_train)

In [None]:
# Equal-width ("uniform") binning of the test features, one table per bin rule.
# NOTE(review): the test table gets its own bin edges and its own bin counts
# (fitted on test_data, not reused from the training tables), so a given code
# need not cover the same value range in train and test - confirm this is intended.
discretized_test_data_sqrt_bin = discretize(
    test_data, strategy="uniform", bin_number=sqrt_bin_number_test)
discretized_test_data_sturges_bin = discretize(
    test_data, strategy="uniform", bin_number=sturges_bin_number_test)
discretized_test_data_avg_bin = discretize(
    test_data, strategy="uniform", bin_number=avg_bin_number_test)

In [None]:
# Preview the equal-width tables built with the averaged bin count.
# NOTE(review): a notebook cell echoes only its last expression, so the
# training preview on the first line is presumably never shown - confirm.
discretized_train_data_avg_bin.head()
discretized_test_data_avg_bin.head()

In [None]:
# Run all three classifiers on each equal-width binning, keyed by bin rule
classification_results = {
    "sqrt": classify_all(discretized_train_data_sqrt_bin, discretized_test_data_sqrt_bin),
    "sturges": classify_all(discretized_train_data_sturges_bin, discretized_test_data_sturges_bin),
    "avg": classify_all(discretized_train_data_avg_bin, discretized_test_data_avg_bin),
}

In [None]:
# Echo the equal-width results: per bin rule, a (random forest, decision tree,
# LDA) tuple whose entries are (y_pred, test_pred, acc_score)
classification_results

In [None]:
# Keep the equal-width results under their own name for the performance evaluation
equal_width_discretized_results = classification_results

## 2. Equal Frequency Discretizer

In [None]:
# Equal-frequency ("quantile") binning of the training features, one table per
# bin rule; this rebinds the names used in the equal-width section
discretized_train_data_sqrt_bin = discretize(train_data, strategy="quantile", bin_number=sqrt_bin_number_train)
discretized_train_data_sturges_bin = discretize(train_data, strategy="quantile", bin_number=sturges_bin_number_train)
discretized_train_data_avg_bin = discretize(train_data, strategy="quantile", bin_number=avg_bin_number_train)

In [None]:
# Equal-frequency ("quantile") binning of the test features, using the bin
# counts computed from the test row count
discretized_test_data_sqrt_bin = discretize(test_data, strategy="quantile", bin_number=sqrt_bin_number_test)
discretized_test_data_sturges_bin = discretize(test_data, strategy="quantile", bin_number=sturges_bin_number_test)
discretized_test_data_avg_bin = discretize(test_data, strategy="quantile", bin_number=avg_bin_number_test)

In [None]:
# Preview the equal-frequency tables built with the averaged bin count.
# NOTE(review): a notebook cell echoes only its last expression, so the
# training preview on the first line is presumably never shown - confirm.
discretized_train_data_avg_bin.head()
discretized_test_data_avg_bin.head()

In [None]:
# Run all three classifiers on each equal-frequency binning, keyed by bin rule
classification_results = {
    "sqrt": classify_all(discretized_train_data_sqrt_bin, discretized_test_data_sqrt_bin),
    "sturges": classify_all(discretized_train_data_sturges_bin, discretized_test_data_sturges_bin),
    "avg": classify_all(discretized_train_data_avg_bin, discretized_test_data_avg_bin),
}

In [None]:
# Echo the equal-frequency results: per bin rule, a (random forest, decision
# tree, LDA) tuple whose entries are (y_pred, test_pred, acc_score)
classification_results

In [None]:
# Keep the equal-frequency results under their own name for the performance evaluation
equal_freq_discretized_results = classification_results

## 3. K Means Discretizer

In [None]:
# K-means binning of the training features, one table per bin rule; this
# rebinds the names used in the previous discretizer sections
discretized_train_data_sqrt_bin = discretize(train_data, strategy="kmeans", bin_number=sqrt_bin_number_train)
discretized_train_data_sturges_bin = discretize(train_data, strategy="kmeans", bin_number=sturges_bin_number_train)
discretized_train_data_avg_bin = discretize(train_data, strategy="kmeans", bin_number=avg_bin_number_train)

In [None]:
# K-means binning of the test features, using the bin counts computed from
# the test row count
discretized_test_data_sqrt_bin = discretize(test_data, strategy="kmeans", bin_number=sqrt_bin_number_test)
discretized_test_data_sturges_bin = discretize(test_data, strategy="kmeans", bin_number=sturges_bin_number_test)
discretized_test_data_avg_bin = discretize(test_data, strategy="kmeans", bin_number=avg_bin_number_test)

In [None]:
# Preview the k-means tables built with the averaged bin count.
# NOTE(review): a notebook cell echoes only its last expression, so the
# training preview on the first line is presumably never shown - confirm.
discretized_train_data_avg_bin.head()
discretized_test_data_avg_bin.head()

In [None]:
# Run all three classifiers on each k-means binning, keyed by bin rule
classification_results = {
    "sqrt": classify_all(discretized_train_data_sqrt_bin, discretized_test_data_sqrt_bin),
    "sturges": classify_all(discretized_train_data_sturges_bin, discretized_test_data_sturges_bin),
    "avg": classify_all(discretized_train_data_avg_bin, discretized_test_data_avg_bin),
}

In [None]:
# Echo the k-means results: per bin rule, a (random forest, decision tree,
# LDA) tuple whose entries are (y_pred, test_pred, acc_score)
classification_results

In [None]:
# Keep the k-means results under their own name for the performance evaluation
k_means_discretized_results = classification_results

# Performance Evaluation

**Utility Functions**

In [None]:
# write each bar's value on top of the bar
def autolabel(ax, rects, values):
    """Annotate every bar in ``rects`` with the matching entry of ``values``.

    Args:
        ax: axes object whose ``annotate`` method receives the labels.
        rects: bars providing ``get_height``, ``get_x`` and ``get_width``.
        values: numbers indexed like ``rects``; shown with two decimals.
    """
    for position, bar in enumerate(rects):
        top = bar.get_height()
        center_x = bar.get_x() + bar.get_width() / 2
        ax.annotate(f"{values[position]:.2f}",
                    xy=(center_x, top),
                    xytext=(0, 0), textcoords="offset points",
                    ha='center', va='bottom')
   
# plot grouped bar chart
def plot_acc_score_graph(accuracy_scores, train_bin_num, test_bin_num):
    """Draw a grouped bar chart of accuracies per classifier and data variant.

    Args:
        accuracy_scores: iterable of ``(scores, label)`` pairs. ``scores``
            holds one accuracy per classifier, in the order Random Forests,
            Decision Trees, LDA; ``label`` names the series in the legend.
        train_bin_num: training bin count shown in the title.
        test_bin_num: test bin count shown in the title.
    """
    # Local names deliberately differ from the module-level `labels` array
    # and the imported `accuracy_score` function.
    classifier_names = ['Random Forests', 'Decision Trees', 'LDA']
    series = list(accuracy_scores)

    tick_positions = np.arange(len(classifier_names))  # the label locations
    width = 0.2125  # the width of the bars

    fig, ax = plt.subplots()

    # Two adjacent literals keep source indentation out of the second title line
    title = ('Accuracy Scores by Classifier and Discretizer\n'
             f'train data bin number = {train_bin_num} - test data bin number = {test_bin_num}')
    ax.set_ylabel('Accuracy Scores')
    ax.set_title(title)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(classifier_names)

    # Centre each group on its tick: with k series the first bar sits
    # (k - 1) / 2 bar widths to the left (3 * width / 2 for four series).
    first_offset = (len(series) - 1) * width / 2
    for series_index, (scores, series_label) in enumerate(series):
        bar_positions = tick_positions - first_offset + series_index * width
        bars = ax.bar(bar_positions, scores, width, label=series_label)
        autolabel(ax, bars, scores)

    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

## Performance with Square Root Choice

In [None]:
# Accuracy of every classifier without discretization and with each
# discretizer, using the square-root bin counts
accuracy_scores = [
    ([result[2] for result in results], series_label)
    for results, series_label in (
        (non_discretized_results, 'Non Discretized'),
        (equal_width_discretized_results["sqrt"], 'Equal Width'),
        (equal_freq_discretized_results["sqrt"], 'Equal Freq'),
        (k_means_discretized_results["sqrt"], 'K Means'),
    )
]
plot_acc_score_graph(accuracy_scores, sqrt_bin_number_train, sqrt_bin_number_test)

## Performance with Sturges' Formula

In [None]:
# Accuracy of every classifier without discretization and with each
# discretizer, using the Sturges bin counts
accuracy_scores = [
    ([result[2] for result in results], series_label)
    for results, series_label in (
        (non_discretized_results, 'Non Discretized'),
        (equal_width_discretized_results["sturges"], 'Equal Width'),
        (equal_freq_discretized_results["sturges"], 'Equal Freq'),
        (k_means_discretized_results["sturges"], 'K Means'),
    )
]
plot_acc_score_graph(accuracy_scores, sturges_bin_number_train, sturges_bin_number_test)

## Performance with Average Value

In [None]:
# Accuracy of every classifier without discretization and with each
# discretizer, using the averaged bin counts
accuracy_scores = [
    ([result[2] for result in results], series_label)
    for results, series_label in (
        (non_discretized_results, 'Non Discretized'),
        (equal_width_discretized_results["avg"], 'Equal Width'),
        (equal_freq_discretized_results["avg"], 'Equal Freq'),
        (k_means_discretized_results["avg"], 'K Means'),
    )
]
plot_acc_score_graph(accuracy_scores, avg_bin_number_train, avg_bin_number_test)

# Predict Test Data

In [None]:
# Take the random-forest entry (index 0 of the classify_all tuple) obtained
# with k-means discretization and the averaged bin count.
# NOTE(review): this choice is hard-coded rather than derived from the
# accuracies computed above - confirm it is still the best run.
y_pred, test_pred, acc_score = k_means_discretized_results["avg"][0]

In [None]:
# Echo the predicted class probabilities for the test set
test_pred

In [None]:
# Submission frame: one probability column per species name, with the test
# ids inserted as the first column
submission = pd.DataFrame(test_pred, columns=classes)
submission.insert(0, 'id', test_y)
# NOTE(review): reset_index() returns a new frame and leaves `submission`
# unchanged; the result is only echoed as the cell output.
submission.reset_index()

In [None]:
# Write the submission to submission.csv (relative path) without the index
# column, then preview its first rows
submission.to_csv('submission.csv', index = False)
submission.head()