In [59]:
!pip install Pillow



In [1]:
# Laden relevante Bibliotheken
import pandas as pd
import numpy as np
import glob
import PIL
from PIL import Image

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier


# Preprocessing

Uns liegen Audiodaten und Ihre entsprechenden Spektogrammen vor. Wir müssen also die Bilder/Spektogramme in ein Format konvertieren, das von unseren Modellen verstanden werden kann.

Beschaffung der eigentlichen Bildarrays; dies erzeugt eine sehr hochdimensionale Featuremenge. Das führt aber eine Vrschlechtung der Accurancy des Models.

### Konverierung der Bilder auf Matricies

In [2]:
data_folder = ""
data = pd.read_csv(f'{data_folder}data.csv', sep=";")
data.drop_duplicates(inplace=True)
data.head()

Unnamed: 0,Filename,ID,Species
0,1504695082.jpg,1504695082,Bechsteinfledermaus
1,1504714350.jpg,1504714350,Bechsteinfledermaus
2,1504715079.jpg,1504715079,Bechsteinfledermaus
3,1504716272.jpg,1504716272,Bechsteinfledermaus
4,1504716318.jpg,1504716318,Bechsteinfledermaus


In [None]:
nx = 288
ny = 432
nrgb = 4

# verwende glob zu parsen alle png Files
filenames = glob.glob(f'images/*.png')
# Liste erstellen, die die Bild-Matricies enthält
arr_list = []
# Liste erstellen, die die File-Namen enthält
names = []

for f in filenames:
    
    # den File Name z.B 1509260730 nehmen
    name = f[22:-4]
    names.append(name)
    
    # Bild zu einem Vektor vorverarbeiten
    image = Image.open(f)
    arr = np.asarray(image)
    arr = arr/255.
    arr = arr.reshape((1,nx*ny*nrgb))
    arr_list.append(arr)
    
df = pd.DataFrame(np.concatenate(arr_list))
df['ID'] = names
df['ID'] = df['ID'].astype(int)

In [30]:
# Die Dataframes verbinden, um die Klassenbezeichnungen zu erhalten
merged_data = pd.merge(df, data[['ID', 'Species']], on='ID', how='left')
merged_data.set_index('ID', inplace=True)
merged_data.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,497655,497656,497657,497658,497659,497660,497661,497662,497663,Species
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1504695082,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,Bechsteinfledermaus
1504714350,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,Bechsteinfledermaus
1504715079,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,Bechsteinfledermaus
1504716272,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,Bechsteinfledermaus
1504716318,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,Bechsteinfledermaus


### Train a classifier

In [63]:
# Ermitteln der Labels
data_y = merged_data.Species 
# Ermitteln der Features
data_x = merged_data.drop(['Species'], axis=1)

# train-test split
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, shuffle='True')

In [64]:
clf = DecisionTreeClassifier(max_depth=10,criterion='gini', random_state=100) 


clf.fit(x_train,y_train)


DecisionTreeClassifier(max_depth=10, random_state=100)

In [65]:
y_pred = clf.predict(x_test)

In [66]:
acc = accuracy_score(y_pred, y_test)
print(f"Test set accuracy: {acc}")

Test set accuracy: 0.3162393162393162


### Hyperparameter tuning

max depth is a hyperparameter of the Decision Tree that we can tune using cross validation, others can be max_leaf_nodes, max_features etc..

We will evaluate our decision tree classifier on various max_depth values and choose the best value

In [67]:
depths = [i for i in range(5,10)] + [None]


parameters = {
    'max_depth' : depths, 
}

decision_tree = DecisionTreeClassifier()

tuned_clf = GridSearchCV(decision_tree,                    # Modell
                         param_grid = parameters,   # Hyperparameter
                         scoring='accuracy',        # Accuracy
                         cv=3)                     # Folds-Anzahl


tuned_clf.fit(x_train,y_train)
print("Tuned Hyperparameters :", tuned_clf.best_params_)
print("Accuracy :",tuned_clf.best_score_)

Tuned Hyperparameters : {'max_depth': 6}
Accuracy : 0.42795698924731185


In [43]:
# Erneute Überprüfung der Ergebnisse auf der Testmenge unter Verwendung des besten Hyperparameters. 

best_clf = DecisionTreeClassifier(max_depth=tuned_clf.best_params_["max_depth"]) 


clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
acc = accuracy_score(y_pred, y_test)
print(f"Test set accuracy: {acc}")

Test set accuracy: 0.36752136752136755


In [37]:
# Now repeat the same process using a Random Forest
# Hint some hyperparameters that might interest you are max_depth, n_estimators, max_features
# Check the documentation here https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier

Note that when you run the results again you will get different results if you do not set the random_state in the classifiers and in the train_test_split or anywhere else that there is randomness

In [69]:
#repeat using a Random Forest


In [70]:
clf = RandomForestClassifier(max_depth=10,criterion='gini', random_state=100) 

# fit the classifier
clf.fit(x_train,y_train)


RandomForestClassifier(max_depth=10, random_state=100)

In [71]:
y_pred = clf.predict(x_test)

In [72]:
acc = accuracy_score(y_pred, y_test)
print(f"Test set accuracy: {acc}")

Test set accuracy: 0.5042735042735043


In [None]:
depths = [i for i in range(5,10)] + [None]

# you can add more hyperparams in the parameters dictionary
parameters = {
    'max_depth' : depths, 
}

decision_tree = RandomForestClassifier()

tuned_clf = GridSearchCV(decision_tree,                    # model
                         param_grid = parameters,   # hyperparameters
                         scoring='accuracy',        # metric for scoring
                         cv=3)                     # number of folds


tuned_clf.fit(x_train,y_train)
print("Tuned Hyperparameters :", tuned_clf.best_params_)
print("Accuracy :",tuned_clf.best_score_)

In [54]:
best_clf = RandomForestClassifier(max_depth=tuned_clf.best_params_["max_depth"]) 

# fit the classifier
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
acc = accuracy_score(y_pred, y_test)
print(f"Test set accuracy: {acc}")

Test set accuracy: 0.5470085470085471
