# Exercise and Feature Engineering II

In the first two parts we learned a lot about the general ML workflow. We achieved quite reasonable results for our classification task. We already performed some feature engineering and applied a transformation to an alternative representation of the RGB color model.

As mentioned before, deriving a meaningful and valid ML model has to be an iterative process. Directly trying to improve the ML model by hyperparameter tuning is often not the first choice. So let’s go back to the feature engineering part.

Right now we use single color measurements of a marble to classify it. An easy way to improve the performance is to combine several measurements for each marble. In our example we will implement this by creating a tuple out of single measurements. Thereby, we will `stack` single samples into a single event.

In [None]:
import os
import numpy as np
import pandas as pd
import zipfile
import matplotlib.pyplot as plt

from IPython.display import display, clear_output, Markdown

## Data Import, Preparation, Feature Engineering I

In [None]:
def parse_lines(lines):
    """Convert the raw text of a marble data file into a DataFrame.

    The first two and the last two characters of ``lines`` are discarded,
    the remainder is cut into records at every ``'), ('`` and each record
    into fields at every ``', '``. Any ``')][('`` sequence inside a field is
    removed before the field is converted to ``int``.

    Returns a DataFrame holding only the first three fields of every record
    (columns ``0``, ``1`` and ``2``).
    """
    body = lines[2:-2]

    data = []
    for record in body.split('), ('):
        fields = record.split(', ')
        data.append([int(field.replace(')][(', '')) for field in fields])

    return pd.DataFrame(data)[[0, 1, 2]]

# One zipped data file per marble type
files = [
    'blue-white-glass.data',
    'cyan-glass.data',
    'glass-blue.data',
    'glass-green.data',
    'glass-red.data',
    'glass-yellow.data',
    'planet-black-blue.data',
    'planet-green.data',
    'planet-ocean.data',
]

# Parse the first line of every file and tag the samples with the marble name
dfs = []
for i, fname in enumerate(files):
    print(f'Load data {i}: {fname}')

    archive_path = f'../.assets/data/marbles/{fname}.zip'
    with zipfile.ZipFile(archive_path, 'r') as zipf, zipf.open(fname, 'r') as infile:
        content = infile.readlines()[0].decode()

    marble_name = fname.replace('.data', '')
    dfs.append(parse_lines(content).assign(color=marble_name))

# The per-file row labels are kept on purpose (no index reset)
df = pd.concat(dfs)
df.columns = ['R', 'G', 'B', 'color']

In [None]:
def generate_xy_values(df):
    """Add the columns ``X`` and ``Y`` to ``df`` in place.

    ``X = sqrt(3)/2 * G - sqrt(3)/2 * B`` and ``Y = R - G/3 - B/3``, computed
    from the existing columns ``R``, ``G`` and ``B``. Nothing is returned.
    """
    half_sqrt3 = 0.5 * np.sqrt(3)
    third = 1 / 3
    red, green, blue = df['R'], df['G'], df['B']

    df['X'] = half_sqrt3 * green - half_sqrt3 * blue
    df['Y'] = red - (third * green) - (third * blue)
    
def generate_intensity_values(df):
    """Add the column ``I = X**2 + Y**2`` to ``df`` in place.

    Requires the columns ``X`` and ``Y`` to exist. Nothing is returned.
    """
    x_squared = np.square(df['X'])
    y_squared = np.square(df['Y'])
    df['I'] = x_squared + y_squared

def generate_angles(df):
    """Add the column ``Phi = arctan2(Y, X)`` (in radians) to ``df`` in place.

    Requires the columns ``X`` and ``Y`` to exist. Nothing is returned.
    """
    x_values, y_values = df['X'], df['Y']
    df['Phi'] = np.arctan2(y_values, x_values)

# Feature Engineering I is switched off for now - it can be added later
#generate_xy_values(df)
#generate_intensity_values(df)
#generate_angles(df)

# Add target ID: the marble types are numbered in the order listed here
marble_types = (
    'blue-white-glass',
    'cyan-glass',
    'glass-blue',
    'glass-green',
    'glass-red',
    'glass-yellow',
    'planet-black-blue',
    'planet-green',
    'planet-ocean',
)
ids = {marble: number for number, marble in enumerate(marble_types)}

df['cat'] = df['color'].map(ids)

# Show five randomly drawn rows
df.sample(5)

In [None]:
# Size of the raw data frame as (rows, columns)
df.shape

## Feature Engineering II

In [None]:
# How many samples per event
VALUES_PER_EVENT = 15

# List of data sets (one wide data frame per marble type)
# NOTE: this cell overwrites `df` at the end, so it can only be run once
# on freshly loaded raw data.
dfs = []

for i in range(9):

    # Get all samples of one type. The raw frames were concatenated without
    # resetting the index, so the row labels restart at 0 for every marble
    # type and give the running sample number within this type.
    df_i = df[df['cat'] == i]

    # Remember the marble name (positional access, independent of the row
    # labels), then delete color and cat
    color = df_i['color'].iloc[0]
    df_i = df_i.drop(labels=['cat', 'color'], axis=1)

    # Create Package: VALUES_PER_EVENT consecutive samples form one event.
    # Integer floor division is used instead of `(x / N).astype('Int64')`,
    # because 'Int64' is not a dtype name current NumPy versions accept.
    sample_number = df_i.index.values
    df_i['meas'] = sample_number // VALUES_PER_EVENT   # event number
    df_i['index'] = sample_number % VALUES_PER_EVENT   # position inside the event

    # Create mean and standard deviation of each channel per event
    per_event = df_i.groupby('meas')
    rm = per_event['R'].mean()
    gm = per_event['G'].mean()
    bm = per_event['B'].mean()

    rv = per_event['R'].std()
    gv = per_event['G'].std()
    bv = per_event['B'].std()

    # Unstack: one row per event, one column per (channel, position) pair.
    # An incomplete last event is filled up with NaN.
    df_i = df_i.set_index(['meas', 'index']).unstack(level=1)
    # Name the columns from the actual (channel, position) tuples, e.g. 'R_0'
    df_i.columns = [f'{x}_{y}' for x, y in df_i.columns]

    # Add mean and std (aligned on the event number)
    df_i['R_M'] = rm
    df_i['G_M'] = gm
    df_i['B_M'] = bm

    df_i['R_V'] = rv
    df_i['G_V'] = gv
    df_i['B_V'] = bv

    # Add target
    df_i['cat'] = i
    df_i['color'] = color

    # Add to list of data sets
    dfs.append(df_i)

# Combine to one dataframe
df = pd.concat(dfs)

In [None]:
# Size of the stacked data frame as (rows, columns); one row per event
df.shape

In [None]:
# Column names of the stacked data frame
df.columns

In [None]:
# Show five randomly drawn rows
df.sample(5)

In [None]:
# Mix data set: frac=1 draws every row exactly once, in random order
df = df.sample(frac=1)

### Start with ML
- Your task is to build up the ML workflow. Start by defining the training features, set up the training and test data sets, import the relevant modules...

- You may want to give it another try with unsupervised learning (e.g. Gaussian Mixture)!

In [None]:
 # It's your turn!

# Placeholder: names of the columns the model should be trained on
training_features = []
# Placeholder: name of the column that holds the class to predict
target = ['']

# Placeholders: feature matrix and target values, to be filled from `df`
X = []
y = []

#......






















































# Sample solution

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score

In [None]:
# Every column whose name contains an upper-case R, G or B is a feature
# (stacked measurements as well as the per-event means and standard deviations)
channels = ('R', 'G', 'B')
training_features = [c for c in df.columns if any(channel in c for channel in channels)]

target = ['cat']

# Keep only complete events, then separate the target from the features
complete_events = df[training_features + target].dropna()
y = complete_events[target]
X = complete_events.drop(columns=target)

# Hold back 20 % of the events for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Random forest classifier: the active line is a small starting configuration,
# the commented-out line is a larger alternative to try afterwards
model = RandomForestClassifier(n_estimators=10, max_depth=10) # Start
#model = RandomForestClassifier(n_estimators=100, max_depth=None) # TOP

In [None]:
# Fit the model; the single target column is passed as a 1-d array
labels_train = y_train.values[:, 0]
model.fit(X_train, labels_train)

## Validation

### Confusion matrix

In [None]:
# Predict the class of every test event and count the
# (true class, predicted class) combinations
y_pred_test = model.predict(X_test)
truth = y_test
cm = confusion_matrix(y_true=truth, y_pred=y_pred_test)

pd.DataFrame(cm)

In [None]:
import itertools

# Upper end of the colour scale: number of events divided by the 9 classes,
# scaled by the test fraction of 0.2
color_scale_max = df.shape[0]/9*0.2

plt.figure(figsize=(8, 8))
plt.imshow(cm, interpolation='nearest', cmap='viridis', vmin=0, vmax=color_scale_max)
plt.colorbar()

# Write the count into every cell; diagonal cells get black text, all others white
for row, col in itertools.product(range(9), repeat=2):
    text_color = "black" if row == col else "white"
    plt.text(col, row, f'{cm[row, col]:.0f}', horizontalalignment="center", color=text_color)

plt.title('Confusion Matrix')
plt.ylabel('True class')
plt.xlabel('Predicted class');

### Hypothesis test

In [None]:
#cat = [0,1,2,3,4,5,6,7,8]
cat = [0, 4, 8]

# Predicted class probabilities of the test events.
# NOTE(review): column i is used as the probability of type i, which assumes
# the model was trained on all classes 0..8 - confirm via model.classes_.
y_proba_test = model.predict_proba(X_test)

for i in cat:
    y_proba_test_i = y_proba_test[:,i]
    plt.figure(figsize=(8, 4))

    # One histogram per true marble type of the probability assigned to type i
    for j in range(9):
        plt.hist(y_proba_test_i[y_test['cat'] == j],
                 bins=np.linspace(0,1,100),
                 alpha=0.5,
                 density=False,
                 label=f'Type {j}')

    plt.title(f'Hypothesis: Marble belongs to type {i}')
    plt.xlabel('Probability')
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.tight_layout()
    # Logarithmic y axis; empty bins are clipped. The keyword is `nonpositive`
    # (the former `nonposy` is no longer accepted by current matplotlib).
    plt.yscale('log', nonpositive='clip')
    plt.show()

In [None]:
#cat = [0,1,2,3,4,5,6,7,8]
cat = [0, 4, 8]

y_proba_test = model.predict_proba(X_test)
y_proba_train = model.predict_proba(X_train)

# (legend label, true classes, predicted probabilities) for both data sets
roc_inputs = [
    ('test', y_test, y_proba_test),
    ('train', y_train, y_proba_train),
]

for i in cat:
    plt.figure(figsize=(5, 5))

    # One-vs-rest ROC curve of type i, for the test and the training data
    for curve_label, true_classes, probabilities in roc_inputs:
        fpr, tpr = roc_curve(true_classes == i, probabilities[:, i])[:2]
        plt.plot(fpr, tpr, label=curve_label)

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], color='black', linestyle=':')
    plt.title(f'ROC curve type {i}')
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.legend(loc='best')
    plt.show();

### Feature Importance

In [None]:
# Horizontal bar chart with one bar per training feature
feature_names = X.columns
bar_positions = range(len(feature_names))

plt.figure(figsize=(6, 10))
plt.barh(bar_positions, model.feature_importances_)
plt.yticks(bar_positions, feature_names)
plt.show()

### AUC and Accuracy

In [None]:
# One-vs-rest AUC on the test data for each of the 9 marble types
data = [
    roc_auc_score(y_test.values == marble_type, y_proba_test[:, marble_type])
    for marble_type in range(9)
]

# Displaying
pd.DataFrame(np.array(data), columns=['AUC'])

In [None]:
# Fraction of correctly classified test events
accuracy = model.score(X_test, y_test)
print(f'Mean Accuracy: {accuracy:.3f}')

## Unsupervised learning with Gaussian Mixture

In [None]:
# Qualitative colormap used to colour the marble types in the scatter plots.
# It is fetched through pyplot because `matplotlib.cm.get_cmap` is no longer
# available in current matplotlib, and importing `matplotlib.cm` as `cm`
# would overwrite the confusion matrix stored in the variable `cm`.
cmap = plt.get_cmap('Set1')

In [None]:
cat = [0,1,2,3,4,5,6,7,8]

# Per-event mean and standard deviation of each colour channel, plus the true type
X = df[df['cat'].isin(cat)][['R_M','G_M','B_M','R_V','B_V','G_V','cat']]

# Drop events with missing values - the mixture model cannot be fitted on NaN
X = X.dropna()

# Reduced data set size (at most 10000 events; fewer if the data set is smaller)
X = X.sample(min(10000, len(X)))

# Define target for visualization
target = X['cat']
X = X.drop(['cat'], axis=1).values

In [None]:
# Raw data: first feature column against the second one, coloured by true type
first_feature, second_feature = X[:, 0], X[:, 1]
true_colors = cmap(target)

plt.figure(figsize=(10, 6))
plt.scatter(first_feature, second_feature, color=true_colors, s=5, label='Truth')
plt.legend();

In [None]:
from sklearn.mixture import GaussianMixture

# Mixture of 9 Gaussian components, initialised with k-means;
# fit it on the features and assign every event to a component
model = GaussianMixture(n_components=9, init_params='kmeans')
predictions = model.fit(X).predict(X)

In [None]:
# Same two feature columns as for the raw data, coloured by predicted component
first_feature, second_feature = X[:, 0], X[:, 1]
component_colors = cmap(predictions)

plt.figure(figsize=(10, 6))
plt.scatter(first_feature, second_feature, s=20, color=component_colors, alpha=0.5,
            label='Predictions Gaussian Mixture')
plt.legend();

---
_This notebook is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/). Copyright © 2018-2025 [Point 8 GmbH](https://point-8.de)_