# Fáza 3 - strojové učenie

Pri dátovej analýze nemusí byť naším cieľom získať len znalosti obsiahnuté v aktuálnych dátach, ale aj natrénovať model, ktorý bude schopný robiť rozumné predikcie pre nové pozorovania pomocou strojového učenia.
V tejto fáze sa od Vás očakáva:

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.stats as sm_stats

import datetime
import re
import category_encoders as ce
from sklearn.impute import SimpleImputer, KNNImputer
from numpy import percentile

import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer, QuantileTransformer

from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectPercentile, SelectFromModel
from sklearn.feature_selection import mutual_info_regression, chi2, f_regression, f_classif
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt

from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [2]:
def phase1():
    labor = pd.read_csv("046/labor.csv", sep='\t')
    labor.rename(columns = {"Unnamed: 0": "index"}, inplace = True)
    labor = labor.drop(["index", "name"], axis=1)
    smoker_encoding = {"Y": 1, "N": 0, "yes": 1, "no": 0}
    labor["smoker"].replace(smoker_encoding, inplace=True)

    profiles = pd.read_csv("046/profiles.csv", sep='\t')
    profiles.rename(columns = {"Unnamed: 0": "index"}, inplace = True)
    profiles = profiles.drop(["index"], axis=1)
    profiles["race"].replace({"black": "Black", "white": "White", "blsck": "Black"}, inplace=True)
    profiles["birthdate"] = pd.to_datetime(profiles['birthdate'], utc=False)

    merged = pd.merge(profiles, labor, how='outer', on='ssn')
    merged = merged.drop(["ssn"], axis=1)
    return merged

In [3]:
class handleNA(TransformerMixin):
    def __init__(self, method, strategy=None):
        self.method = method
        self.strategy = strategy
        
    def removeNA(self, merged):
        return merged.dropna()

    def getNAcols(self, merged):
        return merged.columns[merged.isnull().any()].tolist()

    def replaceNaN(self, original_merged):
        na_cols = self.getNAcols(original_merged)
        strategy = self.strategy
        new_merged = original_merged.copy()
        if strategy == "kNN":
            imp_strategy = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
    #         imp_strategy = KNNImputer()
        elif strategy == "mean" or strategy == "median":    
            imp_strategy = SimpleImputer(missing_values=np.nan, strategy=strategy)
        else:
            raise Exception("Unsupported strategy")
        for col in na_cols:
            new_merged[col] = imp_strategy.fit_transform(new_merged[[col]])
        return new_merged
    
    def fit(self, X):
        return self
    
    def transform(self, X):
        if self.method == 'nothing':
            return X
        elif self.method == 'remove':
            return self.removeNA(X);
        elif self.method == 'replace':
            return self.replaceNaN(X)
        else:
            raise Exception("Unsupported method")

In [4]:
class handleOutliers(TransformerMixin):
    def __init__(self, method):
        self.method = method
        
    def onlyNumCols(self, merged):
        return merged.drop(["residence", "job", "company", "name", "birthdate"], axis=1, errors='ignore')

    def identify_outliers(self, merged):
        suma = 0;
        for col in merged.columns:
            q25, q75 = percentile(merged[col], 25), percentile(merged[col], 75)
            iqr = q75 - q25
            cut_off = iqr * 1.5
            lower, upper = q25 - cut_off, q75 + cut_off
            outliers = merged[((merged[col] < lower) | (merged[col] > upper))] 
            print(col, 'Identified outliers: %d' % len(outliers))
            suma += len(outliers)
        print('Sum of identified outliers: %d' % suma)

    def remove_outliers(self, merged):
        newMerged = merged.copy()
        for col in newMerged.columns:
            q25, q75 = percentile(newMerged[col], 25), percentile(newMerged[col], 75)
            iqr = q75 - q25
            cut_off = iqr * 1.5
            lower, upper = q25 - cut_off, q75 + cut_off
            newMerged = newMerged[((newMerged[col] >= lower) & (newMerged[col] <= upper))] 
        return newMerged

    def replace_outliers(self, merged):
        newMerged = merged.copy()
        for col in newMerged.columns:
            q05, q95 = percentile(newMerged[col], 5), percentile(newMerged[col], 95)
            newMerged[col] = np.where(newMerged[col] < q05, q05, newMerged[col])
            newMerged[col] = np.where(newMerged[col] > q95, q95, newMerged[col])
        return newMerged
    
    def fit(self, X):
        return self
    
    def transform(self, X):
        if self.method == 'nothing':
            return self.onlyNumCols(X)
        elif self.method == 'remove':
            return self.remove_outliers(self.onlyNumCols(X))
        elif self.method == 'replace':
            return self.replace_outliers(self.onlyNumCols(X))
        else:
            raise Exception("Unsupported method")

In [5]:
class handleCategorical(TransformerMixin):
    def transformResidenceNLP(self, merged):
        for i in merged['residence'].index:
            country_code = re.findall('[A-Z]{2} [0-9]{5}', str(merged['residence'][i]))[0]
            merged.at[i, 'state']=re.findall('[A-Z]{2}', country_code)[0]
        len(merged['state'].value_counts())
        return merged.drop('residence', axis=1)

    def encodeOrdinal(self, merged):
        transformed = self.transformResidenceNLP(merged)
        ce_ordinal = ce.OrdinalEncoder(cols=['race', 'state', 'blood_group', 'relationship'])
        encoded = ce_ordinal.fit_transform(transformed)
        return encoded

    def frombirthtoage(self, born):
        now = datetime.date.today()
        return now.year - born.year - ((now.month, now.day) < (born.month, born.day))

    def computeAge(self, merged):
        ages = merged['birthdate'].apply(lambda d: self.frombirthtoage(d))
        merged = merged.assign(age=ages.values)
        return merged.drop('birthdate', axis=1)

    def encodeOneHot(self, merged):
        ce_OHE = ce.OneHotEncoder(cols=['sex'], use_cat_names=True)
        merged = ce_OHE.fit_transform(merged)
        return merged

    def fit(self, X):
        return self
    
    def transform(self, X):
        new_data = self.encodeOrdinal(X)
        new_data = self.computeAge(new_data)
        new_data = self.encodeOneHot(new_data)    
        return new_data

In [6]:
class handleTransformations(TransformerMixin):
    def __init__(self, method):
        self.method = method
        
    def transformPower(self, merged):
        power = PowerTransformer(method='yeo-johnson', standardize=True)
        df_return = pd.DataFrame(power.fit_transform(merged), columns = merged.columns)
        return df_return
    
    def transormQuan(self, merged):
        quan = QuantileTransformer(n_quantiles=10, random_state=0)
        df_return = pd.DataFrame(quan.fit_transform(merged), columns = merged.columns)
        return df_return
    
    def scaleMM(self, merged):
        norm_s = MinMaxScaler()
        df_return = pd.DataFrame(norm_s.fit_transform(merged), columns = merged.columns)
        return df_return
        
    def scaleS(self, merged):
        stan_s = StandardScaler()
        df_return = pd.DataFrame(stan_s.fit_transform(merged), columns = merged.columns)
        return df_return
    
    def fit(self, X):
        return self
    
    def transform(self, X):
        if self.method == 'nothing':
            return X
        elif self.method == 'power':
            return self.transformPower(X)
        elif self.method == 'quan':
            return self.transormQuan(X)
        elif self.method == 'minmax':
            return self.scaleMM(X)
        elif self.method == 'standard':
            return self.scaleS(X)
        else:
            raise Exception("Unsupported method")

## 1. Manuálne vytvorenie a vyhodnotenie rozhodovacích pravidiel pre klasifikáciu (5b)
* Naimplementujte 1R algorithm (1R or OneR), ktorý je jednoduchá klasifikácia t.j. rozhodnutie na základe jedného atribútu. Môžete implementovať komplikovanejšie t.j. zahŕňajúce viacero atribútov (ich kombinácie).
* Pravidlá by v tomto kroku mali byť vytvorené manuálne na základe pozorovaných závislostí v dátach. Vyhodnoťte klasifikátor pomocou metrík accuracy, precision a recall.

## 2. Natrénovanie a vyhodnotenie klasifikátora strojového učenia (5b)
* Na trénovanie využite minimálne jeden stromový algoritm strojového učenia v scikit-learn.
* Vizualizujte natrénované pravidlá.
* Vyhodnoťte natrénovaný model pomocou metrík accuracy, precision a recall
* Porovnajte natrénovaný klasifikátor s Vašimi manuálne vytvorenými pravidlami z prvého kroku.

In [7]:
def pipelineGenerator(na_method='remove', na_strategy=None, outliers_method='nothing', 
                      tranformation_method='nothing', select_attributes='all'):
    pipeline =  Pipeline([
        ('handleCategorical', handleCategorical()),
        ('handleNA', handleNA(na_method, na_strategy)), # possible: 'nothing'; 'remove'; 'replace', 'mean'; 'replace', 'median'; 'replace', 'kNN'
        ('handleOutliers', handleOutliers(outliers_method)), # possible: 'nothing'; 'remove'; 'replace'
        ('handleTransformations', handleTransformations(tranformation_method)), # possible: 'nothing'; 'power'; 'quan'; 'minmax'; 'standard'
    ])
    return pipeline

In [8]:
original_data = phase1()
pipeline1 = pipelineGenerator()
transformed_data1 = pipeline1.fit_transform(original_data)

  elif pd.api.types.is_categorical(cols):


In [9]:
X_train, X_test, y_train, y_test = train_test_split(transformed_data1.drop(["indicator"], axis=1), transformed_data1['indicator'], test_size=0.33, random_state=1)

### Algoritmy: 
### 1. Decision tree

In [10]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

**Vizualizácia natrénovaných pravidiel:**

Výsledný strom sa uloží ako png súbor

from graphviz import Source
from IPython.display import Image 

graph = Source(export_graphviz(clf, feature_names=X_train.columns, class_names=['false','true'], filled = True), format='png')
graph

graph.render('decision_tree')
Image(filename='decision_tree.png') 

**Vyhodnotenie Decision tree podľa natrénovaných pravidiel pomocou metrík**

In [11]:
from sklearn.metrics import classification_report
print('y_train_pred:', classification_report(y_train, y_train_pred), '\ny_test_pred:', classification_report(y_test, y_test_pred))

y_train_pred:               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2294
         1.0       1.00      1.00      1.00      4186

    accuracy                           1.00      6480
   macro avg       1.00      1.00      1.00      6480
weighted avg       1.00      1.00      1.00      6480
 
y_test_pred:               precision    recall  f1-score   support

         0.0       0.83      0.82      0.83      1179
         1.0       0.90      0.90      0.90      2013

    accuracy                           0.87      3192
   macro avg       0.86      0.86      0.86      3192
weighted avg       0.87      0.87      0.87      3192



Podľa tohto výpisu vyplýva nastal overfitting.

### 2. Random forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=1)
clf.fit(X_train, y_train)
y_train_pred_rt = clf.predict(X_train)
y_test_pred_rt = clf.predict(X_test)

**Vizualizácia natrénovaných pravidiel:**

Výsledný strom sa uloží ako png súbor

from sklearn.tree import  export_graphviz
from graphviz import Source
from IPython.display import Image 
graph = Source(export_graphviz(clf.estimators_[0], feature_names=X_train.columns, class_names=['false','true'], filled = True), format='png')
graph

Model RandomForest na vizualizáciu obsahuje v našom prípade prvý Decision tree v celom RandomForest. (clf.estimators_[0])

graph.render('rf')
Image(filename='rf.png') 

**Vyhodnotenie Random forest podľa natrénovaných pravidiel pomocou metrík**

In [35]:
print('y_train_pred:', classification_report(y_train, y_train_pred_rt), '\ny_test_pred:', classification_report(y_test, y_test_pred_rt))

y_train_pred:               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2294
         1.0       1.00      1.00      1.00      4186

    accuracy                           1.00      6480
   macro avg       1.00      1.00      1.00      6480
weighted avg       1.00      1.00      1.00      6480
 
y_test_pred:               precision    recall  f1-score   support

         0.0       0.89      0.87      0.88      1179
         1.0       0.92      0.94      0.93      2013

    accuracy                           0.91      3192
   macro avg       0.91      0.90      0.90      3192
weighted avg       0.91      0.91      0.91      3192



Podľa tohto výpisu vyplýva, že aj tu nastal overfitting.

Rozhodli sme sa ďalej pracovať s RandomForest algoritmom. Hoci aj ten nám vytvoril overfitting, ale vo všeobecnosti dosahuje lepšie výsledky.

# pre teba:

In [14]:
def report_generator(pred_train, pred_test, y_train, y_test, driver_silent):
    if not driver_silent:
        print("Predicting for train dataset:")
        print(classification_report(y_train, pred_train))

        print("Predicting for test dataset:")
        print(classification_report(y_test, pred_test))
    
    report_train = classification_report(y_train, pred_train, output_dict=True)
    report_test = classification_report(y_test, pred_test, output_dict=True)
    
    return report_train, report_test

**tu je tá driver funkcia:**

In [15]:
def randomForestDriver(X_train, X_test, y_train, y_test, driver_silent=True, max_depth=None):
    cls = RandomForestClassifier(max_depth=max_depth, random_state=1)
    cls.fit(X_train, y_train)
    
    pred_train = cls.predict(X_train)
    pred_test = cls.predict(X_test)
    
    return cls, *report_generator(pred_train, pred_test, y_train, y_test, driver_silent)

In [16]:
cls1, train_report1, test_report1 = randomForestDriver(X_train, X_test, y_train, y_test, driver_silent=False)

Predicting for train dataset:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2294
         1.0       1.00      1.00      1.00      4186

    accuracy                           1.00      6480
   macro avg       1.00      1.00      1.00      6480
weighted avg       1.00      1.00      1.00      6480

Predicting for test dataset:
              precision    recall  f1-score   support

         0.0       0.89      0.87      0.88      1179
         1.0       0.92      0.94      0.93      2013

    accuracy                           0.91      3192
   macro avg       0.91      0.90      0.90      3192
weighted avg       0.91      0.91      0.91      3192



## 3. Optimalizácia - hyperparameter tuning (5b)
* Preskúmajte hyperparametre Vášho zvoleného klasifikačného algoritmu v druhom kroku a vyskúšajte ich rôzne nastavenie tak, aby ste minimalizovali overfitting (preučenie) a optimalizovali výsledok. 
* Vysvetlite, čo jednotlivé hyperparametre robia. Pri nastavovaní hyperparametrov algoritmu využite krížovú validáciu (cross validation) na trénovacej množine.

\* odkaz len pre mňa https://www.analyticsvidhya.com/blog/2021/06/understanding-random-forest/

Klasifikačný algoritmus RandomForest má dokopy 18 parametrov, z toho 7 je kľučových:
- max_depth
- min_sample_split
- max_leaf_nodes
- min_samples_leaf
- n_estimators
- max_sample (bootstrap sample)
- max_features

Aby sme optimalizovali parametre, a tak vylepšili úspešnosť, pričom príliš nezaťažili naše výpočtové zdroje, rozhodli sme sa použiť tieto:
- criterion: funkcia na meranie kvality rozdelenia (gini, entropy)
- max_features: maximálny počet atribútov, ktorý je potrebné zvážiť počas každého delenia (log2, 1 - 10)
- n_estimators: počet stromov pred priemerovaním predikcií (5, 10, 50, 100, 200)
- min_sample_leaf: minimálny počet vzoriek, ktoré majú pripadnúť na jeden leaf node (2, 5, 10, 20, 50)

Na zvýšenie rýchlosti vykonávania sme sa rozhodli pridať parameter random_state a neobmedzili sme počet procesorov, na ktorých pôjde výpočet (n_jobs).

Parameter verbose slúži na to, koľko výpisov má zobraziť. My sme použili verbose=1, aby sme základé výpisy videli, ako napr. počet kandidátov a počet fits.

In [17]:
params = {
    'criterion': ['gini', 'entropy'],
    'max_features': ['log2', range(1, 11)],
    'min_samples_leaf': [5,10,20,50,100,200],
    'n_estimators': [10,25,30,50,100,200]
}

In [18]:
cls = RandomForestClassifier(random_state=1)

#### GridSearch Cross Validation

In [19]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

gridCV = GridSearchCV(cls, params, cv=3, verbose=1, n_jobs=-1)

#### RandomizedSearch Cross Validation

In [20]:
randomCV = RandomizedSearchCV(cls, params, cv=7, verbose=1, n_jobs=-1)

In [21]:
gridCV.fit(X_train,y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


 0.90108025 0.90092593 0.89953704 0.90277778 0.9029321  0.90200617
 0.88811728 0.89012346 0.89166667 0.89675926 0.89614198 0.89305556
 0.86697531 0.87253086 0.87037037 0.8712963  0.87160494 0.87098765
 0.85308642 0.85046296 0.84722222 0.85154321 0.85679012 0.85169753
 0.8242284  0.80864198 0.80216049 0.81604938 0.81790123 0.82469136
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.9037037  0.90833333 0.90725309 0.90925926 0.91234568 0.91141975
 0.9        0.90277778 0.9037037  0.90447531 0.90679012 0.90524691
 0.88996914 0.88811728 0.88873457 0.89444444 0.89614198 0.89351852
 0.87561728 0.87746914 0.87777778 0.87700617 0.87453704 0.8726

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=1), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': ['log2', range(1, 11)],
                         'min_samples_leaf': [5, 10, 20, 50, 100, 200],
                         'n_estimators': [10, 25, 30, 50, 100, 200]},
             verbose=1)

In [22]:
randomCV.fit(X_train,y_train)

Fitting 7 folds for each of 10 candidates, totalling 70 fits


 0.89799078        nan        nan        nan]


RandomizedSearchCV(cv=7, estimator=RandomForestClassifier(random_state=1),
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_features': ['log2', range(1, 11)],
                                        'min_samples_leaf': [5, 10, 20, 50, 100,
                                                             200],
                                        'n_estimators': [10, 25, 30, 50, 100,
                                                         200]},
                   verbose=1)

In [23]:
gridCV.best_score_

0.9123456790123456

In [24]:
bestGrid = gridCV.best_estimator_
bestGrid

RandomForestClassifier(criterion='entropy', max_features='log2',
                       min_samples_leaf=5, random_state=1)

In [25]:
randomCV.best_score_

0.9146576381997547

In [26]:
bestRandom = randomCV.best_estimator_
bestRandom

RandomForestClassifier(max_features='log2', min_samples_leaf=5,
                       n_estimators=200, random_state=1)

In [27]:
gridPred_train = bestGrid.predict(X_train)

In [28]:
gridPred = bestGrid.predict(X_test)

In [29]:
print(classification_report(y_train, gridPred_train)) 

              precision    recall  f1-score   support

         0.0       0.98      0.95      0.96      2294
         1.0       0.97      0.99      0.98      4186

    accuracy                           0.97      6480
   macro avg       0.97      0.97      0.97      6480
weighted avg       0.97      0.97      0.97      6480



In [30]:
print(classification_report(y_test, gridPred)) 

              precision    recall  f1-score   support

         0.0       0.89      0.87      0.88      1179
         1.0       0.93      0.94      0.93      2013

    accuracy                           0.91      3192
   macro avg       0.91      0.91      0.91      3192
weighted avg       0.91      0.91      0.91      3192



In [31]:
randomPred_train = bestRandom.predict(X_train)

In [32]:
randomPred = bestRandom.predict(X_test)

In [33]:
print(classification_report(y_train, randomPred_train)) 

              precision    recall  f1-score   support

         0.0       0.97      0.94      0.96      2294
         1.0       0.97      0.99      0.98      4186

    accuracy                           0.97      6480
   macro avg       0.97      0.96      0.97      6480
weighted avg       0.97      0.97      0.97      6480



In [34]:
print(classification_report(y_test, randomPred)) 

              precision    recall  f1-score   support

         0.0       0.90      0.85      0.87      1179
         1.0       0.91      0.94      0.93      2013

    accuracy                           0.91      3192
   macro avg       0.91      0.90      0.90      3192
weighted avg       0.91      0.91      0.91      3192



Obidva spôsoby nemajú overfitting, preto možno pokladať výsledky za akceptovateľné, ktoré vylepšili accuracy.

## 4. Vyhodnotenie vplyvu zvolenej stratégie riešenia na klasifikáciu (5b)
Vyhodnotíte Vami zvolené stratégie riešenia projektu z hľadiska classification accuracy: 

* Stratégie riešenia chýbajúcich hodnôt a outlierov;
* Scaling resp. transformer či zlepší accuracy klasifikácie;
* Výber atribútov a výber algoritmov;
* Hyperparameter tuning resp. ensemble learning.

Ktorý spôsob z každého hore-uvedených bodov sa ukázal ako vhodnejší pre daný problém? Vyhodnotenie podložíte dôkazmi.

Správa sa odovzdáva v 12. týždni semestra
* Na cvičení, dvojica svojmu cvičiacemu odprezentuje vykonanú prácu v Jupyter Notebooku.
* Správu elektronicky odovzdá jeden člen z dvojice do systému AIS do nedele 12.12.2021 23:59.