# Notebook example

Installing some necessary packages:

In [1]:
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!jupyter labextension install @jupyter-widgets/jupyterlab-manager



Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m
Traceback (most recent call last):
  File "/home/rodrigo/.local/bin/jupyter-labextension", line 7, in <module>
    from jupyterlab.labextensions import main
ModuleNotFoundError: No module named 'jupyterlab'


In [4]:
!pip install xgboost
!pip install scikit-learn==0.24



**It is necessary to change the working directory so the project structure works properly:**

In [1]:
import sys
sys.path.append("../../")

From this point, it's on you!

---

In [2]:
import pandas as pd
import numpy as np

from ml.data_source.spreadsheet import Spreadsheet
from ml.preprocessing.preprocessing import Preprocessing
from ml.preprocessing.feature_selection import FeatureSelector
from ml.model.trainer import TrainerSklearn
from ml.preprocessing.normalization import Normalizer

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn.svm import LinearSVC
from sklearn.feature_selection import mutual_info_classif 
from skfeature.function.similarity_based import fisher_score 

In [3]:
from sklearn.feature_selection import chi2

In [4]:
df = Spreadsheet().get_data('../../../data/raw/train.csv',columns=['Survived','Pclass','Sex','Age'])

In [5]:
p = Preprocessing()
estimator = LinearSVC()
#f = FeatureSelector('exaustive', estimator = estimator, min_features = 3, max_features = 4)
#f = FeatureSelector('recursive', estimator = LinearSVC(), n_features_to_select=2)
#f = FeatureSelector('sequential', estimator = LinearSVC(), direction='forward')
#f = FeatureSelector('model', estimator = estimator)
#f = FeatureSelector('variance', threshold=0.5)
#f = FeatureSelector('correlation', threshold=0.9)
#f = FeatureSelector('univariate_kbest', score_func=FeatureSelector.mean_abs_diff, k=2)
#f = FeatureSelector('univariate_percentile', score_func=chi2, percentile=50)
#f = FeatureSelector('coefficients', model=estimator, num_feat = 2)
f = FeatureSelector('ensemble', dic_selection={ 'variance': {'threshold' : 0.3},
                                              'recursive': {'estimator' : LinearSVC(), 'n_features_to_select' : 2}},
                   num_feat = 1)
df = p.clean_data(df)
y = df['Survived']
X = df.drop(columns=['Survived'])
X = p.categ_encoding(X)

INFO:root:Cleaning data
INFO:numexpr.utils:Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
INFO:root:Category encoding


In [8]:
X.columns.index()

AttributeError: 'Index' object has no attribute 'index'

In [10]:
a = 12 if X.shape[1]>2 else 13

In [6]:
f.fit(X,y)
#f.transform(X)

Liblinear failed to converge, increase the number of iterations.


In [7]:
f.transform(X)

Unnamed: 0,Pclass
0,3
1,1
2,3
3,1
4,3
...,...
885,3
886,2
887,1
889,1


In [19]:
#recursive
f.fit(X,y)
f.transform(X)

Liblinear failed to converge, increase the number of iterations.


Unnamed: 0,Pclass,Sex_female
0,3,0
1,1,1
2,3,1
3,1,1
4,3,0
...,...,...
885,3,1
886,2,0
887,1,1
889,1,0


In [15]:
#var
f.fit(X,y)
f.transform(X)

Unnamed: 0,Pclass,Age
0,3,22.0
1,1,38.0
2,3,26.0
3,1,35.0
4,3,35.0
...,...,...
885,3,39.0
886,2,27.0
887,1,19.0
889,1,26.0


In [123]:
from abc import ABC, abstractmethod
import numpy as np
import pandas as pd

class SelectAlgorithm(ABC):
    """
        Abstract class for feature selection algorithms
    """
    def transform(self, df: pd.DataFrame):
        """
        Select features based on fit
        
    	Parameters
    	----------            
        df : pd.DataFrame
             dataframe with features to be selected
                    
    	Returns
    	-------
        pd.DataFrame
        dataframe with selected features only
        """
        return df[df.columns[self.selected_columns]]

    def get_support(self):
        """
        Get a mask, or integer index, of the features selected
        
    	Parameters
    	----------            
                    
    	Returns
    	-------
        np.array     
        """
        return self.selected_columns

    @abstractmethod
    def fit(self) -> None:
        """
        Abstract method that is implemented in classes that inherit it
        """
        pass

class SelectCorrelation(SelectAlgorithm):
    """
        Class to select features based on correlation between features
    """
    def __init__(self, threshold = 0.95):
        self.threshold = threshold
    """
        Constructor

        Parameters
    	----------            
        threshold     : float   
                        correlation threshold
    	Returns
    	-------
        SelectCorrelation
    """
    def fit(self, X: pd.DataFrame, y = None):
        """
        Identify the features to be selected.
        
    	Parameters
    	----------            
        X : pd.DataFrame
             features to be selected

        y : pd.DataFrame
            target values
                    
    	Returns
    	-------
        None
        """
        corr = X.corr()
        self.selected_columns = np.full((corr.shape[0],), True, dtype=bool)
        [self.check_correlation(corr.iloc[i,j],j) for i in range(corr.shape[0]) for j in range(i+1, corr.shape[0])]
        
    def check_correlation(self,corr,j):
        if np.abs(corr) >= self.threshold and self.selected_columns[j]:
            self.selected_columns[j] = False

In [124]:
s = SelectCorrelation(threshold = 0.2)
s.fit(X)
s.transform(X)

Unnamed: 0,Pclass,Sex_female
0,3,0
1,1,1
2,3,1
3,1,1
4,3,0
...,...,...
885,3,1
886,2,0
887,1,1
889,1,0


In [116]:
X.corr()

Unnamed: 0,Pclass,Age,Sex_female,Sex_male
Pclass,1.0,-0.369226,-0.15546,0.15546
Age,-0.369226,1.0,-0.093254,0.093254
Sex_female,-0.15546,-0.093254,1.0,-1.0
Sex_male,0.15546,0.093254,-1.0,1.0


In [24]:

VarianceThreshold(**{'threshold':0.1}).fit_transform(df)

INFO:root:Cleaning data
INFO:root:Category encoding


array([[ 0.,  3., 22.,  0.,  1.],
       [ 1.,  1., 38.,  1.,  0.],
       [ 1.,  3., 26.,  1.,  0.],
       ...,
       [ 1.,  1., 19.,  1.,  0.],
       [ 1.,  1., 26.,  0.,  1.],
       [ 0.,  3., 32.,  0.,  1.]])

In [16]:
p = Preprocessing()

In [7]:
df

Unnamed: 0,Survived,Pclass,Sex,Age
0,0,3,male,22.0
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
4,0,3,male,35.0
...,...,...,...,...
886,0,2,male,27.0
887,1,1,female,19.0
888,0,3,female,
889,1,1,male,26.0


In [12]:
lambda x: df['Age'].values.reshape(-1,1)

array([[22.  ],
       [38.  ],
       [26.  ],
       [35.  ],
       [35.  ],
       [  nan],
       [54.  ],
       [ 2.  ],
       [27.  ],
       [14.  ],
       [ 4.  ],
       [58.  ],
       [20.  ],
       [39.  ],
       [14.  ],
       [55.  ],
       [ 2.  ],
       [  nan],
       [31.  ],
       [  nan],
       [35.  ],
       [34.  ],
       [15.  ],
       [28.  ],
       [ 8.  ],
       [38.  ],
       [  nan],
       [19.  ],
       [  nan],
       [  nan],
       [40.  ],
       [  nan],
       [  nan],
       [66.  ],
       [28.  ],
       [42.  ],
       [  nan],
       [21.  ],
       [18.  ],
       [14.  ],
       [40.  ],
       [27.  ],
       [  nan],
       [ 3.  ],
       [19.  ],
       [  nan],
       [  nan],
       [  nan],
       [  nan],
       [18.  ],
       [ 7.  ],
       [21.  ],
       [49.  ],
       [29.  ],
       [65.  ],
       [  nan],
       [21.  ],
       [28.5 ],
       [ 5.  ],
       [11.  ],
       [22.  ],
       [38.  ],
       [

In [18]:
df = p.clean_data(df)
df = p.categ_encoding(df)

INFO:root:Cleaning data
INFO:numexpr.utils:Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
INFO:root:Category encoding


In [19]:
df.head()

Unnamed: 0,Survived,Age,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,0,22.0,0,0,1,0,1
1,1,38.0,1,0,0,1,0
2,1,26.0,0,0,1,1,0
3,1,35.0,1,0,0,1,0
4,0,35.0,0,0,1,0,1


In [86]:
X = df.drop(columns=["Survived","Sex_male"])
#X = df.drop(columns=["Survived"])
y = df["Survived"]

In [87]:
# Ensure the same random state passed to TrainerSkleran().train()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((499, 5), (215, 5), (499,), (215,))

In [92]:
rf = TrainerSklearn().train(X, y, classification=True, 
                            algorithm=DecisionTreeClassifier, 
                            preprocessing=p,
                           data_split=('train_test', {'test_size':.3}),
                           random_state=123)

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
INFO:interpret_community.TabularExplainer:Initialized valid explainer TreeExplainer with args {'explain_subset': None, 'features': ['Age', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female'], 'classes': None}


In [93]:
rf.get_metrics()

{'accuracy': 0.827906976744186,
 'f1': 0.7672955974842767,
 'precision': 0.7530864197530864,
 'recall': 0.782051282051282,
 'roc_auc': 0.8341287666105184}

In [23]:
rf.get_metrics()

{'accuracy': 0.7534883720930232,
 'f1': 0.6708074534161491,
 'precision': 0.6923076923076923,
 'recall': 0.6506024096385542,
 'roc_auc': 0.7930814165753926}

In [12]:
rf.get_columns()

In [13]:
rf.predict_proba(X_test, binary=True)

array([1.        , 0.45271429, 0.43433333, 0.93333333, 0.29      ,
       0.0097619 , 0.08943685, 0.        , 0.0525    , 0.98      ,
       0.65514286, 0.10844703, 0.10844703, 0.97      , 0.87      ,
       0.29193651, 0.01      , 0.42744644, 1.        , 0.0300202 ,
       0.30402381, 1.        , 0.46507143, 0.73488095, 0.56333333,
       1.        , 0.01339054, 1.        , 0.        , 0.35177655,
       0.        , 0.        , 1.        , 0.11603523, 0.        ,
       0.74980952, 0.42744644, 0.13549914, 0.62743681, 0.        ,
       0.055     , 0.        , 0.29346825, 0.02      , 1.        ,
       0.1125    , 0.21      , 1.        , 1.        , 0.36449206,
       0.08943685, 0.38      , 0.        , 0.02      , 1.        ,
       0.42744644, 1.        , 0.        , 0.10844703, 0.00461538,
       1.        , 1.        , 0.08943685, 0.        , 0.49278571,
       0.        , 0.00461538, 1.        , 0.        , 0.42744644,
       0.08943685, 0.02      , 0.17333333, 0.31885714, 0.43383

In [14]:
# Predicting new data
def predict_new(X, model, probs=True):
    X = p.clean_data(X)
    X = p.categ_encoding(X)
    
    columns = model.get_columns()
    for col in columns:
        if col not in X.columns:
            X[col] = 0
    print(X)
    if probs:
        return model.predict_proba(X)
    else:
        return model.predict(X)

In [15]:
new_data = pd.DataFrame({
    'Pclass':3,
    'Sex': 'male',
    'Age':4
}, index=[0])

new_data

Unnamed: 0,Pclass,Sex,Age
0,3,male,4


In [16]:
predict_new(new_data, rf)

INFO:root:Cleaning data
INFO:root:Category encoding


   Age  Pclass_3  Sex_male  Pclass_1  Pclass_2  Sex_female
0    4         1         1         0         0           0


array([[0.68114286, 0.31885714]])

**Get local explainer for each instance:**

In [30]:
# Get local explainer
res = rf.local_interpret(X_test, len(X_test.columns))

In [31]:
res

Unnamed: 0,Importance_Name_0,Importance_Name_1,Importance_Name_2,Importance_Name_3,Importance_Name_4,Importance_Name_5,Importance_Value_0,Importance_Value_1,Importance_Value_2,Importance_Value_3,Importance_Value_4,Importance_Value_5
0,Sex_female,Sex_male,Pclass_3,Pclass_2,Age,Pclass_1,0.235407,0.212511,0.131872,0.049253,-0.019605,-0.028737
1,Sex_male,Sex_female,Age,Pclass_2,Pclass_3,Pclass_1,0.107810,0.105021,0.027756,-0.024243,-0.076479,-0.173282
2,Pclass_3,Pclass_1,Pclass_2,Age,Sex_male,Sex_female,0.136886,0.054153,0.035203,-0.040919,-0.093182,-0.107176
3,Sex_female,Sex_male,Age,Pclass_2,Pclass_1,Pclass_3,0.218018,0.210792,0.206967,-0.012758,-0.035676,-0.073309
4,Sex_male,Sex_female,Age,Pclass_2,Pclass_3,Pclass_1,0.125001,0.124176,0.092246,-0.010514,-0.068209,-0.133400
...,...,...,...,...,...,...,...,...,...,...,...,...
210,Age,Sex_female,Sex_male,Pclass_2,Pclass_3,Pclass_1,0.197178,0.135364,0.130705,-0.002840,-0.014871,-0.026237
211,Sex_female,Sex_male,Pclass_1,Pclass_3,Age,Pclass_2,0.163068,0.161115,0.099112,0.085461,0.072337,-0.000391
212,Sex_female,Age,Pclass_1,Pclass_2,Sex_male,Pclass_3,0.097598,0.095317,0.090221,0.087228,0.079541,-0.030605
213,Pclass_3,Pclass_1,Sex_female,Sex_male,Pclass_2,Age,0.127740,0.054102,0.043235,0.041091,0.036320,-0.113190


In [1]:
import pandas as pd

In [2]:
s = pd.Series(list('abca'))

In [3]:
s

0    a
1    b
2    c
3    a
dtype: object

In [4]:
pd.get_dummies(s)

Unnamed: 0,a,b,c
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0


In [None]:
df