# Notebook example

Installing some necessary packages:

In [1]:
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!jupyter labextension install @jupyter-widgets/jupyterlab-manager



Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: OK
Traceback (most recent call last):
  File "/home/rodrigo/.local/bin/jupyter-labextension", line 7, in <module>
    from jupyterlab.labextensions import main
ModuleNotFoundError: No module named 'jupyterlab'


In [4]:
!pip install xgboost
!pip install scikit-learn==0.24



**The directory two levels up must be added to `sys.path` so that the project's `ml` package can be imported:**

In [1]:
import sys

# Make the directory two levels up importable. The guard keeps the cell
# idempotent: re-running it does not pile up duplicate entries in sys.path.
if "../../" not in sys.path:
    sys.path.append("../../")

From this point on, it's up to you!

---

In [16]:
import pandas as pd
import numpy as np

from ml.data_source.spreadsheet import Spreadsheet
from ml.preprocessing.preprocessing import Preprocessing
from ml.preprocessing.feature_selection import FeatureSelector
from ml.model.trainer import TrainerSklearn
from ml.preprocessing.normalization import Normalizer

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.svm import LinearSVC
from sklearn.feature_selection import mutual_info_classif 
from skfeature.function.similarity_based import fisher_score 
from sklearn.feature_selection import chi2

In [3]:
# Read the training spreadsheet, keeping only the target ('Survived')
# and the three columns used as predictors.
df = Spreadsheet().get_data(
    '../../../data/raw/train.csv',
    columns=['Survived', 'Pclass', 'Sex', 'Age'],
)

In [None]:
p = Preprocessing()
# Ensemble selector configured with a 'variance' and a 'recursive' sub-selector.
# The other strategy keys accepted by FeatureSelector ('exaustive', 'recursive',
# 'sequential', 'model', 'variance', 'correlation', 'univariate_kbest',
# 'univariate_percentile', 'coefficients') are each demonstrated in their own
# section of this notebook.
f = FeatureSelector(
    'ensemble',
    dic_selection={
        'variance': {'threshold': 0.3},
        'recursive': {'estimator': LinearSVC(), 'n_features_to_select': 2},
    },
    num_feat=1,
)
# Keep the cleaned data under its own name: rebinding `df` would make the cell
# non-idempotent and hide the raw frame from every later cell.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

## Variance Threshold Example

In [6]:
p = Preprocessing()
# Variance-based selector with a threshold of 0.5.
f = FeatureSelector('variance', threshold=0.5)
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell gives the same result however many times (or in whatever order) it runs.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [7]:
# Fit the variance selector, then display the columns it keeps.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Unnamed: 0,Pclass,Age
0,3,22.0
1,1,38.0
2,3,26.0
3,1,35.0
4,3,35.0
...,...,...
885,3,39.0
886,2,27.0
887,1,19.0
889,1,26.0


## Information Gain Example

In [8]:
p = Preprocessing()
# Univariate k-best selector scored with mutual information, keeping k=2.
f = FeatureSelector('univariate_kbest', score_func=mutual_info_classif, k=2)
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [9]:
# Fit the mutual-information selector, then display the retained columns.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
885,1,0
886,0,1
887,1,0
889,0,1


## Chi-square Example (k best)

In [10]:
p = Preprocessing()
# Univariate k-best selector scored with the chi-square statistic, keeping k=2.
f = FeatureSelector('univariate_kbest', score_func=chi2, k=2)
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [11]:
# Fit the chi-square k-best selector, then display the retained columns.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
885,1,0
886,0,1
887,1,0
889,0,1


## Chi-square Example (percentile)

In [35]:
p = Preprocessing()
# Univariate percentile selector scored with chi-square, percentile=10.
f = FeatureSelector('univariate_percentile', score_func=chi2, percentile=10)
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [36]:
# Fit the chi-square percentile selector, then display the retained columns.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Unnamed: 0,Sex_female
0,0
1,1
2,1
3,1
4,0
...,...
885,1
886,0
887,1
889,0


## Fisher's Score Example

In [12]:
p = Preprocessing()
# Univariate k-best selector using skfeature's fisher_score as the scoring
# function, keeping k=2.
f = FeatureSelector('univariate_kbest', score_func=fisher_score.fisher_score, k=2)
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [13]:
# Fit the Fisher-score selector, then display the retained columns.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
885,1,0
886,0,1
887,1,0
889,0,1


## Correlation Coefficient Example

In [14]:
p = Preprocessing()
# Correlation-based selector with a threshold of 0.9.
f = FeatureSelector('correlation', threshold=0.9)
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [15]:
# Fit the correlation selector, then display the retained columns.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Unnamed: 0,Pclass,Age,Sex_female
0,3,22.0,0
1,1,38.0,1
2,3,26.0,1
3,1,35.0,1
4,3,35.0,0
...,...,...,...
885,3,39.0,1
886,2,27.0,0
887,1,19.0,1
889,1,26.0,0


## Mean Absolute Difference (MAD) Example

In [16]:
# Univariate k-best selector scored with FeatureSelector.mean_abs_diff, keeping k=2.
f = FeatureSelector('univariate_kbest', score_func=FeatureSelector.mean_abs_diff, k=2)
p = Preprocessing()
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [17]:
# Fit the mean-absolute-difference selector, then display the retained columns.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Unnamed: 0,Pclass,Age
0,3,22.0
1,1,38.0
2,3,26.0
3,1,35.0
4,3,35.0
...,...,...
885,3,39.0
886,2,27.0
887,1,19.0
889,1,26.0


## Dispersion ratio Example

In [4]:
# Univariate k-best selector scored with FeatureSelector.disp_ratio, keeping k=2.
f = FeatureSelector('univariate_kbest', score_func=FeatureSelector.disp_ratio, k=2)
p = Preprocessing()
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:numexpr.utils:Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
INFO:root:Category encoding


In [5]:
# Fit the dispersion-ratio selector, then display the retained columns.
# NOTE(review): this run emits "overflow encountered in reduce" and
# "divide by zero encountered in true_divide" warnings, so the scores behind
# the selection may be unreliable; confirm in FeatureSelector.disp_ratio.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

overflow encountered in reduce
divide by zero encountered in true_divide


Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
885,1,0
886,0,1
887,1,0
889,0,1


## Forward Feature Selection Example

In [6]:
# Sequential selector in the forward direction. The tree is seeded so that
# repeated runs select the same features.
f = FeatureSelector(
    'sequential',
    estimator=DecisionTreeClassifier(random_state=42),
    direction='forward',
)
p = Preprocessing()
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [7]:
# Fit the forward sequential selector, then display the retained columns.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Unnamed: 0,Pclass,Sex_female
0,3,0
1,1,1
2,3,1
3,1,1
4,3,0
...,...,...
885,3,1
886,2,0
887,1,1
889,1,0


## Backward Feature Selection Example

In [8]:
# Sequential selector in the backward direction. The tree is seeded so that
# repeated runs select the same features.
f = FeatureSelector(
    'sequential',
    estimator=DecisionTreeClassifier(random_state=42),
    direction='backward',
)
p = Preprocessing()
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [9]:
# Fit the backward sequential selector, then display the retained columns.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Unnamed: 0,Pclass,Sex_male
0,3,1
1,1,0
2,3,0
3,1,0
4,3,1
...,...,...
885,3,0
886,2,1
887,1,0
889,1,1


## Exhaustive Feature Selection Example

In [12]:
# Exhaustive search over feature subsets of size 2 to 4. The tree is seeded so
# that repeated runs score the subsets identically.
# NOTE(review): the strategy key 'exaustive' is kept exactly as written;
# presumably that spelling is what FeatureSelector expects, so verify before renaming.
f = FeatureSelector(
    'exaustive',
    estimator=DecisionTreeClassifier(random_state=42),
    min_features=2,
    max_features=4,
)
p = Preprocessing()
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [13]:
# Fit the exhaustive selector, then display the retained columns.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Features: 11/11

Unnamed: 0,Pclass,Age,Sex_female
0,3,22.0,0
1,1,38.0,1
2,3,26.0,1
3,1,35.0,1
4,3,35.0,0
...,...,...,...
885,3,39.0,1
886,2,27.0,0
887,1,19.0,1
889,1,26.0,0


## Recursive Feature Elimination Example

In [14]:
# Recursive elimination down to two features, driven by a linear SVM.
# NOTE: with default settings LinearSVC reports "Liblinear failed to converge"
# when this selector is fitted; consider raising max_iter or scaling the features.
f = FeatureSelector('recursive', estimator=LinearSVC(), n_features_to_select=2)
p = Preprocessing()
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [15]:
# Fit the recursive-elimination selector, then display the retained columns.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Liblinear failed to converge, increase the number of iterations.


Unnamed: 0,Pclass,Sex_female
0,3,0
1,1,1
2,3,1
3,1,1
4,3,0
...,...,...
885,3,1
886,2,0
887,1,1
889,1,0


## LASSO Regularization (L1) Example

In [21]:
# Model-based selector driven by an L1-penalised logistic regression.
estimator = LogisticRegression(C=1, penalty='l1', solver='liblinear')
f = FeatureSelector('model', estimator=estimator)
p = Preprocessing()
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [22]:
# Fit the L1 model-based selector, then display the retained columns.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Unnamed: 0,Pclass,Age,Sex_female
0,3,22.0,0
1,1,38.0,1
2,3,26.0,1
3,1,35.0,1
4,3,35.0,0
...,...,...,...
885,3,39.0,1
886,2,27.0,0
887,1,19.0,1
889,1,26.0,0


## Random Forest Importance Example

In [23]:
# Model-based selector driven by a random forest. The forest is seeded so that
# repeated runs select the same features.
f = FeatureSelector('model', estimator=RandomForestClassifier(random_state=42))
p = Preprocessing()
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [24]:
# Fit the random-forest selector, then display the retained columns.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Unnamed: 0,Age
0,22.0
1,38.0
2,26.0
3,35.0
4,35.0
...,...
885,39.0
886,27.0
887,19.0
889,26.0


## Coefficients Example

In [29]:
# Coefficient-based selector using a linear SVM, num_feat=2.
# NOTE: with default settings LinearSVC reports "Liblinear failed to converge"
# when this selector is fitted; consider raising max_iter or scaling the features.
f = FeatureSelector('coefficients', model=LinearSVC(), num_feat=2)
p = Preprocessing()
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [30]:
# Fit the coefficient-based selector, then display the retained columns.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Liblinear failed to converge, increase the number of iterations.


Unnamed: 0,Pclass,Sex_female
0,3,0
1,1,1
2,3,1
3,1,1
4,3,0
...,...,...
885,3,1
886,2,0
887,1,1
889,1,0


## Ensemble Example

In [37]:
# Ensemble selector configured with a 'variance' and a 'recursive' sub-selector.
# NOTE: with default settings LinearSVC reports "Liblinear failed to converge"
# when this selector is fitted; consider raising max_iter or scaling the features.
f = FeatureSelector(
    'ensemble',
    dic_selection={
        'variance': {'threshold': 0.3},
        'recursive': {'estimator': LinearSVC(), 'n_features_to_select': 2},
    },
    num_feat=1,
)
p = Preprocessing()
# Clean into a separate frame so the loaded `df` is never overwritten and the
# cell stays re-runnable.
df_clean = p.clean_data(df)
y = df_clean['Survived']
X = p.categ_encoding(df_clean.drop(columns=['Survived']))

INFO:root:Cleaning data
INFO:root:Category encoding


In [38]:
# Fit the ensemble selector, then display the retained columns.
f.fit(X, y)
X_selected = f.transform(X)
X_selected

Liblinear failed to converge, increase the number of iterations.


Unnamed: 0,Pclass
0,3
1,1
2,3
3,1
4,3
...,...
885,3
886,2
887,1
889,1
