### HOW TO USE

```
change data_path in params.yaml
run all of the cells below
you are great!
```


In [1]:
from google.colab import drive
drive.mount('/content/drive')
!cp /content/drive/MyDrive/HS/module4-python/final_project/* .


Mounted at /content/drive


### HOW TO EDIT params.yaml
```
data:
  data_path: '/content/penguins_lter.csv'           # path to data
  imputer_type: 'Simple'                            # available types:['Simple', 'Iterative', 'KNN', 'No'] 
  impute_strategy: 'mean'                           # Imputing strategy for SimpleImputer.
  one_hot: 1                                        # 0 - drop categorical, 1 - one-hot them and include 
  feature_list: []                                  # list of features we want to include, if empty => include all

pipeline:
  scaler_type: 'MinMax'                             # available types: ['MinMax', 'Standard']
  classifier: 1                                     # integer from 0 to 4, available classifiers: [0: 'LogisticRegression', 1: 'DecisionTree', 2: 'RandomForest', 3: 'HistGBDT', 4: 'LGBM']
  n_polynomial_features: 2                          # degree of polynomial features. if less than 1 => do not include polynomial features 
  random_state: 42
  n_components: 0                                   #n_components for pca. if 0 => pca is not included
  test_size: 0.2


```                    



In [37]:
!mkdir final_project

mkdir: cannot create directory ‘final_project’: File exists


In [56]:
%%writefile final_project/params.yaml

data:
  data_path: '/content/penguins_lter.csv'            
  imputer_type: 'KNN'                             
  impute_strategy: 'median'                           
  one_hot: 1                                         
  feature_list: []                                 

pipeline:
  scaler_type: 'Standard'                             
  classifier: 1              
  n_polynomial_features: 3                          
  random_state: 42
  n_components: 0
  test_size: 0.2
  params: {}
  


Overwriting final_project/params.yaml


In [36]:
%%writefile final_project/settings.py
import os
import yaml


ROOT_DIR = os.path.join(os.getcwd(), 'final_project') 
BASE_PARAMS_DIR = os.path.join(ROOT_DIR, 'params.yaml') 

with open(BASE_PARAMS_DIR,'r') as fd:
    cfg = yaml.safe_load(fd)

EXPERIMENT_DIR = os.path.join(ROOT_DIR, 'data')
request_number = 1
PATH = os.path.join(EXPERIMENT_DIR, f'exp{request_number}')

while os.path.exists(PATH):
    request_number +=1
    PATH = os.path.join(EXPERIMENT_DIR, f'exp{request_number}')

DATA_PATH = PATH
PARAMS_DIR = os.path.join(DATA_PATH, 'experiment_params.yaml')


Overwriting final_project/settings.py


In [38]:
%%writefile final_project/utils.py 
import pandas as pd
import yaml
import numpy as np
from settings import BASE_PARAMS_DIR
from data_transforms import DummyTransformer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator, IterativeImputer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier
from sklearn.decomposition import PCA 
from lightgbm import LGBMClassifier





def preprocess_data(BASE_PARAMS_DIR=BASE_PARAMS_DIR):
  
    with open(BASE_PARAMS_DIR,'r') as fd:
      cfg = yaml.safe_load(fd)
    
    feature_list = cfg.get('data')['feature_list']
    cat_cols = ['species', 'island']    #categorical columns. will drop them, if one_hot == 0, or will do one_hot encodings if one_hot == 1
    
    drop_cols = ['studyName',         #this columns will be dropped in any ways... (useless/noninformative)
                'Sample Number',
                'Region',             #single unique value for all datapoints 
                'Individual ID',
                'Comments',
                'Date Egg',
                'Stage']              #single unique value for all datapoints

    
    # preprocessing data
    data = pd.read_csv(cfg.get('data')['data_path'])
    data = data.drop(drop_cols, axis=1)             
    data.rename(columns = {'Delta 15 N (o/oo)': 'delta15n',
                          'Delta 13 C (o/oo)': 'delta13c',
                          'Culmen Length (mm)': 'culmen_len',
                          'Culmen Depth (mm)': 'culmen_dep',
                          'Flipper Length (mm)': 'flipper_len',
                          'Body Mass (g)': 'mass',
                          'Sex': 'target',}, inplace = True)

    data.columns = map(str.lower, data.columns)
    data['clutch completion'] = data['clutch completion'].map({'Yes': 1,
                                                                'No': 0})
    
    data['target'] = data['target'].map({'MALE':1,
                                        'FEMALE':0})
    #taking specified subset of features written in params.yaml
    if len(cfg.get('data')['feature_list'])>1:
      data = data[feature_list]

    #checking if we want to process categorical features
    if cfg.get('data')['one_hot'] == 1:
      one_hots = pd.get_dummies(data[cat_cols])
      data = pd.concat([data, one_hots],axis=1)
    
    data = data.drop(cat_cols, axis=1)

    if cfg.get('data')['imputer_type'] == 'No':
      data = data.dropna()
    
    data = data.dropna(subset = ['target'])

    return data

def impute_data(data, BASE_PARAMS_DIR=BASE_PARAMS_DIR):

    with open(BASE_PARAMS_DIR,'r') as fd:
        cfg = yaml.safe_load(fd)
    data = data.copy()
    cols = data.columns.values
    imp_strategy = cfg.get('data')['impute_strategy']
    imputer_type = cfg.get('data')['imputer_type']
    random_state = cfg.get('pipeline')['random_state']
    if imputer_type == 'Simple':
      imputer = SimpleImputer(missing_values=np.nan, strategy=imp_strategy)

    if imputer_type == 'Iterative':
      imputer = IterativeImputer(missing_values=np.nan, random_state=random_state)

    if imputer_type == 'KNN':
      imputer = KNNImputer()

    if imputer_type == 'No':
      imputer = DummyTransformer()

    else:
      imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    
    data = pd.DataFrame(imputer.fit_transform(data),
                        columns = cols)
    return data

def make_pipeline(BASE_PARAMS_DIR=BASE_PARAMS_DIR):
    with open(BASE_PARAMS_DIR,'r') as fd:
        cfg = yaml.safe_load(fd)


    scaler_type = cfg.get('pipeline')['scaler_type']
    degree = cfg.get('pipeline')['n_polynomial_features']
    clf = cfg.get('pipeline')['classifier']
    n_components = cfg.get('pipeline')['n_components']
    random_state = cfg.get('pipeline')['random_state']

    
    params = cfg.get('pipeline')['params']
    params['random_state'] = random_state

    #determining scaler
    if scaler_type == 'MinMax':
      scaler = MinMaxScaler()

    if scaler_type == 'Standard':
      scaler = StandardScaler()  

    #determining polynomial degree
    if degree <2:
      polynomial_features = DummyTransformer()

    else:
      polynomial_features = PolynomialFeatures(degree=degree) 

    #determining pca
    if n_components > 0:
      pca = PCA(n_components, random_state=random_state)

    else:
      pca = DummyTransformer()



    #determining classifier
    if clf == 0:
      clf = LogisticRegression(**params)

    elif clf == 1:
      clf = DecisionTreeClassifier(**params)

    elif clf == 2:
      clf = RandomForestClassifier(**params)

    elif clf == 3:
      clf = HistGradientBoostingClassifier(**params)

    elif clf == 4:
      clf = LGBMClassifier(**params)



    pipe = Pipeline([
        ('polynomial-features', polynomial_features),
        ('scaler', scaler),
        ('pca', pca),
        ('classifier', clf),
    ])

    return pipe


Overwriting final_project/utils.py


In [39]:
%%writefile final_project/data_transforms.py

from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

class DummyTransformer(TransformerMixin):
    def fit(self, X, y=None):
      return self

    def transform(self, X, y=None):
      return X


Overwriting final_project/data_transforms.py


In [30]:
%%writefile final_project/train.py
import pandas as pd
import yaml
import numpy as np
import joblib
import os

from settings import BASE_PARAMS_DIR, DATA_PATH, PARAMS_DIR, EXPERIMENT_DIR
from utils import preprocess_data, impute_data, make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score, adjusted_mutual_info_score, roc_auc_score, log_loss, f1_score

if __name__ == '__main__':
    with open(BASE_PARAMS_DIR,'r') as fd:
      cfg = yaml.safe_load(fd)

    if not os.path.exists(EXPERIMENT_DIR):
      os.mkdir(EXPERIMENT_DIR)

    if not os.path.exists(DATA_PATH):
      os.mkdir(DATA_PATH)

    random_state = cfg.get('pipeline')['random_state']
    test_size = cfg.get('pipeline')['test_size']

    data = preprocess_data()
    data = impute_data(data)
    pipe = make_pipeline()

    train_data, test_data = train_test_split(data, test_size=test_size)
    X_train, y_train = train_data.drop('target',axis=1), train_data.target
    X_val, y_val = test_data.drop('target',axis=1), test_data.target
    print(pipe)

    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)
    preds_proba = pipe.predict_proba(X_val)

    metrics_dict = {
    'precision_score': float(precision_score(y_pred=preds, y_true=y_val)),
    'recall_score': float(recall_score(y_pred=preds, y_true=y_val)),
    'accuracy_score': float(accuracy_score(y_pred=preds, y_true=y_val)),
    'adjusted_mutual_info_score': float(adjusted_mutual_info_score(y_val, preds)),
    'roc_auc_score': float(roc_auc_score(y_val, preds_proba[:,1])),
    'log_loss': float(log_loss(y_val, preds_proba[:,1])),
    'f1_score': float(f1_score(y_pred=preds, y_true=y_val)),
    }
    
    with open(DATA_PATH + '/report.yaml', 'w') as f:
       yaml.dump(metrics_dict, f, default_flow_style=False)
    
    joblib.dump(pipe, DATA_PATH + '/model.pkl')

    with open(PARAMS_DIR, 'w') as f:
      documents = yaml.dump(cfg, f)

    print(f'results saved at {DATA_PATH}')

    
    



Overwriting final_project/train.py


In [57]:
!python final_project/train.py

Pipeline(steps=[('polynomial-features', PolynomialFeatures(degree=3)),
                ('scaler', StandardScaler()),
                ('pca',
                 <data_transforms.DummyTransformer object at 0x7f9289e1aa00>),
                ('classifier',
                 DecisionTreeClassifier(max_depth=4, random_state=42))])
results saved at /content/final_project/data/exp14


In [58]:
!zip -r /content/final_project.zip /content/final_project

  adding: content/final_project/ (stored 0%)
  adding: content/final_project/data/ (stored 0%)
  adding: content/final_project/data/exp1/ (stored 0%)
  adding: content/final_project/data/exp1/model.pkl (deflated 52%)
  adding: content/final_project/data/exp1/report.yaml (deflated 37%)
  adding: content/final_project/data/exp1/experiment_params.yaml (deflated 32%)
  adding: content/final_project/data/exp10/ (stored 0%)
  adding: content/final_project/data/exp10/model.pkl (deflated 82%)
  adding: content/final_project/data/exp10/report.yaml (deflated 40%)
  adding: content/final_project/data/exp10/experiment_params.yaml (deflated 32%)
  adding: content/final_project/data/exp4/ (stored 0%)
  adding: content/final_project/data/exp4/model.pkl (deflated 27%)
  adding: content/final_project/data/exp4/report.yaml (deflated 40%)
  adding: content/final_project/data/exp4/experiment_params.yaml (deflated 31%)
  adding: content/final_project/data/exp3/ (stored 0%)
  adding: content/final_project/d