# Please use this notebook to load in your own data to test out the preprocess helper!

In [1]:
from preprocess_help_DEV import DataPreProcess

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
import pickle
import seaborn as sns

#### Load your data

In [2]:
# To use your own data, uncomment the next line and point it at your file.
#my_df = pd.read_csv("PUT YOUR FILENAME HERE") # LOAD YOUR DATA HERE!
# Default demo data: seaborn's 'penguins' example dataset.
my_df = sns.load_dataset('penguins')
print(my_df.shape)  # (number of rows, number of columns)
my_df.head(3)  # preview the first three rows

(344, 7)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female


#### Initialize the class

In [3]:
# Create the helper object; the cells below call its methods and read its attributes.
preprocessor = DataPreProcess() # initialize the class

#### Use the "process_columns" method to step through each column and decide what to do with it
Options are: ['skip', 'id', 'response', 'float', 'category', 'binary', 'ordinal', 'other']
* **skip**: this is for columns you don't want to use in your pipeline or modeling
* **id**: id columns will not be used in your pipeline. TODO: these are meant to be saved for use later if needed to match back to the original data.
* **response**: if chosen, you'll have the option to keep the response as a float or a string, or to one-hot encode it. TODO: do the transformation and return it in another method
* **float**: if chosen you will have the option to view what the yeo-johnson transformation looks like and then choose if you want this column transformed in the preprocessing pipeline
* **category**: this will one-hot encode the column in the pipeline. TODO: allow option to keep as string (ex when using a tree-based method that allows categories)
* **binary**: this will treat the column like a normal float column. TODO: are there any special treatments for binary variables?
* **ordinal**: at the moment it will treat it like a float. TODO: look into any special treatments for ordinal variables?
* **other**: this will ignore the column in the pipeline. TODO: let the data scientist review the 'other' columns again later

In [4]:
# Step through every column of my_df and choose how each one is treated
# (one of the options listed in the markdown cell above).
preprocessor.process_columns(my_df)

_____________________________
Pre-processing selection

100%	[[5;37;42m                                                  [0m] 7/7 columns


*** sex *** 


count      333
unique       2
top       Male
freq       168
Name: sex, dtype: object
[5;30;42m11 (3%) number of NAs[0m
sex will be used as a categorical feature

DONE :)
***********


In [None]:
# Process (or revise the decision for) one column only, here 'species'.
# The dataframe has to be passed in again.
preprocessor.process_columns(my_df, single_col='species')

In [5]:
# Inspect the mapping of column name -> transformation name recorded on the preprocessor.
preprocessor.skew_transformer

{'bill_length_mm': 'yeojohnson',
 'bill_depth_mm': 'box-cox',
 'flipper_length_mm': 'none',
 'body_mass_g': 'none'}

#### Create the pipeline. This creates the pipeline framework that matches the decisions made above

In [14]:
# Build the pipeline framework from the decisions recorded above; fitting happens in a later cell.
preprocessor.create_pipeline(my_df) # this will create the pipeline with the steps that fit your decisions above

#### Fit the pipeline with data. If no dataframe is passed, it will use the original dataframe seen above. Or you can do your test/train split and then pass your training dataframe to fit the pipeline. 

As long as the columns have the same names, there is no need to specify which columns to keep, because you've already decided what to do with each of them. You can pass the whole dataframe.

In [15]:
from sklearn.model_selection import train_test_split

# Ask the preprocessor for the response column chosen during process_columns.
y = preprocessor.return_response_values(my_df) # use the class to return the response column you specified. if not specified, just prints message

# Hold out 33% of the rows for testing; random_state is fixed so the split is repeatable.
# The whole dataframe is passed as X: column selection is left to the pipeline.
X_train, X_test, y_train, y_test = train_test_split(my_df, y, test_size=0.33, random_state=42)

In [16]:
# List of column names held in the preprocessor's float_cols attribute
# (presumably the columns marked 'float' during process_columns -- confirm in the class).
preprocessor.float_cols

['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

In [17]:
#preprocessor.fit_pipeline() # fit the pipeline. if no dataframe is passed it will fit on the original dataframe
# Fit on the training split only, so the held-out rows are not used when fitting the preprocessing steps.
preprocessor.fit_pipeline(X_train)

[Pipeline]  (step 1 of 7) Processing numeric_capping_quant, total=   0.0s
[Pipeline]  (step 2 of 7) Processing numeric_imputation, total=   0.0s
[Pipeline]  (step 3 of 7) Processing category_null_value, total=   0.0s
[Pipeline]  (step 4 of 7) Processing rare_level_grouping, total=   0.0s
[Pipeline] .. (step 5 of 7) Processing one_hot_encoding, total=   0.0s
[Pipeline]  (step 6 of 7) Processing float transformers, total=   0.0s
[Pipeline] ............ (step 7 of 7) Processing minmax, total=   0.0s




#### You can now return the pipeline to be used later

In [18]:
# Get the pipeline object back from the preprocessor so it can be used on its own later.
my_pipe = preprocessor.return_pipeline()
my_pipe  # display the pipeline

#### You can also use a fitted pipeline to transform new data.

Again, you don't have to clean up the dataframe: you've already decided which columns you want to keep, and as long as all of those columns are present in your new dataframe, it will transform the data.

In [19]:
# Run the held-out rows through the preprocessor; X_test is passed as-is, unused columns included.
transform_test_df = preprocessor.transform_new_data(X_test)
transform_test_df  # display the transformed frame

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_Female,sex_Male,sex_rare
194,0.992518,0.910420,0.314286,0.144550,0.0,1.0,0.0,0.0,1.0,0.0
157,0.602892,0.635488,0.371429,0.334123,0.0,1.0,0.0,1.0,0.0,0.0
225,0.692099,0.000000,0.714286,0.618483,1.0,0.0,0.0,1.0,0.0,0.0
208,0.602892,0.403386,0.171429,0.002370,0.0,1.0,0.0,1.0,0.0,0.0
318,0.822105,0.030579,0.514286,0.654028,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
126,0.160449,0.595370,0.171429,0.014218,0.0,0.0,1.0,1.0,0.0,0.0
282,0.637228,0.000000,0.828571,0.547393,1.0,0.0,0.0,1.0,0.0,0.0
59,0.076838,0.910420,0.257143,0.239336,1.0,0.0,0.0,0.0,1.0,0.0
111,0.630363,1.000000,0.171429,0.642180,1.0,0.0,0.0,0.0,1.0,0.0


In [12]:
# Count how often each response value occurs.
# Needs `y` from the train/test split cell, so that cell must be run first.
pd.Series(y).value_counts()

Adelie       152
Gentoo       124
Chinstrap     68
dtype: int64

#### Finally, you can save the fitted preprocessor in a pickle file to be loaded later

In [13]:
# to save fitted class as pickle
# ('wb' = write in binary mode, which pickle requires; an existing file of this name is overwritten)
with open('my_preprocesser.pkl', 'wb') as file:
    pickle.dump(preprocessor, file)
    
# to load pickled preprocessor later
#with open('my_preprocesser.pkl', 'rb') as file:
#    preprocessor = pickle.load(file)

In [None]:
# to load pickled preprocessor later
# NOTE: unpickling can execute arbitrary code -- only load pickle files you created or otherwise trust.
# The DataPreProcess class must be importable in the session that loads the file.
with open('my_preprocesser.pkl', 'rb') as file:
    preprocessor = pickle.load(file)

In [None]:
# dev work
# Column list that is handed to select_caps in the cell below.
preprocessor.float_cols

In [None]:
from IPython.display import clear_output

def _prompt_float(prompt):
    """Ask the user for a number, asking again until the reply parses as a float."""
    while True:
        reply = input(prompt)
        try:
            return float(reply)
        except ValueError:
            # A typo must not abort the session and lose the decisions made so far.
            print("'{}' is not a number, please try again.".format(reply))


def select_caps(df, list_cols):
    """Interactively choose a lower and an upper cap for each numeric column.

    For every column a histogram is shown with the 5th, 10th, 90th and 95th
    percentiles marked, then the user types a lower and an upper cap.
    The pair is interpreted as follows:

    * if either number lies outside [0, 1] the pair is stored as raw values;
    * otherwise, if the column's own data lies within [0, 1] the meaning is
      ambiguous, so the user is asked whether the pair is 'quantiles' or 'values';
    * otherwise the pair is stored as quantiles.

    Columns without any non-null value are skipped (there is nothing to plot
    or cap) and reported once all other columns have been handled.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the columns to inspect.
    list_cols : iterable of str
        Names of the columns to step through.

    Returns
    -------
    tuple
        ``(quant_decision_dict, val_decision_dict)``. Each maps a column name
        to ``[lower, upper]``; a dict that would be empty is returned as
        ``None`` instead.

    Raises
    ------
    AssertionError
        If the lower cap is not strictly smaller than the upper cap.
    """
    # (percentile, line colour, line style) for the reference lines on the histogram
    percentile_markers = [(5, 'b', '--'), (10, 'r', '-'), (90, 'purple', '-.'), (95, 'orange', ':')]

    quant_decision_dict = {}
    val_decision_dict = {}
    skipped_cols = []

    for c in list_cols:
        temp_vals = df[c].dropna()
        if len(temp_vals) == 0:
            # All values are missing: min/max/percentiles are undefined.
            skipped_cols.append(c)
            continue

        clear_output(wait=True)
        min_val = np.min(temp_vals)
        max_val = np.max(temp_vals)

        plt.hist(temp_vals, bins=40)
        for pct, colour, line_style in percentile_markers:
            pct_val = np.percentile(temp_vals, pct)
            plt.axvline(x=pct_val, color=colour, ls=line_style,
                        label='{}th percentile: {}'.format(pct, pct_val))
        plt.xticks(rotation=45, ha="right")
        plt.legend(bbox_to_anchor=(1.0, 1), loc='upper left')
        plt.title(c)
        plt.show()

        cap_decision_lower = _prompt_float("Input lower cap (percent or value): ")
        cap_decision_upper = _prompt_float("Input upper cap (percent or value): ")
        # Raised explicitly (rather than via `assert`) so the check also runs under `python -O`.
        if not cap_decision_lower < cap_decision_upper:
            raise AssertionError("upper bound can't be smaller than lower bound")

        caps = [cap_decision_lower, cap_decision_upper]
        caps_within_unit_interval = all(0 <= cap <= 1 for cap in caps)

        if not caps_within_unit_interval:
            # A number outside [0, 1] cannot be a quantile, so the pair must be raw values.
            val_decision_dict[c] = caps
        elif 0 <= min_val < 1 and max_val <= 1:
            # The data itself lives in [0, 1]: the caps could be either, so ask.
            val_or_quant = 'nothing yet'
            while val_or_quant not in ['quantiles', 'values']:
                val_or_quant = input("Are cap decisions 'quantiles' or 'values'?")
            if val_or_quant == 'quantiles':
                quant_decision_dict[c] = caps
            else:
                val_decision_dict[c] = caps
        else:
            quant_decision_dict[c] = caps

    if skipped_cols:
        print("Skipped columns with no non-null values: {}".format(skipped_cols))

    # Empty dicts are reported as None, as before.
    return quant_decision_dict or None, val_decision_dict or None

# Interactive: shows a histogram per float column and asks for a lower and an upper cap.
# The cell displays the returned pair (quantile caps, value caps).
select_caps(my_df, preprocessor.float_cols)
