# A. Programming with Python
Some examples to get you started with programming concepts and Python in particular.

In [None]:
print("hello wooorldddddd")

## Libraries

In [None]:
from numpy.random import rand # function straight from submodule
rand(5) # five random numbers between 0 and 1

In [None]:
import numpy.random as npr # load submodule
npr.rand(5)

In [None]:
import numpy as np # load main NumPy module
np.random.rand(5)

## Variables & data structures

In [None]:
session = 2
type(session)

In [None]:
session += 1
print(session)

### Lists

In [None]:
list_of_strategies = ["up-sell", "cross-sell", "down-sell"]

In [None]:
len(list_of_strategies)

In [None]:
list_of_strategies[0] # access through indexing (first element is at index 0)

In [None]:
list_of_strategies.append("stay put")

In [None]:
list_of_strategies

### Dictionaries

In [None]:
dict_of_participants = { # collection of key-value pairs
    "Jan": ("XYZ", 42),
    "Sam": ("ABC", 28),
    "Daphne": ("MNO", 35)
}

In [None]:
dict_of_participants["Sam"] # you can only access the data via a key

## For loops & conditional statements

In [None]:
for strategy in list_of_strategies:
    print("Possible strategy:", strategy)

In [None]:
# walk over all participants and reveal a different detail depending on the name
for participant, value in dict_of_participants.items():
    print("name:", participant)
    if participant == "Daphne":
        age = value[1]  # element at index 1 of the stored tuple
        print("   Age:", age)
    elif participant == "Jan":
        company = value[0]  # element at index 0 of the stored tuple
        print("   Company:", company)
    else:
        print("   We do not want to know your info.")

## Functions & parameters

In [None]:
def get_age(participants, who):
    """
    Look up a participant's age and print whether they count as young or old.

    Parameters
    ----------
    participants : dict
        Dictionary of participants; each value must hold the age at index 1
    who : str
        Name of the participant to get the age from
        Must be a valid key of 'participants'

    Returns
    -------
    Age of the selected participant
    """
    age = participants[who][1]

    # anyone below 20 is labelled young, everybody else old
    print("you are young" if age < 20 else "you are old")

    return age

In [None]:
get_age(dict_of_participants, "Sam")

## Rectangular data & filtering

In [None]:
import pandas as pd

In [None]:
# column-oriented input: every key becomes a column, every list its values
data = dict(
    Artist=["Billy Holiday", "Jimi Hendrix", "Miles Davis", "SIA"],
    Genre=["Jazz", "Rock", "Jazz", "Pop"],
    Listeners=[1_300_000, 2_700_000, 1_500_000, 2_000_000],
    Plays=[27_000_000, 70_000_000, 48_000_000, 74_000_000],
)

df = pd.DataFrame(data)

df

In [None]:
# numeric_only=True limits the mean to the numeric columns; without it, pandas >= 2.0
# raises a TypeError because "Artist" and "Genre" contain text
df.mean(numeric_only=True) # other available functions: https://pandas.pydata.org/docs/reference/frame.html

In [None]:
df.max()

In [None]:
# new column: number of plays per listener
df["avg_plays"] = df["Plays"] / df["Listeners"]

In [None]:
df

In [None]:
df["avg_plays"].plot()

In [None]:
df.set_index("Artist")["Plays"].plot(ylabel="Total plays")

In [None]:
# alternative way to create your df
data2 = [["Billy Holiday", "Jazz", 1300000, 27000000],
         ["Jimi Hendrix", "Rock", 2700000, 70000000],
         ["Miles Davis", "Jazz", 1500000, 48000000],
         ["SIA", "Pop", 2000000, 74000000]]

df2 = pd.DataFrame(data2, columns = ["Artist", "Genre", "Listeners", "Plays"])

df2

In [None]:
df_jazz = df[df["Genre"] == "Jazz"]

df_jazz

In [None]:
df_popular = df[df["Listeners"] >= 2000000]

df_popular

## Object-oriented programming (OOP)

In [None]:
class Rectangle:
    """A rectangle described by its two side lengths and a cost per unit of area."""

    def __init__(self, length, breadth, unit_cost=0):
        """Store both side lengths and the (optional) cost per unit of area."""
        self.length = length
        self.breadth = breadth
        self.unit_cost = unit_cost

    def get_perimeter(self):
        """Return the perimeter: twice the sum of both sides."""
        sum_of_sides = self.length + self.breadth
        return 2 * sum_of_sides

    def get_area(self):
        """Return the area: length times breadth."""
        return self.length * self.breadth

    def calculate_cost(self):
        """Return the total cost: area times the cost per unit of area."""
        return self.get_area() * self.unit_cost

In [None]:
r = Rectangle(160, 120, unit_cost=2000)

# compute both figures first, then report them
area = r.get_area()
cost = r.calculate_cost()

print("Area of rectangle: %s cm^2" % area)
print("Cost of rectangular field: EUR%s " % cost)

## A few exercises you can have a go at if you feel confident...

1. Write a function that takes two lists and outputs them as two named columns of a DataFrame
2. Compute at least two other summary statistics from the df variable (Google is your friend)
3. Create a class `Customer` with a few attributes set in `__init__` and a few methods (no need to implement the methods, just write the keyword `pass` as the body of each one)

# B. Predictive modelling with Cobra

Cobra is a Python package for rapid development of predictive models. Cobra focuses on interpretability and its methodology is based on Python Predictions' long experience with statistical modelling.

How to install Cobra?

  * install the package with `pip install -U pythonpredictions-cobra` and you are good to go!

In [None]:
# settings --> switch internet option on (requires SMS verification)
!pip install -U pythonpredictions-cobra

In [None]:
import pandas as pd
import json
import warnings
from pathlib import Path

# preprocessing
from cobra.preprocessing import PreProcessor

# feature preselection
from cobra.model_building import univariate_selection
from cobra.evaluation import plot_univariate_predictor_quality
from cobra.evaluation import plot_correlation_matrix

# modelling
from cobra.model_building import ForwardFeatureSelection
from cobra.evaluation import plot_performance_curves
from cobra.evaluation import plot_variable_importance

# evaluation & PIGs
from cobra.evaluation import Evaluator
from cobra.evaluation import generate_pig_tables
from cobra.evaluation import plot_incidence

# Pandas settings
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# suppress warnings
warnings.filterwarnings('ignore')

# PROJECT DEFINITION

Predict whether income exceeds $50k/year based on U.S. census data.

Dataset which will be used:

  * Survey of adults and their earnings
  * Target variable: 
    * 1 = income > 50k USD
    * 0 = income <= 50k USD
  * Source: https://archive.ics.uci.edu/ml/datasets/Adult

# DATA PREPARATION

## Load data

In [None]:
ROOT = Path.cwd()
ROOT

In [None]:
pth_to_data = '../input/earnings-dataset/earnings_dataset.csv'
df = pd.read_csv(pth_to_data, sep=';')

df.head(n=5)

In [None]:
len(df) # number of rows (= observations)

## Preprocessing

The first part focuses on preparing the predictors into an **analytical basetable (ABT)** for modelling by:

  * Splitting the dataset into training, selection and validation datasets.
  * Binning continuous variables into discrete intervals.
  * Replacing missing values of both categorical and continuous variables (which are now binned) with an additional "Missing" bin/category.
  * Regrouping categories into a new category "Other".
  * Replacing bins/categories with their corresponding incidence rate per category/bin.


### General structure

##### Create instance of PreProcessor object
`preprocessor = PreProcessor.from_params(parameters)`
        
##### Split data into train-selection-validation sets
`basetable = preprocessor.train_selection_validation_split(data)`
                
##### Fit the pipeline
`preprocessor.fit(basetable)`

##### Transform the data
`basetable = preprocessor.transform(basetable)`                  

In [None]:
# create instance of PreProcessor object from parameters
preprocessor = PreProcessor.from_params(
        n_bins=10,
        strategy='quantile',
        serialization_path=ROOT/'pipeline.json')

In [None]:
# split data into train-selection-validation sets
basetable = preprocessor.train_selection_validation_split(
                data=df,
                target_column_name='TARGET',
                train_prop=0.8,
                selection_prop=0.1,
                validation_prop=0.1)

basetable.head(n=5)

In [None]:
# the preprocessor needs to know which columns are continuous and which are discrete
continuous_vars = [
    'age',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# NOTE(review): 'fnlwgt' looks like a numeric column -- confirm it is meant to be treated as discrete
discrete_vars = [
    'workclass',
    'fnlwgt',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

target_column_name = 'TARGET'

In [None]:
# fit the pipeline
preprocessor.fit(basetable[basetable['split']=='train'],
                 continuous_vars=continuous_vars,
                 discrete_vars=discrete_vars,
                 target_column_name=target_column_name)

# transform the data (e.g. perform discretisation, incidence replacement, ...)
basetable = preprocessor.transform(basetable,
                                   continuous_vars=continuous_vars,
                                   discrete_vars=discrete_vars)                        

In [None]:
basetable.head(n=5)

# MODEL BUILDING

## Feature preselection
Once we have the data prepared, we need to select the right variables. Thus, we perform a univariate preselection to rule out any predictor with little to no predictive power.

This preselection is based on an AUC threshold of a univariate model on the train and selection datasets.

We keep all variables whose univariate AUC exceeds `preselect_auc_threshold` (0.55 here) and, to avoid overfitting, we drop all variables where _(auc_train - auc_selection) >= 0.05_ (the `preselect_overtrain_threshold`).



### General structure
  
##### Run univariate preselection procedure and plot output
`df_auc = univariate_selection.compute_univariate_preselection(basetable, thresholds)`

`plot_univariate_predictor_quality(df_auc)`

##### Get a list of predictors selected by the univariate selection
`preselected_predictors = univariate_selection.get_preselected_predictors(df_auc)`   

##### Compute and plot correlations between preprocessed predictors
`df_corr = univariate_selection.compute_correlations(basetable)`

`plot_correlation_matrix(df_corr)`

In [None]:
# every column whose name contains '_enc' is a candidate predictor
preprocessed_predictors = [col for col in basetable.columns.tolist() if '_enc' in col]

# rows belonging to the train and selection splits
train_data = basetable[basetable['split'] == 'train']
selection_data = basetable[basetable['split'] == 'selection']

df_auc = univariate_selection.compute_univariate_preselection(
    target_enc_train_data=train_data,
    target_enc_selection_data=selection_data,
    predictors=preprocessed_predictors,
    target_column=target_column_name,
    preselect_auc_threshold=0.55,
    preselect_overtrain_threshold=0.05,
)

# list of predictors that passed the univariate selection
preselected_predictors = univariate_selection.get_preselected_predictors(df_auc)

In [None]:
# univariate feature importance
plot_univariate_predictor_quality(df_auc)

In [None]:
# compute correlations between preprocessed predictors
df_corr = (univariate_selection
           .compute_correlations(basetable[basetable['split']=='train'],
                                 preprocessed_predictors))

plot_correlation_matrix(df_corr)

## Forward feature selection
After having preselected the features, we can start modelling using forward feature selection.

Since we use target encoding on all our predictors, we will only consider models with positive coefficients (no sign flip should occur) as this makes the model more interpretable.


### General structure
  
##### Initialize forward feature selection procedure
`forward_selection = ForwardFeatureSelection(parameters)`

`forward_selection.fit(basetable)`

##### Run forward feature selection and plot performance curves
`performances = forward_selection.compute_model_performances(basetable, target_column_name)`

`plot_performance_curves(performances)`

##### Select and extract model of choice
`model = forward_selection.get_model_from_step()`

`final_predictors = model.predictors`

##### Compute and plot the importance of each predictor in the model
`variable_importance = model.compute_variable_importance(basetable)`
  
`plot_variable_importance(variable_importance)`

In [None]:
forward_selection = ForwardFeatureSelection(max_predictors=30, pos_only=True)

forward_selection.fit(basetable[basetable['split']=='train'],
                      target_column_name,
                      preselected_predictors)

performances = forward_selection.compute_model_performances(basetable, target_column_name)

In [None]:
# plot performance curves
plot_performance_curves(performances)

In [None]:
# after plotting the performances we select our model of choice (watch out: 0-based indexing)
model = forward_selection.get_model_from_step(4)

# we have chosen model with 5 variables, which we extract as follows
final_predictors = model.predictors

In [None]:
# we can also compute and plot the importance of each predictor in the model
variable_importance = model.compute_variable_importance(basetable[basetable['split']=='selection'])

# this is the correlation of the model score and each predictor    
plot_variable_importance(variable_importance)

# MODEL VALIDATION

## Evaluation

The next step after modelling is evaluating how well our model is performing.

### General structure
  
##### Instantiate Evaluator object
`evaluator = Evaluator()`

##### Automatically find the best cut-off probability
`evaluator.fit()`

##### Get and plot various scalar metrics
`evaluator.scalar_metrics`

`evaluator.plot_confusion_matrix()`

`evaluator.plot_roc_curve()`

`...`

In [None]:
# restrict the basetable to the validation split once and reuse it
validation_data = basetable[basetable['split'] == 'validation']

# true target labels and the scores predicted by the model
y_true = validation_data[target_column_name].values
y_pred = model.score_model(validation_data)

evaluator = Evaluator()

# automatically find the best cut-off probability
evaluator.fit(y_true, y_pred)

In [None]:
# get various scalar metrics such as accuracy, AUC, precision, recall, ...
evaluator.scalar_metrics

In [None]:
evaluator.plot_confusion_matrix()

In [None]:
evaluator.plot_roc_curve()

In [None]:
evaluator.plot_cumulative_gains()

In [None]:
evaluator.plot_lift_curve()

In [None]:
# evaluator.plot_cumulative_response_curve()

# MODEL USAGE

## PIG tables
Predictor Insight Graphs, or PIGs, are plots which help us profile how each variable behaves in the model.

### General structure

##### Generate PIG tables
`pig_tables = generate_pig_tables(basetable)`

##### Plot PIG tables
`plot_incidence(pig_tables)`

In [None]:
# columns whose name ends in '_bin' or '_processed' are used for the PIG tables
predictor_list = [col for col in basetable.columns
                  if col.endswith(('_bin', '_processed'))]

selection_data = basetable[basetable['split'] == 'selection']

pig_tables = generate_pig_tables(
    selection_data,
    id_column_name='ID',
    target_column_name=target_column_name,
    preprocessed_predictors=predictor_list,
)

In [None]:
column_age_order = ['17.0 - 22.0', '22.0 - 26.0', '26.0 - 30.0', '30.0 - 33.0',
                    '33.0 - 37.0', '37.0 - 41.0', '41.0 - 45.0', '45.0 - 50.0',
                    '50.0 - 58.0', '58.0 - 90.0']              

plot_incidence(pig_tables, 'age', column_age_order)

In [None]:
pig_tables["variable"].unique()

In [None]:
column_hpw_order = ['1.0 - 24.0', '24.0 - 35.0', '35.0 - 40.0', 
                    '40.0 - 49.0', '49.0 - 55.0', '55.0 - 99.0']  

plot_incidence(pig_tables, 'hours-per-week', column_hpw_order)

In [None]:
plot_incidence(pig_tables, 'education')

In [None]:
plot_incidence(pig_tables, 'relationship')

In [None]:
pig_tables.head(n=10)

## Industrialization
Once we are happy with our model, we can industrialize it. All the preprocessing steps are stored in the serialized pipeline, which is a JSON file. The model comes from scikit-learn, which can be easily serialized (= saved) and exported.

In [None]:
# read the serialized preprocessing pipeline back from disk and parse it
pipeline = json.loads((ROOT / 'pipeline.json').read_text())

print(pipeline.keys())

In [None]:
pipeline['target_encoder']['_mapping']['age_bin']