In [129]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft

# Feature Tools

In [132]:
data = ft.demo.load_mock_customer()

In [133]:
data

{'customers':    customer_id zip_code           join_date date_of_birth
 0            1    60091 2011-04-17 10:48:33    1994-07-18
 1            2    13244 2012-04-15 23:31:04    1986-08-18
 2            3    13244 2011-08-13 15:42:34    2003-11-21
 3            4    60091 2011-04-08 20:08:14    2006-08-15
 4            5    60091 2010-07-17 05:27:50    1984-07-28,
 'sessions':     session_id  customer_id   device       session_start
 0            1            2  desktop 2014-01-01 00:00:00
 1            2            5   mobile 2014-01-01 00:17:20
 2            3            4   mobile 2014-01-01 00:28:10
 3            4            1   mobile 2014-01-01 00:44:25
 4            5            4   mobile 2014-01-01 01:11:30
 5            6            1   tablet 2014-01-01 01:23:25
 6            7            3   tablet 2014-01-01 01:39:40
 7            8            4   tablet 2014-01-01 01:55:55
 8            9            1  desktop 2014-01-01 02:15:25
 9           10            2   tablet 20

In [134]:
customers_df = data["customers"]

In [135]:
customers_df

Unnamed: 0,customer_id,zip_code,join_date,date_of_birth
0,1,60091,2011-04-17 10:48:33,1994-07-18
1,2,13244,2012-04-15 23:31:04,1986-08-18
2,3,13244,2011-08-13 15:42:34,2003-11-21
3,4,60091,2011-04-08 20:08:14,2006-08-15
4,5,60091,2010-07-17 05:27:50,1984-07-28


In [136]:
sessions_df = data["sessions"]

In [138]:
sessions_df.sample(5)

Unnamed: 0,session_id,customer_id,device,session_start
13,14,1,tablet,2014-01-01 03:28:00
6,7,3,tablet,2014-01-01 01:39:40
1,2,5,mobile,2014-01-01 00:17:20
28,29,1,mobile,2014-01-01 07:10:05
24,25,3,desktop,2014-01-01 05:59:40


In [139]:
transactions_df = data["transactions"]

In [140]:
transactions_df.sample(3)

Unnamed: 0,transaction_id,session_id,transaction_time,product_id,amount
74,232,5,2014-01-01 01:20:10,1,139.2
231,27,17,2014-01-01 04:10:15,2,90.79
434,36,31,2014-01-01 07:50:10,3,62.35


In [141]:
entities = {
    "customers" : (customers_df, "customer_id"),
    "sessions" : (sessions_df, "session_id", "session_start"),
    "transactions" : (transactions_df, "transaction_id", "transaction_time")
}

In [None]:
# List of relationships between entities formatted:
# (parent_entity, parent_variable, child_entity, child_variable)

In [142]:
relationships = [
    ("sessions", "session_id", "transactions", "session_id"),
    ("customers", "customer_id", "sessions", "customer_id")
]

In [143]:
# the bare minimum for dfs is a list of entities, relationships and target entity

In [144]:
feature_matrix_customers, features_defs = ft.dfs(entities=entities,
                                                 relationships=relationships,
                                                 target_entity="customers")

In [145]:
feature_matrix_customers

Unnamed: 0_level_0,zip_code,COUNT(sessions),MODE(sessions.device),NUM_UNIQUE(sessions.device),COUNT(transactions),MAX(transactions.amount),MEAN(transactions.amount),MIN(transactions.amount),MODE(transactions.product_id),NUM_UNIQUE(transactions.product_id),SKEW(transactions.amount),STD(transactions.amount),SUM(transactions.amount),DAY(date_of_birth),DAY(join_date),MONTH(date_of_birth),MONTH(join_date),WEEKDAY(date_of_birth),WEEKDAY(join_date),YEAR(date_of_birth),YEAR(join_date),MAX(sessions.COUNT(transactions)),MAX(sessions.MEAN(transactions.amount)),MAX(sessions.MIN(transactions.amount)),MAX(sessions.NUM_UNIQUE(transactions.product_id)),MAX(sessions.SKEW(transactions.amount)),MAX(sessions.STD(transactions.amount)),MAX(sessions.SUM(transactions.amount)),MEAN(sessions.COUNT(transactions)),MEAN(sessions.MAX(transactions.amount)),MEAN(sessions.MEAN(transactions.amount)),MEAN(sessions.MIN(transactions.amount)),MEAN(sessions.NUM_UNIQUE(transactions.product_id)),MEAN(sessions.SKEW(transactions.amount)),MEAN(sessions.STD(transactions.amount)),MEAN(sessions.SUM(transactions.amount)),MIN(sessions.COUNT(transactions)),MIN(sessions.MAX(transactions.amount)),MIN(sessions.MEAN(transactions.amount)),MIN(sessions.NUM_UNIQUE(transactions.product_id)),MIN(sessions.SKEW(transactions.amount)),MIN(sessions.STD(transactions.amount)),MIN(sessions.SUM(transactions.amount)),MODE(sessions.DAY(session_start)),MODE(sessions.MODE(transactions.product_id)),MODE(sessions.MONTH(session_start)),MODE(sessions.WEEKDAY(session_start)),MODE(sessions.YEAR(session_start)),NUM_UNIQUE(sessions.DAY(session_start)),NUM_UNIQUE(sessions.MODE(transactions.product_id)),NUM_UNIQUE(sessions.MONTH(session_start)),NUM_UNIQUE(sessions.WEEKDAY(session_start)),NUM_UNIQUE(sessions.YEAR(session_start)),SKEW(sessions.COUNT(transactions)),SKEW(sessions.MAX(transactions.amount)),SKEW(sessions.MEAN(transactions.amount)),SKEW(sessions.MIN(transactions.amount)),SKEW(sessions.NUM_UNIQUE(transactions.product_id)),SKEW(sessions.STD(transactions.amount)),SKEW(sessions.SUM(transactions.amount)),STD(sessions.COUNT(transactions)),STD(sessions.MAX(transactions.amount)),STD(sessions.MEAN(transactions.amount)),STD(sessions.MIN(transactions.amount)),STD(sessions.NUM_UNIQUE(transactions.product_id)),STD(sessions.SKEW(transactions.amount)),STD(sessions.SUM(transactions.amount)),SUM(sessions.MAX(transactions.amount)),SUM(sessions.MEAN(transactions.amount)),SUM(sessions.MIN(transactions.amount)),SUM(sessions.NUM_UNIQUE(transactions.product_id)),SUM(sessions.SKEW(transactions.amount)),SUM(sessions.STD(transactions.amount)),MODE(transactions.sessions.customer_id),MODE(transactions.sessions.device),NUM_UNIQUE(transactions.sessions.customer_id),NUM_UNIQUE(transactions.sessions.device)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1
1,60091,8,mobile,3,126,139.43,71.631905,5.81,4,5,0.019698,40.442059,9025.62,18,17,7,4,0,6,1994,2011,25,88.755625,26.36,5,0.640252,46.905665,1613.93,15.75,132.24625,72.77414,9.82375,5.0,-0.059515,39.093244,1128.2025,12,118.9,50.623125,5,-1.038434,30.450261,809.97,1,4,1,2,2014,1,4,1,1,1,1.946018,-0.780493,-0.424949,2.440005,0.0,-0.312355,0.77817,4.062019,7.322191,13.759314,6.954507,0.0,0.589386,279.510713,1057.97,582.193117,78.59,40,-0.476122,312.745952,1,mobile,1,3
2,13244,7,desktop,3,93,146.81,77.422366,8.73,4,5,0.098259,37.705178,7200.28,18,15,8,4,0,6,1986,2012,18,96.581,56.46,5,0.755711,47.93592,1320.64,13.285714,133.09,78.415122,22.085714,5.0,-0.039663,36.957218,1028.611429,8,100.04,61.91,5,-0.763603,27.839228,634.84,1,3,1,2,2014,1,4,1,1,1,-0.303276,-1.539467,0.235296,2.154929,0.0,0.013087,-0.440929,3.450328,17.221593,11.477071,15.874374,0.0,0.509798,251.609234,931.63,548.905851,154.6,35,-0.27764,258.700528,2,desktop,1,3
3,13244,6,desktop,3,93,149.15,67.06043,5.89,1,5,0.41823,43.683296,6236.62,21,13,11,8,4,5,2003,2011,18,82.109444,20.06,5,0.854976,50.11012,1477.97,15.5,141.271667,67.539577,11.035,4.833333,0.381014,42.883316,1039.436667,11,126.74,55.579412,4,-0.289466,35.70468,889.21,1,1,1,2,2014,1,4,1,1,1,-1.507217,-0.941078,0.678544,1.000771,-2.44949,-0.245703,2.246479,2.428992,10.724241,11.174282,5.424407,0.408248,0.429374,219.02142,847.63,405.237462,66.21,29,2.286086,257.299895,3,desktop,1,3
4,60091,8,mobile,3,109,149.95,80.070459,5.73,2,5,-0.036348,45.068765,8727.68,15,8,8,4,1,4,2006,2011,18,110.45,54.83,5,0.382868,54.293903,1351.46,13.625,144.74875,81.207189,16.43875,4.625,0.000346,44.515729,1090.96,10,139.2,70.638182,4,-0.711744,29.026424,771.68,1,1,1,2,2014,1,5,1,1,1,0.282488,0.027256,1.980948,2.10351,-0.644061,-1.065663,-0.391805,3.335416,3.514421,13.027258,16.960575,0.517549,0.387884,235.992478,1157.99,649.657515,131.51,37,0.002764,356.125829,4,mobile,1,3
5,60091,6,mobile,3,79,149.02,80.375443,7.55,5,5,-0.025941,44.09563,6349.66,28,17,7,7,5,5,1984,2010,18,94.481667,20.65,5,0.602209,51.14925,1700.67,13.166667,139.96,78.705187,14.415,5.0,0.002397,43.312326,1058.276667,8,128.51,66.666667,5,-0.53906,36.734681,543.18,1,3,1,2,2014,1,5,1,1,1,-0.317685,-0.333796,0.335175,-0.47041,0.0,0.204548,0.472342,3.600926,7.928001,11.007471,4.961414,0.0,0.415426,402.775486,839.76,472.231119,86.49,30,0.014384,259.873954,5,mobile,1,3


In [3]:
tuition_fees_dict = {
    "GASB" : [np.nan, 10000, 20000, np.nan, 10500, np.nan, np.nan],
    "FASB": [np.nan, np.nan, np.nan, 20750, np.nan, np.nan, 10000],
    "Public": [25000, np.nan, np.nan, np.nan, np.nan, np.nan, 10500],
}

In [4]:
df = pd.DataFrame(tuition_fees_dict)

In [5]:
df.reset_index(inplace=True)

In [6]:
df

Unnamed: 0,index,GASB,FASB,Public
0,0,,,25000.0
1,1,10000.0,,
2,2,20000.0,,
3,3,,20750.0,
4,4,10500.0,,
5,5,,,
6,6,,10000.0,10500.0


In [7]:

#test for overlaping numbers

#replace all numbers with 1

#sum across rows

In [8]:
#drop Na values
df_1 = df[["index","GASB"]].dropna()
df_2 = df[["index", "FASB"]].dropna()
df_3 = df[["index", "Public"]].dropna()

df_1.columns = ["index", "tuition_fees"]
df_2.columns = ["index", "tuition_fees"]
df_3.columns = ["index", "tuition_fees"]

#concatenate all dataframes
df_total = pd.concat([df_1,df_2,df_3])
#remove duplicate values
df_total.drop_duplicates(subset="index", inplace=True)
#df_total = df_total.merge(df_2, on="index", how="outer")


In [9]:
percentages = { 
    "Total_Revenue": [20000,50000, 25000, 75000, 15000],
    "Tuition_Fees": [.10, .30, .50, .40, .10],
    "Other_Revenue": [0.9, 0.7, 0.5, 0.6, 0.9]
}

In [10]:
percent_df = pd.DataFrame(percentages)

In [11]:
percent_df

Unnamed: 0,Total_Revenue,Tuition_Fees,Other_Revenue
0,20000,0.1,0.9
1,50000,0.3,0.7
2,25000,0.5,0.5
3,75000,0.4,0.6
4,15000,0.1,0.9


In [12]:
percent_df["Total_Revenue"] * percent_df["Tuition_Fees"]

0     2000.0
1    15000.0
2    12500.0
3    30000.0
4     1500.0
dtype: float64

### Comparing Distributions

In [19]:
np.random.uniform(low=0.0, high=1.0, size=100).std()

0.28731475526324307

In [20]:
from scipy.stats import ks_2samp

In [42]:
x = np.random.normal(0, 1, 1000)
y = np.random.normal(0, 1, 1000)
z = np.random.normal(1.1, 0.9, 1000)

In [43]:
ks_2samp(x, y, alternative='two-sided')

KstestResult(statistic=0.0026599999999999957, pvalue=0.8701361499165012)

In [35]:
ks_2samp(y, z, alternative='two-sided')

KstestResult(statistic=0.434, pvalue=0.0)

In [85]:
uniform = [100/6, 100/6, 100/6, 100/6, 100/6, 100/6]
data1 = np.array([0.01, 0, 0, 0, 0, 0.99])
data2 = np.array([0.1, 0.2, 0.3, 0.1, 0.2, 0.1])

In [86]:
data1.std()

0.36822396566341103

In [87]:
data2.std()

0.07453559924999299

In [88]:
ks_2samp(uniform, data1, mode='exact').pvalue

0.0021645021645021645

In [89]:
ks_2samp(uniform, data2, mode='exact')

KstestResult(statistic=1.0, pvalue=0.0021645021645021645)

### IPython Practice

In [39]:
import ipywidgets as widgets
from IPython.display import YouTubeVideo, display
import logging

### debuggin tip: use out.capture

In [37]:
debug_view = widgets.Output(layout={'border': '1px solid black'})

@debug_view.capture(clear_output=True)
def bad_callback(event):
    print('This is about to explode')
    return 1.0 / 0.0

button = widgets.Button(
    description='click me to raise an exception',
    layout={'width': '300px'}
)
button.on_click(bad_callback)
button

Button(description='click me to raise an exception', layout=Layout(width='300px'), style=ButtonStyle())

In [38]:
debug_view

Output(layout=Layout(border='1px solid black'))

In [40]:
class OutputWidgetHandler(logging.Handler):
    """ Custom logging handler sending logs to an output widget """

    def __init__(self, *args, **kwargs):
        super(OutputWidgetHandler, self).__init__(*args, **kwargs)
        layout = {
            'width': '100%',
            'height': '160px',
            'border': '1px solid black'
        }
        self.out = widgets.Output(layout=layout)

    def emit(self, record):
        """ Overload of logging.Handler method """
        formatted_record = self.format(record)
        new_output = {
            'name': 'stdout',
            'output_type': 'stream',
            'text': formatted_record+'\n'
        }
        self.out.outputs = (new_output, ) + self.out.outputs

    def show_logs(self):
        """ Show the logs """
        display(self.out)

    def clear_logs(self):
        """ Clear the current logs """
        self.out.clear_output()


In [41]:
logger = logging.getLogger(__name__)
handler = OutputWidgetHandler()
handler.setFormatter(logging.Formatter('%(asctime)s  - [%(levelname)s] %(message)s'))
logger.addHandler(handler)
logger.setLevel(logging.INFO)

In [44]:
handler.show_logs()

Output(layout=Layout(border='1px solid black', height='160px', width='100%'), outputs=({'name': 'stdout', 'out…

In [45]:
handler.clear_logs()
logger.info('Starting program')

try:
    logger.info('About to try something dangerous...')
    1.0/0.0
except Exception as e:
    logger.exception('An error occurred!')

In [46]:
handler.clear_logs()

### Buttons

In [108]:
N = 0

In [109]:
button = widgets.Button(description="MyButton")

In [110]:
output = widgets.Output()

In [111]:
@output.capture(clear_output=True)
def button_clicked(text):
    global N
    N += 1
    with output:
        print(f"My Button Clicked {N} timmes")

In [112]:
button.on_click(button_clicked)

In [116]:
VBox_output = widgets.VBox([button, output])

In [117]:
display(VBox_output)

VBox(children=(Button(description='MyButton', style=ButtonStyle()), Output(outputs=({'output_type': 'stream', …

In [83]:
output.clear_output(wait=True)