In [129]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft

# Feature Tools

In [132]:
data = ft.demo.load_mock_customer()

In [133]:
data

{'customers':    customer_id zip_code           join_date date_of_birth
 0            1    60091 2011-04-17 10:48:33    1994-07-18
 1            2    13244 2012-04-15 23:31:04    1986-08-18
 2            3    13244 2011-08-13 15:42:34    2003-11-21
 3            4    60091 2011-04-08 20:08:14    2006-08-15
 4            5    60091 2010-07-17 05:27:50    1984-07-28,
 'sessions':     session_id  customer_id   device       session_start
 0            1            2  desktop 2014-01-01 00:00:00
 1            2            5   mobile 2014-01-01 00:17:20
 2            3            4   mobile 2014-01-01 00:28:10
 3            4            1   mobile 2014-01-01 00:44:25
 4            5            4   mobile 2014-01-01 01:11:30
 5            6            1   tablet 2014-01-01 01:23:25
 6            7            3   tablet 2014-01-01 01:39:40
 7            8            4   tablet 2014-01-01 01:55:55
 8            9            1  desktop 2014-01-01 02:15:25
 9           10            2   tablet 20

In [134]:
customers_df = data["customers"]

In [135]:
customers_df

Unnamed: 0,customer_id,zip_code,join_date,date_of_birth
0,1,60091,2011-04-17 10:48:33,1994-07-18
1,2,13244,2012-04-15 23:31:04,1986-08-18
2,3,13244,2011-08-13 15:42:34,2003-11-21
3,4,60091,2011-04-08 20:08:14,2006-08-15
4,5,60091,2010-07-17 05:27:50,1984-07-28


In [136]:
sessions_df = data["sessions"]

In [138]:
sessions_df.sample(5)

Unnamed: 0,session_id,customer_id,device,session_start
13,14,1,tablet,2014-01-01 03:28:00
6,7,3,tablet,2014-01-01 01:39:40
1,2,5,mobile,2014-01-01 00:17:20
28,29,1,mobile,2014-01-01 07:10:05
24,25,3,desktop,2014-01-01 05:59:40


In [139]:
transactions_df = data["transactions"]

In [3]:
tuition_fees_dict = {
    "GASB" : [np.nan, 10000, 20000, np.nan, 10500, np.nan, np.nan],
    "FASB": [np.nan, np.nan, np.nan, 20750, np.nan, np.nan, 10000],
    "Public": [25000, np.nan, np.nan, np.nan, np.nan, np.nan, 10500],
}

In [4]:
df = pd.DataFrame(tuition_fees_dict)

In [5]:
df.reset_index(inplace=True)

In [6]:
df

Unnamed: 0,index,GASB,FASB,Public
0,0,,,25000.0
1,1,10000.0,,
2,2,20000.0,,
3,3,,20750.0,
4,4,10500.0,,
5,5,,,
6,6,,10000.0,10500.0


In [7]:

#test for overlaping numbers

#replace all numbers with 1

#sum across rows

In [8]:
#drop Na values
df_1 = df[["index","GASB"]].dropna()
df_2 = df[["index", "FASB"]].dropna()
df_3 = df[["index", "Public"]].dropna()

df_1.columns = ["index", "tuition_fees"]
df_2.columns = ["index", "tuition_fees"]
df_3.columns = ["index", "tuition_fees"]

#concatenate all dataframes
df_total = pd.concat([df_1,df_2,df_3])
#remove duplicate values
df_total.drop_duplicates(subset="index", inplace=True)
#df_total = df_total.merge(df_2, on="index", how="outer")


In [9]:
percentages = { 
    "Total_Revenue": [20000,50000, 25000, 75000, 15000],
    "Tuition_Fees": [.10, .30, .50, .40, .10],
    "Other_Revenue": [0.9, 0.7, 0.5, 0.6, 0.9]
}

In [10]:
percent_df = pd.DataFrame(percentages)

In [11]:
percent_df

Unnamed: 0,Total_Revenue,Tuition_Fees,Other_Revenue
0,20000,0.1,0.9
1,50000,0.3,0.7
2,25000,0.5,0.5
3,75000,0.4,0.6
4,15000,0.1,0.9


In [12]:
percent_df["Total_Revenue"] * percent_df["Tuition_Fees"]

0     2000.0
1    15000.0
2    12500.0
3    30000.0
4     1500.0
dtype: float64

### Comparing Distributions

In [19]:
np.random.uniform(low=0.0, high=1.0, size=100).std()

0.28731475526324307

In [20]:
from scipy.stats import ks_2samp

In [42]:
x = np.random.normal(0, 1, 1000)
y = np.random.normal(0, 1, 1000)
z = np.random.normal(1.1, 0.9, 1000)

In [43]:
ks_2samp(x, y, alternative='two-sided')

KstestResult(statistic=0.0026599999999999957, pvalue=0.8701361499165012)

In [35]:
ks_2samp(y, z, alternative='two-sided')

KstestResult(statistic=0.434, pvalue=0.0)

In [85]:
uniform = [100/6, 100/6, 100/6, 100/6, 100/6, 100/6]
data1 = np.array([0.01, 0, 0, 0, 0, 0.99])
data2 = np.array([0.1, 0.2, 0.3, 0.1, 0.2, 0.1])

In [86]:
data1.std()

0.36822396566341103

In [87]:
data2.std()

0.07453559924999299

In [88]:
ks_2samp(uniform, data1, mode='exact').pvalue

0.0021645021645021645

In [89]:
ks_2samp(uniform, data2, mode='exact')

KstestResult(statistic=1.0, pvalue=0.0021645021645021645)

### IPython Practice

In [39]:
import ipywidgets as widgets
from IPython.display import YouTubeVideo, display
import logging

### debuggin tip: use out.capture

In [37]:
debug_view = widgets.Output(layout={'border': '1px solid black'})

@debug_view.capture(clear_output=True)
def bad_callback(event):
    print('This is about to explode')
    return 1.0 / 0.0

button = widgets.Button(
    description='click me to raise an exception',
    layout={'width': '300px'}
)
button.on_click(bad_callback)
button

Button(description='click me to raise an exception', layout=Layout(width='300px'), style=ButtonStyle())

In [38]:
debug_view

Output(layout=Layout(border='1px solid black'))

In [40]:
class OutputWidgetHandler(logging.Handler):
    """ Custom logging handler sending logs to an output widget """

    def __init__(self, *args, **kwargs):
        super(OutputWidgetHandler, self).__init__(*args, **kwargs)
        layout = {
            'width': '100%',
            'height': '160px',
            'border': '1px solid black'
        }
        self.out = widgets.Output(layout=layout)

    def emit(self, record):
        """ Overload of logging.Handler method """
        formatted_record = self.format(record)
        new_output = {
            'name': 'stdout',
            'output_type': 'stream',
            'text': formatted_record+'\n'
        }
        self.out.outputs = (new_output, ) + self.out.outputs

    def show_logs(self):
        """ Show the logs """
        display(self.out)

    def clear_logs(self):
        """ Clear the current logs """
        self.out.clear_output()


In [41]:
logger = logging.getLogger(__name__)
handler = OutputWidgetHandler()
handler.setFormatter(logging.Formatter('%(asctime)s  - [%(levelname)s] %(message)s'))
logger.addHandler(handler)
logger.setLevel(logging.INFO)

In [44]:
handler.show_logs()

Output(layout=Layout(border='1px solid black', height='160px', width='100%'), outputs=({'name': 'stdout', 'out…

In [45]:
handler.clear_logs()
logger.info('Starting program')

try:
    logger.info('About to try something dangerous...')
    1.0/0.0
except Exception as e:
    logger.exception('An error occurred!')

In [46]:
handler.clear_logs()

### Buttons

In [108]:
N = 0

In [109]:
button = widgets.Button(description="MyButton")

In [110]:
output = widgets.Output()

In [111]:
@output.capture(clear_output=True)
def button_clicked(text):
    global N
    N += 1
    with output:
        print(f"My Button Clicked {N} timmes")

In [112]:
button.on_click(button_clicked)

In [116]:
VBox_output = widgets.VBox([button, output])

In [117]:
display(VBox_output)

VBox(children=(Button(description='MyButton', style=ButtonStyle()), Output(outputs=({'output_type': 'stream', …

In [83]:
output.clear_output(wait=True)