# 10 Why Hat
> Analytical approach using Neural Network on tabulate data

The core engine for project $\large\hat{y}$.

The dataset is from [New York City Airbnb Open Data](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data#AB_NYC_2019.csv) on kaggle

In [6]:
# default_exp whyhat

In [7]:
# export
import pandas as pd
import numpy as np
from pathlib import Path
import os
import json
from torchember.core import color
from torchember.helper import tracker

In [8]:
DATA = Path("../data")

In [9]:
CSV_PATH = DATA/"AB_NYC_2019.csv"

The AirBnB New York 2019 dataset

In [10]:
df = pd.read_csv(CSV_PATH)
df.sample(10)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
39111,30528541,New york Multi-unit building,95575605,Natasha,Manhattan,Chelsea,40.74726,-73.99029,Entire home/apt,250,7,0,,,1,0
22344,18041862,Bushwick's Private Modern Space,124142417,Marlene,Brooklyn,Bushwick,40.68824,-73.91567,Entire home/apt,100,2,111,2019-06-18,4.12,2,10
5780,4216435,Cozy hideaway,1361993,Daniel,Bronx,Kingsbridge,40.88399,-73.89502,Entire home/apt,110,2,0,,,1,0
31986,24967000,Gorgeous apartment with access to all #2,175116422,Karilyn,Staten Island,Grant City,40.57856,-74.1107,Entire home/apt,72,3,11,2019-03-24,1.05,3,333
29281,22464295,Lower East Side Newly renovated! #7,158969505,Karen,Manhattan,Lower East Side,40.72268,-73.99205,Entire home/apt,250,30,3,2019-04-30,0.37,9,264
17329,13685787,Cozy room with Private porch,69977115,Jacob,Brooklyn,Bensonhurst,40.61649,-73.99043,Private room,79,2,1,2016-07-18,0.03,4,179
37173,29540687,"East 82nd Street, Upper East Side/Yorkville 1bd",22541573,Ken,Manhattan,Upper East Side,40.77327,-73.94939,Entire home/apt,165,30,0,,,87,355
13605,10175678,Townhouse in Williamsburg,10590489,Caroline,Brooklyn,Williamsburg,40.71367,-73.94228,Private room,60,2,7,2017-05-11,0.17,1,0
40182,31172584,Williamsburg Duplex Loft of Lorimer L,4318363,D,Brooklyn,Williamsburg,40.71462,-73.95278,Entire home/apt,180,3,2,2019-02-18,0.41,2,24
6911,4946624,Stylish Apartment in post-war walk-up Building,25494876,Vivienne,Manhattan,Lower East Side,40.71959,-73.9919,Entire home/apt,149,2,14,2018-09-09,0.28,1,0


### Config how we learn the columns

This is a python/console interface, that will 
* guide the user through columns one by one, 
* let user decide how should we treat a column during the learning

In [58]:
# export
from hashlib import md5
from datetime import datetime
def md5hash(x):
    return md5(x.encode()).hexdigest()

class RichColumn(object):
    """
    A pandas series manager
    """
    def __init__(self,column, is_y = False,min_occur = 5, is_emb = True,hidden_size=50):
        self.col = column
        self.col.rc = self
        self.name = self.col.name
        self.min_occur = min_occur
        self.hidden_size = hidden_size
        self.is_emb =  is_emb
        self.is_y = is_y
        self.use = True
        self.is_conti = True
        self.defined = False
        
    def kill(self):
        """
        set column to kill mode, that it would not be involved in the learning
        """
        self.defined = True
        self.use = False
        
    def conti(self):
        """
        set column to contineous data
        """
        self.defined = True
        self.is_conti = True
        
    def disc(self):
        """
        set column to discrete data
        """
        self.defined = True
        self.is_conti = False
        
    def is_number(self):
        """
        Is this column's data type in any form of number
        """
        return self.col.dtype in (int,float,
                              np.float16,np.float32,np.float64,np.float64,
                              np.int0,np.int8,np.int16,np.int32,np.int64)
    
    def __bool__(self):
        """
        is this column going to join the learning
        """
        return self.use
    
    def __len__(self):
        """
        width of column when entering the model, or used as target
        """
        if self.is_conti:
            return 1
        else:
            if self.is_emb:
                return self.hidden_size
            else:
                width = len(self.top_freq)
                width =1 if width==2 else width
                return width
    
    def __repr__(self,):
        return f"<Rich Column:{self.name}>"
    
    def top_freq_(self):
        freq = self.freq()
        self.top_freq = freq[freq[self.name]>=self.min_occur].reset_index()
        return self.top_freq
    
    def freq(self):
        return pd.DataFrame(data=self.col.value_counts())
    
    @property
    def conf_dict(self):
        return dict((i,getattr(self,i)) for i in ["name","defined","is_conti","is_y","is_emb","use"])
    
    def set_conf(self,conf_dict):
        for k,v in conf_dict.items():
            setattr(self,k,v)
        return self
    
class RichDF(object):
    """
    A pandas dataframe manager
    """
    def __init__(self,df,fname=None):
        self.df = df
        self.columns = dict()
        if fname==None:
            fname=f"why_hat_{self.ts_str}"
        self.t = tracker("torchember",fname)
        self.t.data = self.t.log_path
        for colname in self.df:
            self.columns.update({colname:RichColumn(df[colname])})
            
    @property
    def ts_str(self):
        return datetime.now().strftime("%m%d_%H%M%S")
        
    @property
    def col_conf(self):
        return dict((k,{"use":v.use,"is_cont":v.is_conti}) for k,v in self.columns.items())
        
    def kill(self,colname):
        self.df[colname].rc.kill()
        
    def conti(self,colname):
        self.df[colname].rc.conti()
        
    def disc(self,colname):
        self.df[colname].rc.disc()
        
    def save_col(self,rcol):
        self.t[md5hash(rcol.name)]=rcol.conf_dict
        
    def set_col(self,rcol):
        if rcol.defined:
            print(f"{rcol.name} defined, use:{rcol.use}, contineus?:{rcol.is_conti}")
        print(color.bold("="*30))
        print(color.cyan(rcol.name))
        print(color.red(f"number? {rcol.is_number()}"))
        print(rcol.top_freq_().head(5))
              
        print(color.red("Is this a [C]ontineous, [D]iscrete or a column we do[N]'t need? default N"))
        x = input().lower()
        if x=="c":
            rcol.conti()
            print(color.blue(f"{rcol.name} set to contineous data"))
            self.save_col(rcol)
        elif x =="d":
            rcol.disc()
            print(color.blue(f"{rcol.name} set to discrite data"))
            self.save_col(rcol)
        elif (x =="") or (x=="n"):
            rcol.kill()
            print(color.blue(f"{rcol.name} will not be involved in learning"))
            self.save_col(rcol)
        else:
            print(color.yellow(f"option [{x}] not found, try Again?"))
            
    def save(self,colname):
        col=self.df[colname]
        self.t[md5hash(colname)] = col.rc.conf_dict
        
    def read(self,colname):
        col=self.df[colname]
        col.rc.set_conf(self.t[md5hash(colname)])
        
    def tour(self):
        """
        Go through column 1 by 1 to decide the processing for its data
        """
        for colname in self.df:
            col = self.df[colname]
            current = self.t[md5hash(colname)]
            if current != None:
                col.rc.set_conf(current)
            if col.rc.defined==False:
                self.set_col(col.rc)
                
    def set_y(self, *colnames):
        """
        set columns to y
        all the columns that use==True and is_y==False will be treated as x
        """
        for colname in colnames:
            rc = self.columns[colname]
            rc.is_y = True
            rc.use = True
            self.save(colname)
            
    def set_x(self, *colnames):
        """
        set columns to x
        of course,every columns' default status is x, 
        so you don't have to set this if you accidentally set x to y
        """
        for colname in colnames:
            rc = self.columns[colname]
            rc.use = True
            rc.is_y = False
            self.save(colname)
    
    @property
    def Xs(self):
        for col in self.df:
            if self.df[col].rc.is_y ==False:
                yield self.df[col].rc
    
    @property
    def Ys(self):
        for col in self.df:
            if self.df[col].rc.is_y :
                yield self.df[col].rc

In [59]:
rdf = RichDF(df,fname = "testing_case_nyc")

## Use tour() to set the configuration

In [60]:
rdf.tour()

Here's how I set the columns

In [61]:
rdf.set_y("price")

This is how I set the configuration:

In [62]:
for col in rdf.df:
    print(">"*5,col,"<"*5)
    print(rdf.t[md5hash(col)])

>>>>> id <<<<<
{'name': 'id', 'defined': True, 'is_conti': True, 'is_y': False, 'is_emb': True, 'use': False}
>>>>> name <<<<<
{'name': 'name', 'defined': True, 'is_conti': False, 'is_y': False, 'is_emb': True, 'use': True}
>>>>> host_id <<<<<
{'name': 'host_id', 'defined': True, 'is_conti': False, 'is_y': False, 'is_emb': True, 'use': True}
>>>>> host_name <<<<<
{'name': 'host_name', 'defined': True, 'is_conti': True, 'is_y': False, 'is_emb': True, 'use': False}
>>>>> neighbourhood_group <<<<<
{'name': 'neighbourhood_group', 'defined': True, 'is_conti': False, 'is_y': False, 'is_emb': True, 'use': True}
>>>>> neighbourhood <<<<<
{'name': 'neighbourhood', 'defined': True, 'is_conti': False, 'is_y': False, 'is_emb': True, 'use': True}
>>>>> latitude <<<<<
{'name': 'latitude', 'defined': True, 'is_conti': True, 'is_y': False, 'is_emb': True, 'use': True}
>>>>> longitude <<<<<
{'name': 'longitude', 'defined': True, 'is_conti': True, 'is_y': False, 'is_emb': True, 'use': True}
>>>>> room_t

In [63]:
list(rdf.Xs)

[<Rich Column:id>,
 <Rich Column:name>,
 <Rich Column:host_id>,
 <Rich Column:host_name>,
 <Rich Column:neighbourhood_group>,
 <Rich Column:neighbourhood>,
 <Rich Column:latitude>,
 <Rich Column:longitude>,
 <Rich Column:room_type>,
 <Rich Column:minimum_nights>,
 <Rich Column:number_of_reviews>,
 <Rich Column:last_review>,
 <Rich Column:reviews_per_month>,
 <Rich Column:calculated_host_listings_count>,
 <Rich Column:availability_365>]

In [65]:
list(rdf.Ys)

[<Rich Column:price>]

In [67]:
# export
class TabularNN:
    def __init__(self, rich_df):
        self.rich_df = rich_df
        self.x = list(self.rich_df.Xs)
        self.y = list(self.rich_df.Ys)
        self.assert_xy()
        
    def __repr__(self):
        return f">>TabularNN"
        
    def assert_xy(self):
        assert len(self.x)>0, "You have you set some X"
        assert len(self.y)>0, "You have you set some Y"
        
    def build_x_conti(self):
        return 

In [69]:
TabularNN(rdf)

<__main__.TabularNN at 0x132745890>