# 10 Why Hat
> Analytical approach using Neural Network on tabulate data

The core engine for project $\large\hat{y}$.

The dataset is from [New York City Airbnb Open Data](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data#AB_NYC_2019.csv) on kaggle

In [4]:
# default_exp whyhat

In [57]:
# export
import pandas as pd
import numpy as np
from pathlib import Path
import os
import json
from torchember.core import color
from torchember.helper import tracker

In [58]:
DATA = Path("../data")

In [59]:
CSV_PATH = DATA/"AB_NYC_2019.csv"

The AirBnB New York 2019 dataset

In [60]:
df = pd.read_csv(CSV_PATH)
df.sample(10)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
12891,9819589,Designer's Studio with Cats +Yard,2290932,Jack,Brooklyn,Fort Greene,40.68579,-73.97455,Entire home/apt,80,2,4,2018-05-27,0.17,1,0
13433,10068693,Cozy Mozy apt in Cobble Hill,16591072,Marianna,Brooklyn,Boerum Hill,40.68869,-73.98899,Entire home/apt,168,1,1,2016-01-05,0.02,1,0
9720,7475079,Quiet bright clean and convenient in 2 Bdroom ...,39148503,Ramiro,Brooklyn,Williamsburg,40.70891,-73.94161,Private room,70,3,8,2019-01-01,0.17,2,0
28287,21996983,Modern Plush Studio Apt near Times Square in NYC!,95459395,Bluebird,Manhattan,Hell's Kitchen,40.76084,-73.99899,Entire home/apt,299,30,0,,,18,365
23530,19050995,2 bedroom home in BedStuy 15 minutes to Manhattan,56081679,José,Brooklyn,Bedford-Stuyvesant,40.6896,-73.9266,Private room,200,2,2,2017-06-20,0.08,1,0
6243,4563401,MIDTOWN APARTMENT WITH SPECTACULAR VIEW,22885233,Peter,Manhattan,Hell's Kitchen,40.76575,-73.99241,Entire home/apt,215,6,39,2019-06-01,1.57,1,24
22806,18462718,Neat Cozy room attached by PRIVATE FULL BATHROOM,120767920,Jimmy &Cindy,Queens,Flushing,40.75556,-73.81205,Private room,47,2,79,2019-06-18,3.15,10,346
22578,18265387,Beautiful Room with Private Bathroom by Bushwi...,99608108,Osa,Brooklyn,Bushwick,40.70464,-73.91556,Private room,69,2,23,2019-06-03,0.89,3,145
14668,11587997,"Quiet, Clean, Lit @ Lower East Side",40470433,Hana,Manhattan,Lower East Side,40.71303,-73.98645,Private room,52,1,0,,,1,0
28906,22280482,HUGE private bedroom in Artists' home in Bushwick,43756609,Karina,Brooklyn,Bushwick,40.68847,-73.913,Private room,70,1,3,2018-01-01,0.16,2,0


### Config how we learn the columns

This is a python/console interface, that will 
* guide the user through columns one by one, 
* let user decide how should we treat a column during the learning

In [61]:
# export
from hashlib import md5
from datetime import datetime
from torch import nn
def md5hash(x):
    return md5(x.encode()).hexdigest()

class InputEmb(nn.Module):
    def __init__(self,rich_col):
        super().__init__()
        self.rich_col = rich_col
        self.emb = nn.Embedding(len(rich_col)+1,rich_col.hidden_size)
        
    def forward(self,x)：
        return self.emb(x)
    
class InputOneHot(nn.Module):
    def __init__(self,rich_col):
        super().__init__()
        self.rich_col = rich_col
        self.eye = torch.eye(len(self.rich_col))
        
    def forward(self,x)：
        return self.eye[x]
    
class InputConti(nn.Module):
    def __init__(self,rich_col):
        super().__init__()
        self.rich_col = rich_col
        self.bn=nn.BatchNorm1d(1)
        self.tanh = nn.Tanh()
        
    def forward(self,x):
        return self.tanh(self.bn(x))
        
class RichColumn(object):
    """
    A pandas series manager
    """
    def __init__(self,column, is_y = False,min_occur = 5, is_emb = True,hidden_size=50):
        self.col = column
        self.col.rc = self
        self.name = self.col.name
        self.min_occur = min_occur
        self.hidden_size = hidden_size
        self.is_emb =  is_emb
        self.is_y = is_y
        self.use = True
        self.is_conti = True
        self.defined = False
        
    def kill(self):
        """
        set column to kill mode, that it would not be involved in the learning
        """
        self.defined = True
        self.use = False
        
    def conti(self):
        """
        set column to contineous data
        """
        self.defined = True
        self.is_conti = True
        
    def disc(self):
        """
        set column to discrete data
        """
        self.defined = True
        self.is_conti = False
        
    def is_number(self):
        """
        Is this column's data type in any form of number
        """
        return self.col.dtype in (int,float,
                              np.float16,np.float32,np.float64,np.float64,
                              np.int0,np.int8,np.int16,np.int32,np.int64)
    
    def __bool__(self):
        """
        is this column going to join the learning
        """
        return self.use
    
    def __len__(self):
        """
        width of column when entering the model, or used as target
        """
        if self.is_conti:
            return 1
        else:
            if self.is_emb:
                return self.hidden_size
            else:
                width = len(self.top_freq)
                width =1 if width==2 else width
                return width
    
    def __repr__(self,):
        return f"<Rich Column:{self.name}>"
    
    def top_freq_(self):
        freq = self.freq()
        self.top_freq = freq[freq[self.name]>=self.min_occur].reset_index()
        self.tokens = dict((v,k+1) for k,v in enumerate(self.top_freq["index"]))
        self.token_arr = np.array(["<mtk>",]+list(self.top_freq["index"]))
        return self.top_freq
    
    def freq(self):
        return pd.DataFrame(data=self.col.value_counts())
    
    @property
    def conf_dict(self):
        return dict((i,getattr(self,i)) for i in ["name","defined","is_conti","is_y","is_emb","use"])
    
    def set_conf(self,conf_dict):
        for k,v in conf_dict.items():
            setattr(self,k,v)
        return self
    
    def encode(self,x):
        if self.is_conti:
            return x
        else:
            try:
                return self.tokens[x]
            except:
                return 0
        
    def decode(self,idx):
        return self.token_arr[idx]

SyntaxError: invalid character in identifier (<ipython-input-61-7c6b779db231>, line 14)

In [62]:
# export 
class RichDF(object):
    """
    A pandas dataframe manager
    """
    def __init__(self,df,fname=None):
        self.df = df
        self.columns = dict()
        if fname==None:
            fname=f"why_hat_{self.ts_str}"
        self.t = tracker("torchember",fname)
        self.t.data = self.t.log_path
        for colname in self.df:
            self.columns.update({colname:RichColumn(df[colname])})
            
    @property
    def ts_str(self):
        return datetime.now().strftime("%m%d_%H%M%S")
        
    @property
    def col_conf(self):
        return dict((k,{"use":v.use,"is_cont":v.is_conti}) for k,v in self.columns.items())
        
    def kill(self,colname):
        self.df[colname].rc.kill()
        
    def conti(self,colname):
        self.df[colname].rc.conti()
        
    def disc(self,colname):
        self.df[colname].rc.disc()
        
    def save_col(self,rcol):
        self.t[md5hash(rcol.name)]=rcol.conf_dict
        
    def set_col(self,rcol):
        if rcol.defined:
            print(f"{rcol.name} defined, use:{rcol.use}, contineus?:{rcol.is_conti}")
        print(color.bold("="*30))
        print(color.cyan(rcol.name))
        print(color.red(f"number? {rcol.is_number()}"))
        print(rcol.top_freq_().head(5))
              
        print(color.red("Is this a [C]ontineous, [D]iscrete or a column we do[N]'t need? default N"))
        x = input().lower()
        if x=="c":
            rcol.conti()
            print(color.blue(f"{rcol.name} set to contineous data"))
            self.save_col(rcol)
        elif x =="d":
            rcol.disc()
            print(color.blue(f"{rcol.name} set to discrite data"))
            self.save_col(rcol)
        elif (x =="") or (x=="n"):
            rcol.kill()
            print(color.blue(f"{rcol.name} will not be involved in learning"))
            self.save_col(rcol)
        else:
            print(color.yellow(f"option [{x}] not found, try Again?"))
            
    def save(self,colname):
        col=self.df[colname]
        self.t[md5hash(colname)] = col.rc.conf_dict
        
    def read(self,colname):
        col=self.df[colname]
        col.rc.set_conf(self.t[md5hash(colname)])
        if col.rc.is_conti:
            col.rc.top_freq_()
        
    def tour(self):
        """
        Go through column 1 by 1 to decide the processing for its data
        """
        for colname in self.df:
            col = self.df[colname]
            current = self.t[md5hash(colname)]
            if current != None:
                col.rc.set_conf(current)
                if col.rc.is_conti==False:
                    col.rc.top_freq_()
            if col.rc.defined==False:
                self.set_col(col.rc)
                
    def set_y(self, *colnames):
        """
        set columns to y
        all the columns that use==True and is_y==False will be treated as x
        """
        for colname in colnames:
            rc = self.columns[colname]
            rc.is_y = True
            rc.use = True
            self.save(colname)
            
    def set_x(self, *colnames):
        """
        set columns to x
        of course,every columns' default status is x, 
        so you don't have to set this if you accidentally set x to y
        """
        for colname in colnames:
            rc = self.columns[colname]
            rc.use = True
            rc.is_y = False
            self.save(colname)
    
    @property
    def Xs(self):
        """
        Return the next x rich column
        """
        for col in self.df:
            if self.df[col].rc.is_y ==False:
                yield self.df[col].rc
    
    @property
    def Ys(self):
        """
        Return the next y rich column
        """
        for col in self.df:
            if self.df[col].rc.is_y :
                yield self.df[col].rc
                

In [63]:
rdf = RichDF(df,fname = "testing_case_nyc")

## Use tour() to set the configuration

In [64]:
rdf.tour()

Here's how I set the columns

In [65]:
rdf.set_y("price")

This is how I set the configuration:

In [66]:
for col in rdf.df:
    print(">"*5,col,"<"*5)
    print(rdf.t[md5hash(col)])

>>>>> id <<<<<
{'name': 'id', 'defined': True, 'is_conti': True, 'is_y': False, 'is_emb': True, 'use': False}
>>>>> name <<<<<
{'name': 'name', 'defined': True, 'is_conti': True, 'is_y': False, 'is_emb': True, 'use': False}
>>>>> host_id <<<<<
{'name': 'host_id', 'defined': True, 'is_conti': False, 'is_y': False, 'is_emb': True, 'use': True}
>>>>> host_name <<<<<
{'name': 'host_name', 'defined': True, 'is_conti': False, 'is_y': False, 'is_emb': True, 'use': True}
>>>>> neighbourhood_group <<<<<
{'name': 'neighbourhood_group', 'defined': True, 'is_conti': False, 'is_y': False, 'is_emb': True, 'use': True}
>>>>> neighbourhood <<<<<
{'name': 'neighbourhood', 'defined': True, 'is_conti': False, 'is_y': False, 'is_emb': True, 'use': True}
>>>>> latitude <<<<<
{'name': 'latitude', 'defined': True, 'is_conti': True, 'is_y': False, 'is_emb': True, 'use': True}
>>>>> longitude <<<<<
{'name': 'longitude', 'defined': True, 'is_conti': True, 'is_y': False, 'is_emb': True, 'use': True}
>>>>> room_t

In [37]:
list(rdf.Xs)

[<Rich Column:id>,
 <Rich Column:name>,
 <Rich Column:host_id>,
 <Rich Column:host_name>,
 <Rich Column:neighbourhood_group>,
 <Rich Column:neighbourhood>,
 <Rich Column:latitude>,
 <Rich Column:longitude>,
 <Rich Column:room_type>,
 <Rich Column:minimum_nights>,
 <Rich Column:number_of_reviews>,
 <Rich Column:last_review>,
 <Rich Column:reviews_per_month>,
 <Rich Column:calculated_host_listings_count>,
 <Rich Column:availability_365>]

In [38]:
list(rdf.Ys)

[<Rich Column:price>]

In [39]:
# export
class TabularNN:
    def __init__(self, rich_df):
        self.rich_df = rich_df
        self.x = list(self.rich_df.Xs)
        self.y = list(self.rich_df.Ys)
        self.assert_xy()
        
    def __repr__(self):
        return f">>TabularNN"
        
    def assert_xy(self):
        assert len(self.x)>0, "You have you set some X"
        assert len(self.y)>0, "You have you set some Y"

In [40]:
tnn = TabularNN(rdf)

In [67]:
coldf = tnn.x[1].top_freq["index"]