# 10 Why Hat
> An analytical approach using neural networks on tabular data

The core engine for project $\large\hat{y}$.

The dataset is the [New York City Airbnb Open Data](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data#AB_NYC_2019.csv) from Kaggle.

In [1]:
# default_exp whyhat

In [2]:
# export
import pandas as pd
import numpy as np
from pathlib import Path
import os
import json
from torchember.core import color
from torchember.helper import tracker

In [3]:
# Directory that holds the data files (relative path, resolved against the
# current working directory).
DATA = Path("../data")

In [4]:
# Path of the AB_NYC_2019.csv file inside the DATA directory.
CSV_PATH = DATA/"AB_NYC_2019.csv"

The AirBnB New York 2019 dataset

In [5]:
# Load the CSV into a DataFrame, then show 10 randomly sampled rows.
df = pd.read_csv(CSV_PATH)
df.sample(10)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
27254,21530562,Modern Cozy & Clean Private 3 Bedroom Mins to NYC,154088360,Kelly,Queens,Long Island City,40.75331,-73.93907,Entire home/apt,300,4,65,2019-07-01,3.27,2,221
26267,20941420,Room in Chelsea,15487773,Sorcha,Manhattan,Chelsea,40.74063,-73.99998,Private room,100,2,4,2017-10-30,0.19,1,0
34798,27582416,Luxury Private Condo/Apartment,69997531,Markus,Brooklyn,Crown Heights,40.67512,-73.94192,Private room,82,2,21,2019-06-30,1.94,1,175
37518,29776064,New York Home with a View,51338091,Vander,Manhattan,Harlem,40.82816,-73.93827,Entire home/apt,130,7,1,2019-01-06,0.16,1,280
35159,27891791,Comfort Away from Home,205188960,Patrick,Brooklyn,East Flatbush,40.6491,-73.92532,Entire home/apt,75,3,12,2019-06-30,1.21,1,167
7235,5298458,"Upper West SIDE,Washington bridge,Time Square 30'",2730883,Seb,Manhattan,Washington Heights,40.85341,-73.93181,Entire home/apt,37,30,3,2017-12-26,0.06,2,82
35357,28046851,Bellrose Home - 15 minutes from both NYC airp...,211860521,Tamara,Queens,Bellerose,40.72908,-73.72778,Private room,105,3,7,2019-05-31,0.76,2,343
21555,17261922,Townhouse apt. Close to city. 3 bedrooms & 2 bath,116365995,Dan,Queens,Woodside,40.74705,-73.90792,Entire home/apt,275,4,31,2019-07-04,1.21,2,283
32765,25812660,*Luxurious* Sparkling Clean Two Bedroom Apartment,183803880,Alisa,Manhattan,Hell's Kitchen,40.76578,-73.98383,Entire home/apt,499,3,26,2019-06-20,2.5,1,0
37366,29657375,"Gorgeous Room in Williamsburg, 10min to Soho",2967914,Nina,Brooklyn,Williamsburg,40.71304,-73.96238,Private room,102,1,48,2019-06-21,5.81,2,20


### Config how we learn the columns

This is a Python/console interface that will:
* guide the user through the columns one by one,
* let the user decide how each column should be treated during learning.

In [6]:
# export
from hashlib import md5
from datetime import datetime
def md5hash(x):
    """
    Return the hexadecimal MD5 digest of the string ``x``.

    ``x`` is encoded with ``str.encode()`` (default encoding) before hashing.
    """
    hasher = md5()
    hasher.update(x.encode())
    return hasher.hexdigest()

class RichColumn(object):
    """
    A pandas series manager

    Wraps one ``pandas.Series`` and records how that column should be treated
    during learning: whether it is used at all, whether it is continuous or
    discrete, whether it is a target (y), and how wide it is when it enters
    the model.

    Parameters:
        column: the ``pandas.Series`` to manage.
        is_y: True if the column is a target column.
        min_occur: minimum number of occurrences a value needs in order to be
            kept by ``top_freq_``.
        is_emb: for discrete columns, True if the column width is
            ``hidden_size`` instead of the number of frequent values.
        hidden_size: width reported by ``len()`` for a discrete column when
            ``is_emb`` is True.
    """
    def __init__(self, column, is_y=False, min_occur=5, is_emb=True, hidden_size=50):
        self.col = column
        # Back-reference stored on the Series itself so that code holding the
        # very same Series object can reach its manager through ``series.rc``.
        self.col.rc = self
        self.name = self.col.name
        self.min_occur = min_occur
        self.hidden_size = hidden_size
        self.is_emb = is_emb
        self.is_y = is_y
        self.use = True
        self.is_conti = True
        self.defined = False

    def kill(self):
        """
        set column to kill mode, so that it is not involved in the learning
        """
        self.defined = True
        self.use = False

    def conti(self):
        """
        set column to continuous data
        """
        self.defined = True
        self.is_conti = True

    def disc(self):
        """
        set column to discrete data
        """
        self.defined = True
        self.is_conti = False

    def is_number(self):
        """
        Is this column's data type any form of number

        Returns True for integer (signed or unsigned) and floating point
        dtypes, False otherwise. Boolean columns are not counted as numbers.
        """
        # Ask pandas instead of comparing against a hand-written tuple of
        # numpy scalar types: the tuple referenced ``np.int0``, which no
        # longer exists in NumPy 2, and it missed the unsigned integer types.
        dtype = self.col.dtype
        return bool(
            pd.api.types.is_numeric_dtype(dtype)
            and not pd.api.types.is_bool_dtype(dtype)
        )

    def __bool__(self):
        """
        is this column going to join the learning
        """
        return self.use

    def __len__(self):
        """
        width of column when entering the model, or used as target

        * continuous column: 1
        * discrete column with ``is_emb``: ``hidden_size``
        * discrete column without ``is_emb``: number of frequent values
          (a two-valued column collapses to width 1)
        """
        if self.is_conti:
            return 1
        if self.is_emb:
            return self.hidden_size
        # ``top_freq`` only exists after ``top_freq_()`` has run; compute it on
        # demand instead of failing with AttributeError.
        top = getattr(self, "top_freq", None)
        if top is None:
            top = self.top_freq_()
        width = len(top)
        return 1 if width == 2 else width

    def __repr__(self):
        return f"<Rich Column:{self.name}>"

    def top_freq_(self):
        """
        Compute, store (as ``self.top_freq``) and return the values occurring
        at least ``min_occur`` times.

        The result is a DataFrame with the values in column ``"index"`` and
        their counts in a column named after this column.
        """
        freq = self.freq()
        self.top_freq = freq[freq[self.name] >= self.min_occur].reset_index()
        return self.top_freq

    def freq(self):
        """
        Return the value counts as a one-column DataFrame.

        The frame is indexed by the distinct values and its single column is
        named after this column.
        """
        counts = self.col.value_counts()
        # Build the frame explicitly so the layout does not depend on how the
        # installed pandas names the ``value_counts()`` result and its index
        # (pandas 2 uses "count" / the column name, earlier versions differ).
        return pd.DataFrame(
            {self.name: counts.to_numpy()},
            index=counts.index.rename(None),
        )

    @property
    def conf_dict(self):
        """
        The column's configuration as a plain dict
        (name, defined, is_conti, is_y, is_emb, use).
        """
        keys = ("name", "defined", "is_conti", "is_y", "is_emb", "use")
        return {key: getattr(self, key) for key in keys}

    def set_conf(self, conf_dict):
        """
        Apply every key/value pair of ``conf_dict`` as an attribute of this
        object and return ``self``.
        """
        for k, v in conf_dict.items():
            setattr(self, k, v)
        return self
    
class RichDF(object):
    """
    A pandas dataframe manager

    Holds one ``RichColumn`` per column of ``df`` (in ``self.columns``, keyed
    by column name) and a tracker object ``self.t`` in which each column's
    configuration is stored under the md5 hash of the column name.

    Parameters:
        df: the ``pandas.DataFrame`` to manage.
        fname: name passed to ``tracker``; when omitted, a name built from the
            current timestamp (``why_hat_<mmdd_HHMMSS>``) is used.
    """
    def __init__(self, df, fname=None):
        self.df = df
        self.columns = dict()
        if fname is None:
            fname = f"why_hat_{self.ts_str}"
        # NOTE(review): ``tracker`` comes from torchember.helper; presumably a
        # persistent key/value log -- confirm its semantics there.
        self.t = tracker("torchember", fname)
        self.t.data = self.t.log_path
        for colname in self.df:
            self.columns[colname] = RichColumn(df[colname])

    @property
    def ts_str(self):
        """Current local time formatted as ``mmdd_HHMMSS``."""
        return datetime.now().strftime("%m%d_%H%M%S")

    @property
    def col_conf(self):
        """Dict mapping each column name to its ``use`` / ``is_cont`` flags."""
        return {k: {"use": v.use, "is_cont": v.is_conti} for k, v in self.columns.items()}

    # The column managers are looked up in ``self.columns`` rather than through
    # ``self.df[colname].rc``: that attribute lives on one particular Series
    # object and is only reachable while ``df[colname]`` keeps handing back
    # that very same object.
    def kill(self, colname):
        """Exclude column ``colname`` from the learning."""
        self.columns[colname].kill()

    def conti(self, colname):
        """Mark column ``colname`` as continuous data."""
        self.columns[colname].conti()

    def disc(self, colname):
        """Mark column ``colname`` as discrete data."""
        self.columns[colname].disc()

    def save_col(self, rcol):
        """Store ``rcol``'s configuration in the tracker."""
        self.t[md5hash(rcol.name)] = rcol.conf_dict

    def set_col(self, rcol):
        """
        Ask on the console how ``rcol`` should be treated and record the answer.

        Accepted answers are ``c`` (continuous), ``d`` (discrete) and ``n`` or
        an empty line (not used). Any other answer leaves the column undefined
        and unsaved, so a later ``tour()`` will ask about it again.
        """
        if rcol.defined:
            print(f"{rcol.name} defined, use:{rcol.use}, contineus?:{rcol.is_conti}")
        print(color.bold("="*30))
        print(color.cyan(rcol.name))
        print(color.red(f"number? {rcol.is_number()}"))
        print(rcol.top_freq_().head(5))

        print(color.red("Is this a [C]ontineous, [D]iscrete or a column we do[N]'t need? default N"))
        x = input().lower()
        if x == "c":
            rcol.conti()
            print(color.blue(f"{rcol.name} set to contineous data"))
        elif x == "d":
            rcol.disc()
            print(color.blue(f"{rcol.name} set to discrite data"))
        elif x in ("", "n"):
            rcol.kill()
            print(color.blue(f"{rcol.name} will not be involved in learning"))
        else:
            print(color.yellow(f"option [{x}] not found, try Again?"))
            return
        # Only a recognised answer reaches this point.
        self.save_col(rcol)

    def save(self, colname):
        """Store the configuration of column ``colname`` in the tracker."""
        self.t[md5hash(colname)] = self.columns[colname].conf_dict

    def tour(self):
        """
        Go through the columns one by one to decide the processing of each

        A configuration already stored in the tracker is loaded first; only
        columns that are still undefined afterwards are asked about.
        """
        for colname in self.df:
            rc = self.columns[colname]
            current = self.t[md5hash(colname)]
            if current is not None:
                rc.set_conf(current)
            if not rc.defined:
                self.set_col(rc)

    def set_y(self, *colnames):
        """
        set columns to y
        all the columns that use==True and is_y==False will be treated as x
        """
        for colname in colnames:
            rc = self.columns[colname]
            rc.is_y = True
            rc.use = True

    def set_x(self, *colnames):
        """
        set columns to x
        every column is x by default, so this is only needed to undo an
        accidental ``set_y`` (or to re-enable a killed column)
        """
        for colname in colnames:
            rc = self.columns[colname]
            rc.use = True
            rc.is_y = False

In [7]:
# Wrap the DataFrame; "testing_case_nyc" is the fname handed to the tracker.
rdf = RichDF(df,fname = "testing_case_nyc")

In [9]:
# Interactively walk through the columns that are not defined yet.
rdf.tour()