In [14]:
import pandas as pd
import numpy as np
from typing import List, Dict, Union, Tuple
from functools import reduce

from pandas import DataFrame as DF
#from altair import Chart # for type aliasing

# Base URL of the raw files in the GitHub repository that hosts the data.
gh_raw_prefix = 'https://raw.githubusercontent.com/quinn-dougherty/well/master/'

# File names in that repository, and a lookup from file name to raw URL.
csv_local = ['train_features.csv', 'test_features.csv', 'train_labels.csv', 'sample_submission.csv']
csv_github = {x: gh_raw_prefix + x for x in csv_local}

# Work on a random subset of the training features. The seed pins the subset
# so that Restart & Run All reproduces the same rows (and the same outputs).
SAMPLE_SIZE = 2000
SEED = 42

df = pd.read_csv(csv_github['train_features.csv']).sample(SAMPLE_SIZE, random_state=SEED)
y = pd.read_csv(csv_github['train_labels.csv'])

# Columns removed by droppem at the end of the pipeline. binnit and product
# append the columns they consume to this same list, in place.
to_drop = ['scheme_name', 'wpt_name', 'name']

# Strings that const_impute treats as synonyms of "missing".
null_vals_list = ['Not Known', 'Unknown', 'None', 'Not known', 'not known',
                  '-', 'unknown', 'Unknown Installer', '##', 'none', '0']

# Columns recoded to numbers by bool_to_numeric.
boolski = ['public_meeting', 'permit']

# Bin edges handed to pd.cut by binnit, one list of edges per column.
to_bin = {'construction_year': [-1, 1980, 1990, 2000, 2010, 2020], 
          'population': [-1, 10, 20, 100, 250, 1000, 5000, 10000, 100000]}

# Columns whose infrequent values are collapsed by insignificant.
insigs = ['funder', 'installer', 'subvillage', 'ward']

# Columns converted to day ordinals by date_to_ord.
dates = ['date_recorded']

# Pair of columns combined into a single feature by product.
twoprodc = ['region', 'district_code']
t = (twoprodc[0], twoprodc[1])

def const_impute(datf: DF, nl: List[str], const: str = "NOT_KNOWN") -> DF: 
  '''Naive fill-in imputation with a single constant.

  Every value listed in ``nl`` (strings known to mean "null") is first turned
  into NaN; then every NaN - those and any that were already there - is
  filled with ``const``.

  The result is returned as a new frame; ``datf`` is left as it was.
  '''
  as_missing = dict.fromkeys(nl, np.nan)
  with_gaps = datf.replace(as_missing)
  return with_gaps.fillna(const)

def bool_to_numeric(datf: DF, feats: List[str], unknown: float = 0.5) -> DF:
  """Recode boolean feature columns as numbers.

  Within the columns named by ``feats``: True becomes 1, False becomes 0 and
  the placeholder string 'NOT_KNOWN' becomes ``unknown``. All other columns
  are carried over untouched, and the result is a new frame.
  """
  recoding = {True: 1, False: 0, 'NOT_KNOWN': unknown}
  recoded = datf[feats].replace(recoding)

  replacements = {}
  for col in feats:
    replacements[col] = recoded[col]
  return datf.assign(**replacements)

def binnit(datf: DF, 
           feats: Dict[str, List[int]], 
           ordinal: bool = True, 
           todrop: List[str] = to_drop) -> DF: 
  '''Add a binned version of every numeric feature named in ``feats``.

  datf:    frame holding the columns to bin.
  feats:   maps a column name to the list of bin edges passed to pd.cut.
           The mapping is only read; it is not modified.
  ordinal: when True the bins are labelled 1, 2, 3, ...; otherwise they are
           labelled with lowercase letters 'a', 'b', 'c', ...
  todrop:  list that collects columns to drop later. The binned source
           columns are appended to it IN PLACE (on purpose: the default is
           the shared module-level list that the drop step consumes). A
           column already listed is not added a second time.

  Each new column is called ``<prefix>_binned`` where the prefix is the first
  ``j`` characters of the source name and ``j`` is two thirds of the shortest
  source name length. NOTE: two source names sharing that prefix would map to
  the same output column, in which case the later one wins.

  Returns a new frame; the source columns are still present in it.
  '''
  # Every feature must come with fewer than 26 edges.
  assert all([len(x)<26 for x in feats.values()])

  k: List[str] = list(feats.keys())

  # Register the consumed columns for the later drop step, without duplicates.
  for name in k:
    if name not in todrop:
      todrop.append(name)

  # Shared prefix length for the output names; 0 when there is nothing to bin
  # (min() of an empty sequence would raise).
  j: int = (2 * min(len(x) for x in k) // 3) if k else 0

  letters = "abcdefghijklmnopqrstuvwxyz"
  labels: Dict[str, List[Union[int, str]]] = {}
  for name in k:
    n_bins = len(feats[name]) - 1  # n edges delimit n-1 bins
    if ordinal:
      labels[name] = list(range(1, n_bins + 1))
    else:
      labels[name] = list(letters[:n_bins])

  return datf.assign(**{name[:j] + '_binned': pd.cut(datf[name],
                                                     bins=feats[name],
                                                     labels=labels[name])
                        for name in k})


def date_to_ord(datf: DF, feats: Union[str, List[str]]) -> DF: 
  '''Replace date columns by their integer day ordinal.

  datf:  frame holding the date columns.
  feats: one column name, or a list of column names. Each column is parsed
         with pd.to_datetime and every timestamp is replaced by the result
         of its ``toordinal()`` method.

  Returns a new frame; ``datf`` is not modified.
  '''
  if isinstance(feats, str): 
    feats = [feats]

  # Each column is computed right here rather than through a lambda handed to
  # assign: a lambda would look ``name`` up only when assign calls it, after
  # the comprehension has finished, so every column would be built from the
  # LAST feature in the list.
  return datf.assign(**{name: (pd
                               .to_datetime(datf[name])
                               .apply(lambda ts: ts.toordinal()))
                        for name in feats})

def product(datf: DF, 
            feats: Tuple[str,str], 
            todrop: List[str] = to_drop, 
            colname: Union[str, None] = 'name') -> DF: 
  '''Add a column that pairs two features row by row as "<left>_<right>".

  For example, region "Morogoro" with district_code 1 gives "Morogoro_1".

  datf:    frame holding both feature columns.
  feats:   the two column names to combine.
  todrop:  list that collects columns to drop later. The combined source
           columns are appended to it IN PLACE (on purpose: the default is
           the shared module-level list that the drop step consumes). A
           column already listed is not added a second time.
  colname: label of the new column. The default is the literal 'name', which
           is what this function has always produced (the label it derives
           from ``feats`` was computed but never used). Pass None to use that
           derived label instead: both source names cut to the length of the
           shorter one and joined by '_', e.g. 'region_distri'.
           NOTE(review): with the default, the new column overwrites any
           existing 'name' column and is removed by a later drop step if
           'name' is listed in ``todrop``. Switching the pipeline to a
           descriptive label also means taking 'name' out of that list.

  Returns a new frame; the source columns are still present in it.
  '''
  # Register the consumed columns for the later drop step, without duplicates.
  for col in feats:
    if col not in todrop:
      todrop.append(col)

  if colname is None:
    m = min([len(x) for x in feats])
    colname = f'{feats[0][:m]}_{feats[1][:m]}'

  combined = [f'{le}_{ri}' 
              for le,ri in zip(datf[feats[0]], datf[feats[1]])]

  return datf.assign(**{colname: combined})


def insignificant(datf: DF, 
                  insgs: List[str], 
                  thresh: int = 3, 
                  fillconst: str = "OTHER") -> DF: 
  '''Collapse the rare values of categorical columns into one constant.

  datf:      frame holding the columns.
  insgs:     names of the columns to process. They must hold strings (the
             ``.str`` accessor is used), so run the imputation step first.
  thresh:    a value is kept only if it occurs MORE than ``thresh`` times
             in its column (compared case-insensitively).
  fillconst: replacement for the values that are not kept.

  Kept values come back lowercased. Missing values stay missing instead of
  raising. Returns a new frame; ``datf`` is not modified.
  '''
  def _collapse_rare(column):
    # Lowercase, count each value once, then keep only the frequent ones.
    lowered = column.str.lower()
    # Counting once per column (not once per row) keeps this linear.
    counts = lowered.value_counts()
    frequent = lowered.map(counts) > thresh
    return lowered.where(frequent | lowered.isna(), fillconst)

  return datf.assign(**{name: _collapse_rare(datf[name]) for name in insgs})


def droppem(datf: DF, droppin: List[str]) -> DF:
  '''Drop the columns in ``droppin`` and the rows sitting at the origin.

  A row is kept only when |longitude| + |latitude| is greater than exp(-6);
  every other row is filtered out of the returned frame.
  '''
  cutoff = np.exp(-6)
  offset_from_origin = datf.longitude.abs() + datf.latitude.abs()
  keep_row = (offset_from_origin > cutoff).tolist()

  without_columns = datf.drop(columns=droppin)
  return without_columns[keep_row]


# Feature pipeline, applied top to bottom. binnit and product add the columns
# they consume to to_drop, so droppem (which runs after them) removes those
# too. The NOT_KNOWN placeholder is turned back into NaN at the very end.
X = (const_impute(df, null_vals_list)
     .pipe(bool_to_numeric, boolski)
     .pipe(binnit, to_bin)
     .pipe(date_to_ord, dates)
     .pipe(insignificant, insigs, thresh=2)
     .pipe(product, t)
     .pipe(droppem, to_drop)
     .replace({"NOT_KNOWN": np.nan}))

print(to_drop)
print(X.shape)

X.head()

['scheme_name', 'wpt_name', 'construction_year', 'population', 'region', 'district_code']
(1930, 37)


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,num_private,basin,...,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,constr_binned,popula_binned,name
40568,72762,300.0,734216,co,305,OTHER,36.995132,-7.597195,0,Rufiji,...,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,5,3,Morogoro_1
46670,61339,50.0,734909,world bank,1077,OTHER,34.195845,-4.357662,0,Internal,...,enough,enough,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,4,5,Singida_1
39799,16279,500.0,734196,world bank,334,OTHER,37.547361,-6.2473,0,Wami / Ruvu,...,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,4,5,Morogoro_6
27605,66337,0.0,734337,kkkt,0,kkkt,32.877581,-9.055646,0,Lake Rukwa,...,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,1,1,Mbeya_6
40463,4306,0.0,734352,OTHER,0,dwe,30.656344,-1.273769,0,Lake Victoria,...,insufficient,insufficient,shallow well,shallow well,groundwater,other,other,1,1,Kagera_1


In [28]:

# Reload the training labels and encode the three status strings as integers.
y = pd.read_csv(csv_github[csv_local[2]])
status_codes = {"functional": 1, "non functional": -1, "functional needs repair": 0}
y_encoded = y.replace(status_codes)

# Attach the encoded labels to the features on the shared 'id' column and
# write the combined table to data.csv without the index.
X.merge(y_encoded, on='id').to_csv('data.csv', index=False)

In [19]:
# Sanity check: (rows, columns) of the labels frame loaded above.
y.shape

(59400, 2)