In [46]:
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

from torch import optim, nn
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from numpy import genfromtxt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.impute import KNNImputer
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb


import copy
%matplotlib inline

rng = 0

test_values = pd.read_csv("./test-set-values.csv")
train_values = pd.read_csv("./training-set-values.csv")
train_labels = pd.read_csv("./training-set-labels.csv")


In [47]:
print("columns:")
print(train_values.columns.values)
print()
print("shape"+str(train_values.shape))
print()
print(train_values.info())


columns:
['id' 'amount_tsh' 'date_recorded' 'funder' 'gps_height' 'installer'
 'longitude' 'latitude' 'wpt_name' 'num_private' 'basin' 'subvillage'
 'region' 'region_code' 'district_code' 'lga' 'ward' 'population'
 'public_meeting' 'recorded_by' 'scheme_management' 'scheme_name' 'permit'
 'construction_year' 'extraction_type' 'extraction_type_group'
 'extraction_type_class' 'management' 'management_group' 'payment'
 'payment_type' 'water_quality' 'quality_group' 'quantity'
 'quantity_group' 'source' 'source_type' 'source_class' 'waterpoint_type'
 'waterpoint_type_group']

shape(59400, 40)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 

In [48]:

d = {
    "date_recorded": "string",
    "funder": "string",
    "installer": "string",
    "wpt_name": "string",
    "basin": "string",
    "subvillage": "string",
    "region": "string",
    "lga": "string",
    "ward": "string",
    "public_meeting": "string",
    "recorded_by": "string",
    "scheme_management": "string",
    "scheme_name": "string",
    "permit": "string",
    "extraction_type": "string",
    "extraction_type_group": "string",
    "extraction_type_class": "string",
    "management": "string",
    "management_group": "string",
    "payment": "string",
    "payment_type": "string",
    "water_quality": "string",
    "quality_group": "string",
    "quantity": "string",
    "quantity_group": "string",
    "source": "string",
    "source_type": "string",
    "source_class": "string",
    "waterpoint_type": "string",
    "waterpoint_type_group": "string",
}

train_values = train_values.astype(d)
train_values["date_recorded"] = pd.to_datetime(
    train_values["date_recorded"],
    infer_datetime_format=True
)
train_values.dtypes


id                                int64
amount_tsh                      float64
date_recorded            datetime64[ns]
funder                           string
gps_height                        int64
installer                        string
longitude                       float64
latitude                        float64
wpt_name                         string
num_private                       int64
basin                            string
subvillage                       string
region                           string
region_code                       int64
district_code                     int64
lga                              string
ward                             string
population                        int64
public_meeting                   string
recorded_by                      string
scheme_management                string
scheme_name                      string
permit                           string
construction_year                 int64
extraction_type                  string


In [49]:
print("any nulls in train_values?")
print(pd.isnull(train_values).any()[lambda x: x == True])
print()
print("any NANs in train_values?")
print(pd.isna(train_values).any()[lambda x:x == True])


any nulls in train_values?
funder               True
installer            True
subvillage           True
public_meeting       True
scheme_management    True
scheme_name          True
permit               True
dtype: bool

any NANs in train_values?
funder               True
installer            True
subvillage           True
public_meeting       True
scheme_management    True
scheme_name          True
permit               True
dtype: bool


PS": The features not shown are false for both cases

## train_values have the same features with both NANs and nulls.

In [50]:
train_values.iloc[:, :10].head()


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0


In [51]:
train_values.iloc[:, 11:20].head()


Unnamed: 0,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by
0,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd
1,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd
2,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd
3,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd
4,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd


In [52]:
train_values.iloc[:, 21:30].head()


Unnamed: 0,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment
0,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually
1,,True,2010,gravity,gravity,gravity,wug,user-group,never pay
2,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket
3,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay
4,,True,0,gravity,gravity,gravity,other,other,never pay


In [53]:
train_values.iloc[:,31:40].head()


Unnamed: 0,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [56]:
numerical_variables = [
    "amount_tsh",
    "gps_height",
    "longitude",
    "latitude",
    "num_private",
    "region_code",
    "district_code",
    "population",
    "construction_year",
]


In [57]:
train_values[numerical_variables]

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
0,6000.0,1390,34.938093,-9.856322,0,11,5,109,1999
1,0.0,1399,34.698766,-2.147466,0,20,2,280,2010
2,25.0,686,37.460664,-3.821329,0,21,4,250,2009
3,0.0,263,38.486161,-11.155298,0,90,63,58,1986
4,0.0,0,31.130847,-1.825359,0,18,1,0,0
...,...,...,...,...,...,...,...,...,...
59395,10.0,1210,37.169807,-3.253847,0,3,5,125,1999
59396,4700.0,1212,35.249991,-9.070629,0,11,4,56,1996
59397,0.0,0,34.017087,-8.750434,0,12,7,0,0
59398,0.0,0,35.861315,-6.378573,0,1,4,0,0


In [58]:
sns.pairplot(train_values[numerical_variables],kind="scatter")

In [None]:
print("any nulls or NANs in train_labels?")
print(pd.isnull(train_labels).any())
print(pd.isna(train_labels).any())
