In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder

In [3]:
# Load the cleaned modeling dataset.
# NOTE: despite its name, `target` holds the whole table (features and the
# `status_group` label); the two are separated in a later cell.
target = pd.read_csv(filepath_or_buffer='../data/cleaned_water_Modeling.csv')

In [4]:
# Remove the 'Unnamed: 0' column (presumably a saved index picked up by read_csv).
# Rebinding instead of inplace=True, plus errors='ignore', keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
target = target.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Preview the first five rows of the loaded frame.
target.head(n=5)

Unnamed: 0,status_group,amount_tsh,funder,gps_height,installer,basin,region,lga,ward,population,...,permit,extraction_type_class,management,payment_type,water_quality,quantity,source,source_class,waterpoint_type,age
0,0,6000.0,Roman,1390,Roman,Lake Nyasa,Iringa,Ludewa,Mundindi,109,...,False,gravity,vwc,annually,soft,enough,spring,groundwater,communal standpipe,21
1,0,25.0,Lottery Club,686,World vision,Pangani,Manyara,Simanjiro,Ngorika,250,...,True,gravity,vwc,per bucket,soft,enough,dam,surface,communal standpipe multiple,11
2,1,0.0,Unicef,263,UNICEF,Ruvuma / Southern Coast,Mtwara,Nanyumbu,Nanyumbu,58,...,True,submersible,vwc,never pay,soft,dry,machine dbh,groundwater,communal standpipe multiple,34
3,0,20.0,Mkinga Distric Coun,0,DWE,Pangani,Tanga,Mkinga,Moa,1,...,True,submersible,vwc,per bucket,salty,enough,other,unknown,communal standpipe multiple,11
4,1,0.0,Dwsp,0,DWSP,Internal,Shinyanga,Shinyanga Rural,Samuye,0,...,True,handpump,vwc,never pay,soft,enough,machine dbh,groundwater,hand pump,23


In [6]:
# The label is the `status_group` column; every other column is a feature.
y = target['status_group']
X = target.drop(columns=['status_group'])

#### Train/test split

In [7]:
# Hold out 20% of the rows for testing (a common default; adjust to the problem).
# Stratifying on y keeps the class ratio the same in both partitions, and the
# fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=77,
)

#### Extracting feature names

In [8]:
# Split the feature names by dtype: object/bool columns are treated as
# categorical, int64/float64 columns as continuous.
categorical_frame = X.select_dtypes(include=['object', 'bool'])
continuous_frame = X.select_dtypes(include=['int64', 'float64'])
cat_cols = list(categorical_frame.columns)
cont_cols = list(continuous_frame.columns)
cat_cols, cont_cols

(['funder',
  'installer',
  'basin',
  'region',
  'lga',
  'ward',
  'public_meeting',
  'scheme_management',
  'permit',
  'extraction_type_class',
  'management',
  'payment_type',
  'water_quality',
  'quantity',
  'source',
  'source_class',
  'waterpoint_type'],
 ['amount_tsh', 'gps_height', 'population', 'age'])

#### One-hot encoding (OHE) and scaling

In [9]:
## One-hot encoder for the categorical columns. With handle_unknown='ignore',
## a category that was not seen during fit is encoded as all zeros instead of
## raising at transform time.
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')

## Scaler applied to every column that is NOT listed in cat_cols: it is passed
## as the ColumnTransformer's `remainder`, so the leftover columns are
## standardized rather than dropped or passed through unchanged.
ss = StandardScaler()

## The ColumnTransformer runs each transformer on its own columns and
## concatenates the results: the one-hot block first, then the scaled remainder.
ct = ColumnTransformer(
    transformers=[('ohe', encoder, cat_cols)],
    remainder=ss,
)

## Fit on the training split only, then apply the already-fitted transformer
## to the test split.
X_train_ohe = ct.fit_transform(X_train)
X_test_ohe = ct.transform(X_test)

#### Export splits

In [10]:
# Wrap the encoded training matrix in a sparse-backed DataFrame.
# - Re-deriving it from ct.transform(X_train) (same values as the earlier
#   fit_transform) keeps this cell idempotent; converting X_train_ohe in place
#   fails on a second run because it is then already a DataFrame.
# - Passing X_train.index keeps the rows aligned with y_train, which still
#   carries the original row labels from the split; the default RangeIndex
#   would silently misalign any index-based join/concat of the two.
X_train_ohe = pd.DataFrame.sparse.from_spmatrix(
    ct.transform(X_train),
    index=X_train.index,
)

In [11]:
# Wrap the encoded test matrix in a sparse-backed DataFrame.
# - Re-deriving it from ct.transform(X_test) keeps this cell idempotent;
#   converting X_test_ohe in place fails on a second run because it is then
#   already a DataFrame.
# - Passing X_test.index keeps the rows aligned with y_test, which still
#   carries the original row labels from the split; the default RangeIndex
#   would silently misalign any index-based join/concat of the two.
X_test_ohe = pd.DataFrame.sparse.from_spmatrix(
    ct.transform(X_test),
    index=X_test.index,
)

In [12]:
## Export the encoded feature matrices and their label series as pickles
## (presumably consumed by later modeling notebooks). Each output path maps to
## the object written there; dict order preserves the original write order.
pickle_targets = {
    '../data/X_train_ohe.pkl': X_train_ohe,
    '../data/X_test_ohe.pkl': X_test_ohe,
    '../data/y_train.pkl': y_train,
    '../data/y_test.pkl': y_test,
}
for out_path, split_obj in pickle_targets.items():
    split_obj.to_pickle(out_path)