### Imports

In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction import FeatureHasher
import category_encoders as ce
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

### Loading and viewing the data

In [None]:
# Read the raw feature tables for the training and the test set.
train_raw = pd.read_csv('./Data/training_set_values.csv')
test_raw = pd.read_csv('./Data/test_set_values.csv')

# Flag the origin of each row so the combined frame can be split again later.
train_raw['train'] = 1
test_raw['train'] = 0

# Both frames come with their own 0..n-1 index; ignore_index=True gives the
# combined frame unique row labels so label-based assignments stay unambiguous.
data = pd.concat([train_raw, test_raw], ignore_index=True)

In [None]:
# Overview: print column names, non-null counts and dtypes of the combined frame

data.info()

### Global variables

In [None]:
# Version tag used in the names of the exported CSV files
data_version = '0.0'

# Numerical columns in which a value of 0 is treated as "missing"
int_var = ['population','gps_height', 'construction_year']
float_var = ['amount_tsh','longitude']

features_to_drop = ['num_private','recorded_by']

# Numerical columns that receive the additional imputation variants
null_features = ['longitude','latitude','gps_height','population','construction_year','amount_tsh']
# no calculations for num_private since it is dropped later (too many missing values)

# Grouping levels used for imputation; 'overall' means the whole training set
divisions = ['region', 'ward']
divisions_total = ['ward', 'region', 'overall']

# These will be scaled
num_features = ['latitude','longitude','operation_years','amount_tsh', 'gps_height', 'population']

# These will be factorized: every loaded column that is neither numerical,
# dropped, nor the train/test flag.
# Built by exclusion instead of list.remove(): 'operation_years' is only
# created further down in the notebook, so removing it from the loaded
# column list raised a ValueError here.
excluded_features = set(num_features) | set(features_to_drop) | {'train'}
cat_features = [column for column in data.columns if column not in excluded_features]

### Data preparation

In [None]:
# Report, for every numerical column, its minimum and how many entries are exactly zero

for var in int_var + float_var:
    column = data[var]
    print(f'{var}:')
    display(column.min())
    display(int((column == 0).sum()))

# latitude uses a small band around zero instead of an exact comparison
latitude = data['latitude']
print('latitude:')
display(latitude.min())
display(int((latitude.abs() < 0.001).sum()))

In [None]:
# Replace zeros by NaN
#
# The results are assigned back to the frame: calling replace()/where() with
# inplace=True on a column selection (data[var]) is a chained in-place
# operation, which emits a FutureWarning on recent pandas versions and no
# longer modifies `data` once Copy-on-Write is active.

for var in int_var + float_var:
    data[var] = data[var].replace(0, np.nan)

# Latitudes within [-0.001, 0.001] are treated as missing as well
latitude = data['latitude']
data['latitude'] = latitude.where((latitude < -0.001) | (latitude > 0.001))

In [None]:
# Logarithmic scaling (log(1 + x)) of amount_tsh and population.
# Applied to the whole column at once; the former row-wise DataFrame.apply
# produced the same values with one Python call per row.

for column in ('amount_tsh', 'population'):
    data[column] = np.log1p(data[column])

In [None]:
# Separate the combined frame again using the 'train' flag (1 = train, 0 = test)

train = data.loc[data['train'].eq(1)]
test = data.loc[data['train'].eq(0)]

### Imputation of missing values in numerical features

In [None]:
# Duplicate critical columns: one copy per additional imputation method
# (normal distribution and random choice)

for null_feature in null_features:
    for suffix in ('imp_normal', 'imp_random_choice'):
        data['_'.join([null_feature, suffix])] = data[null_feature]

In [None]:
# Preview the first rows, including the new *_imp_normal / *_imp_random_choice columns
data.head()

#### Imputation of numerical features by normal distribution

In [None]:
# Add columns for mean and standard deviation of critical features based on 'region', 'ward' and 'overall'.
# All statistics are computed on the training rows only and then attached to every row of `data`.

for null_feature in null_features:
    data['_'.join([null_feature, 'mean', 'overall'])] = train[null_feature].mean()
    data['_'.join([null_feature, 'std', 'overall'])] = train[null_feature].std()
    for division in divisions:
        grouped = train.groupby(division)[null_feature]
        # map() looks up each row's group statistic in a single pass; groups
        # that do not occur in the training data yield NaN, exactly as with
        # the former per-value .loc assignment (one full scan per group).
        data['_'.join([null_feature, 'mean', division])] = data[division].map(grouped.mean())
        data['_'.join([null_feature, 'std', division])] = data[division].map(grouped.std())

In [None]:
# Cascaded imputation with normally distributed random numbers:
#   1. use mean/std of the row's 'ward'
#   2. for values still missing, use mean/std of the row's 'region'
#   3. for values still missing, use the overall mean/std

def _impute_from_normal(row, target_col, mean_col, std_col):
    """Return the row's current value, or a normal draw if it is missing."""
    current = row[target_col]
    if not math.isnan(current):
        return current
    return np.random.normal(loc=row[mean_col], scale=row[std_col])


for null_feature in null_features:
    target_col = '_'.join([null_feature, 'imp_normal'])
    for division in divisions_total:
        mean_col = '_'.join([null_feature, 'mean', division])
        std_col = '_'.join([null_feature, 'std', division])
        data[target_col] = data.apply(_impute_from_normal, axis=1, args=(target_col, mean_col, std_col))
        remaining = data[target_col].isnull().sum()
        display('Missing values after imputation in {} by {}: {}'.format(null_feature, division, remaining))

In [None]:
# Preview the first rows after the normal-distribution imputation
data.head()

#### Imputation of numerical features by random choice

In [None]:
# Add columns holding, per row, the list of non-missing training values of each
# critical feature: once for the whole training set ('overall') and once per
# 'region' / 'ward' group. Rows whose group has no usable value get NaN.

for null_feature in null_features:
    observed = train.loc[train[null_feature].notna()]

    overall_list = observed[null_feature].tolist()
    data['_'.join([null_feature, 'list', 'overall'])] = pd.Series(
        [overall_list] * len(data), index=data.index, dtype=object
    )
    display(null_feature, 'overall list done')

    for division in divisions:
        feature_name = '_'.join([null_feature, 'list', division])
        # Build each group's list once from the non-missing training rows.
        # Groups without any observed value (or unknown to the training set)
        # are absent from the lookup, so map() yields NaN for them. This
        # replaces a row-wise apply that recomputed train[division].unique()
        # and re-filtered the list for every single row.
        group_lists = observed.groupby(division)[null_feature].apply(list)
        data[feature_name] = data[division].map(group_lists)
        display('List for {} by {} created'.format(null_feature, division))
        

In [None]:
# Cascaded imputation by drawing from the empirical distribution:
#   1. draw from the values observed in the row's 'ward'
#   2. for values still missing, draw from the row's 'region'
#   3. for values still missing, draw from all observed training values

def _impute_from_choice(row, target_col, list_col):
    """Return the row's current value, or a random pick from its candidate list."""
    current = row[target_col]
    if not np.isnan(current).any():
        return current
    candidates = row[list_col]
    if np.isnan(candidates).any():
        # no candidate list available on this level
        return np.nan
    return np.random.choice(a=candidates)


for null_feature in null_features:
    target_col = '_'.join([null_feature, 'imp_random_choice'])
    for division in divisions_total:
        list_col = '_'.join([null_feature, 'list', division])
        data[target_col] = data.apply(_impute_from_choice, axis=1, args=(target_col, list_col))
        remaining = data[target_col].isnull().sum()
        display('Missing values after imputation in {} by {}: {}'.format(null_feature, division, remaining))
        

#### Imputation of numerical features by mean/median

In [None]:
# Add columns for median of critical integer features based on 'region', 'ward', 'overall'

# latitude is imputed by its mean together with the other float features;
# the guard keeps the list free of duplicates when this cell is re-run.
if 'latitude' not in float_var:
    float_var.append('latitude')

for var in int_var:
    data['_'.join([var, 'median', 'overall'])] = train[var].median()
    for division in divisions:
        # Medians come from the training rows only; map() attaches them to all
        # rows in one pass and leaves NaN for groups unknown to the training set.
        group_medians = train.groupby(division)[var].median()
        data['_'.join([var, 'median', division])] = data[division].map(group_medians)

In [None]:
# 1st step: Impute missing values with mean (float features) and median (integer features) by 'ward'
# 2nd step (only applied on remaining null values): Impute missing values with mean and median by 'region'
# 3rd step (only applied on remaining null values): Impute missing values with overall mean and median
#
# The fill is vectorised with np.where, which works positionally and needs no
# index alignment; it replaces one Python-level call per row.

for statistic, variables in (('mean', float_var), ('median', int_var)):
    for var in variables:
        for division in divisions_total:
            fill_col = '_'.join([var, statistic, division])
            data[var] = np.where(data[var].isna(), data[fill_col], data[var])
            display('Missing values after imputation in {} by {}: {}'.format(var, division, data[var].isnull().sum()))
        

In [None]:
# Number of missing values per column after the imputation steps
data.isnull().sum()

#### Load imputed data from previous data files and add mean/median imputation as additional columns
Can be used if only categorical features will be changed

In [None]:
# Disabled cell: the statements are wrapped in a string literal and are not executed.
# They would read the previously exported v0.1 train/test files and concatenate them.
'''imputed_train = pd.read_csv('./Data/train_cleaned_v0.1.csv')
imputed_test = pd.read_csv('./Data/test_cleaned_v0.1.csv')

imputed_data = pd.concat([imputed_train, imputed_test])'''
#imputed_data.head()

In [None]:
# Disabled cell (string literal, not executed): would collect 'id', the six numerical
# features and their '_imp_normal' / '_imp_random_choice' variants in `keep_columns`.
'''keep_columns = 'id longitude latitude gps_height amount_tsh population construction_year'.split()
for col in 'longitude latitude gps_height amount_tsh population construction_year'.split():
    keep_columns.append('_'.join([col, 'imp_normal']))
    keep_columns.append('_'.join([col, 'imp_random_choice']))'''

In [None]:
#imputed_data = imputed_data[keep_columns]
#imputed_data.head()

In [None]:
# Disabled cell (string literal, not executed): would build new column names in which
# every column except 'id' and those containing 'imp' gets the suffix '_imp_mean-median'.
'''columns = imputed_data.columns
new_columns = list()
for col in columns:
    if 'imp' in col or col == 'id':
        new_columns.append(col)
    else:
        new_columns.append('_'.join([col, 'imp_mean-median']))'''

In [None]:
# Disabled cell (string literal, not executed): would apply `new_columns` to `imputed_data`.
'''imputed_data.columns = new_columns
imputed_data.head()'''

In [None]:
#data = data.merge(imputed_data, on='id')

In [None]:
# Disabled cell (string literal, not executed): would overwrite each critical feature with
# its '<feature>_imp_mean-median' column and drop that helper column afterwards.
'''for feature in null_features:
    data[feature] = data['_'.join([feature, 'imp', 'mean-median'])]
    data.drop('_'.join([feature, 'imp', 'mean-median']), inplace=True, axis=1)'''

In [None]:
#data.isnull().sum()

### Feature generation

In [None]:
# Derive the operational time: year of recording minus year of construction,
# once for the mean/median-imputed column and once per additional imputation method

imputation_methods = ['normal', 'random_choice']
data['date_recorded'] = pd.to_datetime(data['date_recorded'])
recorded_year = data['date_recorded'].dt.year

data['operation_years'] = recorded_year - data['construction_year']

for method in imputation_methods:
    years_col = '_'.join(['operation_years_imp', method])
    construction_col = '_'.join(['construction_year_imp', method])
    # the imputed construction years are floats, so cast the difference to int
    data[years_col] = (recorded_year - data[construction_col]).astype(int)

In [None]:
# Preview the first rows, including the new operation_years columns
data.head()

### Scaling of numerical features

In [None]:
# Scale numerical features
# Disabled: the code is wrapped in a string literal and is not executed. It would min-max
# scale `num_features` in every frame of `split`.
# NOTE(review): `split` is not defined anywhere in the visible notebook.
'''
scaler = MinMaxScaler()

for s in split:
    s[num_features] = scaler.fit_transform(s[num_features])
'''

### Drop irrelevant features

In [None]:
# Drop the helper columns that were only needed for imputation and for the
# generation of random numbers (mean / std / list per critical feature and
# median per integer feature, each for 'ward', 'region' and 'overall').

measures = 'mean std list'.split()

drop_columns = [
    '_'.join([null_feature, measure, division])
    for null_feature in null_features
    for division in divisions_total
    for measure in measures
]

# One median column exists per integer feature and division. The former extra
# loop over `measures` added every one of these names three times.
drop_columns += [
    '_'.join([var, 'median', division])
    for var in int_var
    for division in divisions_total
]

data.drop(columns=drop_columns, inplace=True)

In [None]:
# Remove the columns listed in `features_to_drop` (redundant or without apparent impact)

data.drop(features_to_drop, axis='columns', inplace=True)
data.head()

### Preparation of categorical features

No grouping and factorizing of categorical features in data version 0.1; missing values replaced by 'Missing' only

#### Replace unique values that only appear in the test data set 
Either by 'Others' or by the most frequent value in the corresponding column of the train data set

In [None]:
# Number of distinct values of every categorical feature in train, test and the combined frame
for cat in cat_features:
    for label, frame in (('train', train), ('test', test), ('data', data)):
        display('{}, {}: {}'.format(cat, label, frame[cat].nunique()))

In [None]:
# Features for which the combined data holds more distinct values than the training
# data, i.e. the test set contains values that never appear in training

unique_test_features = [
    cat for cat in cat_features
    if train[cat].nunique() < data[cat].nunique()
]
unique_test_features

In [None]:
# Set values that appear in the test dataset only to 'Others' if 'Others' appears in the
# training dataset, else set them to the most frequent training value of that column.

for feature in unique_test_features:
    train_values = set(train[feature].dropna().unique())
    # Missing values are not categories of their own, so they are left out here
    # (the former equality comparison never matched NaN either).
    unseen_values = [
        value for value in test[feature].dropna().unique()
        if value not in train_values
    ]
    if not unseen_values:
        continue

    if 'Others' in train_values:
        replace_value = 'Others'
    else:
        # mode() returns a Series (there can be ties). Assigning that Series
        # through .loc aligns on the index and writes NaN instead of the modal
        # value, so take the first mode as a scalar.
        modes = train[feature].mode()
        if modes.empty:
            # column without any non-missing training value: nothing to map to
            continue
        replace_value = modes.iloc[0]

    data.loc[data[feature].isin(unseen_values), feature] = replace_value

In [None]:
# Missing-value count and number of distinct values per categorical feature
for cat in cat_features:
    display('{}, {}: {}'.format(cat, 'NaNs', data[cat].isnull().sum()))
    for label, frame in (('train', train), ('test', test), ('overall', data)):
        display('{}, {}: {}'.format(cat, label, frame[cat].nunique()))

#### Group feature categories other than top categories into 'Others'

In [None]:
# Reduce dimension of categorical variables with high dimensionality
# Disabled: the code is wrapped in a string literal and is not executed. It would derive
# a list of top values per listed feature from training counts and set every other
# non-missing value in `data` to 'Others'.

'''dim_red_features = 'funder installer scheme_name lga ward'.split()
for feature in dim_red_features:
    train = train.assign(count = train.groupby(feature)[feature].transform('count')).sort_values(by = ['count',feature], ascending = [False,True])
    top_values = train.drop_duplicates('count')
    top_values = list(top_values.nlargest(10, 'count')[feature])
    data[feature] = data[feature].apply(lambda x: x if (x in top_values) | (str(x) == 'nan') else 'Others')'''

In [None]:
# Disabled cell (string literal, not executed): would display NaN counts and distinct-value
# counts per categorical feature.
'''for cat in cat_features:
    display('{}, {}: {}'.format(cat, 'NaNs', data[cat].isnull().sum()))
    display('{}, {}: {}'.format(cat, 'train', train[cat].nunique()))
    display('{}, {}: {}'.format(cat, 'test', test[cat].nunique()))
    display('{}, {}: {}'.format(cat, 'overall', data[cat].nunique()))'''

#### Replace missing values by 'Missing'

In [None]:
# Fill missing values in categorical features by 'Missing'.
# The result is assigned back: replace(..., inplace=True) on a column selection
# is a chained in-place call that warns on recent pandas versions and leaves
# `data` untouched once Copy-on-Write is active.

for feature in cat_features:
    data[feature] = data[feature].replace(np.nan, 'Missing')

In [None]:
# Disabled cell (string literal, not executed): value counts of three categorical features.
'''data['funder'].value_counts()
data['installer'].value_counts()
data['scheme_name'].value_counts()'''

In [None]:
# Preview the first rows after filling missing categorical values
data.head()

#### Convert categorical features into numerical features by adding a column with their probability for each target class

In [None]:
# Disabled cell (string literal, not executed): would read the training labels and merge
# them into `train` on 'id'.
'''train_labels = pd.read_csv('./Data/training_set_labels.csv')
train = train.merge(train_labels, on="id")
train.head()'''

In [None]:
# Disabled cell (string literal, not executed): would write per-value occurrence counts to
# train['count'].
# NOTE(review): the same column is overwritten on every iteration, so only the counts of
# the last feature in cat_features would remain.
'''for feature in cat_features:
    train['count'] = train.groupby(feature)[feature].transform('count')
train.head()'''

In [None]:
# Disabled cell (string literal, not executed): helper that would return the count of the
# row's feature value within `status_group` divided by the row's train['count'] entry,
# or 0 when a lookup does not yield exactly one item (ValueError).
'''def get_percentage(groups, row, status_group, feature):
    try:
        sg_count = groups['count'].loc[(groups[feature] == row[feature]) & (groups['status_group'] == status_group)].item()
        total_count = train['count'].loc[train['id'] == row['id']].item()
        return sg_count / total_count
    except ValueError:
        return 0'''

In [None]:
# Disabled cell (string literal, not executed): would add one 'pct_<feature>_<status_group>'
# column per categorical feature and status group using get_percentage().
'''status_groups = 'functional,non functional,functional needs repair'.split(',')
for feature in cat_features:
    groups = pd.DataFrame({'count': train.groupby([feature, 'status_group']).size()}).reset_index()
    for status_group in status_groups:
        data['_'.join(['pct', feature, status_group])] = data.apply(lambda row: get_percentage(groups=groups, row=row, status_group=status_group, feature=feature), axis=1)
    display(feature + ' done')'''
        

In [None]:
#data.head()


#### Factorize categorical features

In [None]:
# Encode every categorical feature as integer codes (order of first appearance)
# NOTE(review): cat_features is built from all loaded columns, so it presumably
# still contains 'id', whose original values would be replaced by codes here - confirm.

for feature in cat_features:
    codes, _ = pd.factorize(data[feature])
    data[feature] = codes
data.head()

#### OneHotEncoding

In [None]:
# Remove 'subvillage' from cat_features because its number of unique values leads to too many dummies
#cat_features.remove('subvillage')
# Get dummies for categorical features and add them to dataframe
#data = pd.concat([data, pd.get_dummies(data[cat_features], dummy_na=True)], axis=1)

In [None]:
#data.shape

In [None]:
#data.head()

#### Binary Encoding

In [None]:
#ce_bin = ce.BinaryEncoder(cols=cat_features)
#data = ce_bin.fit_transform(data)

In [None]:
#data.shape

In [None]:
#data.head()

#### Feature Hashing

In [None]:
#ce_hash = ce.HashingEncoder(cols=cat_features, n_components=len(cat_features * 20))
#data = ce_hash.fit_transform(data)

In [None]:
#data.head()

### Split data into train and test data set

In [None]:
# Split the prepared data by the 'train' flag and remove the flag from both parts.
# drop() returns a new frame here; dropping in place on a filtered selection
# triggers SettingWithCopyWarning and is unreliable under Copy-on-Write.
train_df = data[data["train"] == 1].drop(columns=["train"])
test_df = data[data["train"] == 0].drop(columns=["train"])

### Save data to csv files

In [None]:
# Write both prepared sets to versioned CSV files without the row index
for prefix, frame in (("train", train_df), ("test", test_df)):
    pd.DataFrame(frame).to_csv(f"./Data/{prefix}_cleaned_v{data_version}.csv", index=False)