### Imports

In [None]:
import pandas as pd
import numpy as np
import math
# Do not truncate wide frames: show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

### Loading and viewing the data

In [None]:
# Read the raw feature tables of the training and the test set
train_raw = pd.read_csv('./Data/training_set_values.csv')
test_raw = pd.read_csv('./Data/test_set_values.csv')

# Flag the origin of every row so the combined frame can be split again later
train_raw['train'] = 1
test_raw['train'] = 0

# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it the
# row labels of both files overlap, which makes label-aligned operations ambiguous.
data = pd.concat([train_raw, test_raw], ignore_index=True)


In [None]:
# Overview: print the summary pandas gives for the combined frame (columns, dtypes, non-null counts)

data.info()

### Global variables

In [None]:
# Version tag appended to the names of the exported csv files
data_version = '0.0'

# Seed numpy's global random state so that the random imputations and the random
# oversampling further down give the same result on every "Restart & Run All"
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Numerical columns in which a zero is replaced by NaN (imputed by median / mean later on)
int_var = ['population','gps_height', 'construction_year']
float_var = ['amount_tsh','longitude']

features_to_drop = ['num_private','recorded_by']

# Numerical columns whose missing values get imputed
null_features = ['longitude','latitude','gps_height','population','construction_year','amount_tsh']

# Grouping columns used for group-wise statistics; 'overall' stands for the whole training set
divisions = ['region', 'ward']
divisions_total = ['ward', 'region', 'overall']

num_features = ['latitude','longitude','operation_years','amount_tsh', 'gps_height', 'population']

cat_features = 'funder installer wpt_name basin subvillage region lga ward public_meeting scheme_management scheme_name permit extraction_type extraction_type_group extraction_type_class management management_group payment payment_type water_quality quality_group quantity quantity_group source source_type source_class waterpoint_type waterpoint_type_group'.split()

# NOTE(review): the cells that fill missing categories and factorize them iterate over
# `selected_cat_features`, but no cell assigns that name. Defaulting to all categorical
# features so the notebook runs on a fresh kernel - confirm the intended selection.
selected_cat_features = list(cat_features)



In [None]:
# Preview the first rows of the combined raw data
data.head()

### Data preparation

In [None]:
# Look for missing-value markers in the numerical columns:
# show the smallest value and the number of rows that are exactly zero

for var in int_var + float_var:
    print('{}:'.format(var))
    zero_rows = data[data[var] == 0]
    display(data[var].min())
    display(len(zero_rows))

# Latitude is checked against a small band around zero instead of exact equality
print('latitude:')
near_zero_latitude = (data['latitude'] > -0.001) & (data['latitude'] < 0.001)
display(data['latitude'].min())
display(len(data[near_zero_latitude]))

In [None]:
# Replace zeros by NaN
#
# The result is assigned back to the column instead of calling
# `data[var].replace(..., inplace=True)`: the chained in-place form works on a
# temporary Series and leaves `data` untouched under pandas copy-on-write.

for var in int_var + float_var:
    data[var] = data[var].replace(0, np.nan)

# Latitudes within +/-0.001 of zero are set to NaN as well
valid_latitude = (data['latitude'] < -0.001) | (data['latitude'] > 0.001)
data['latitude'] = data['latitude'].where(valid_latitude)

In [None]:
# Logarithmic scaling of amount_tsh and population
#
# np.log1p is applied to the whole column at once (NaN stays NaN); this gives the
# same values as a row-wise DataFrame.apply without building a Series for every row.

data['amount_tsh'] = np.log1p(data['amount_tsh'])
data['population'] = np.log1p(data['population'])

In [None]:
# Parse 'date_recorded' and keep only its calendar year
data['date_recorded'] = pd.to_datetime(data['date_recorded']).dt.year

In [None]:
# Separate the combined frame into its training and test rows via the 'train' flag

train = data.loc[data['train'].eq(1)]
test = data.loc[data['train'].eq(0)]

### Imputation of missing values in numerical features

In [None]:
# Duplicate critical columns: one copy per imputation strategy
# (normal distribution and random choice)

for null_feature in null_features:
    for suffix in ('imp_normal', 'imp_random_choice'):
        data['_'.join([null_feature, suffix])] = data[null_feature]

In [None]:
# Preview the frame including the duplicated '<feature>_imp_*' columns
data.head()

#### Imputation of numerical features by normal distribution

In [None]:
# Add columns for mean and standard deviation of critical features based on 'region', 'ward' and 'overall'
#
# All statistics are computed on the training rows only and then broadcast to every row
# of `data`. The group statistics are attached with Series.map (one lookup per row)
# instead of one full-frame boolean scan per unique region / ward value; rows whose
# region / ward does not occur in the training data get NaN.

for null_feature in null_features:
    data['_'.join([null_feature, 'mean', 'overall'])] = train[null_feature].mean()
    data['_'.join([null_feature, 'std', 'overall'])] = train[null_feature].std()
    for division in divisions:
        grouped = train.groupby(division)[null_feature]
        data['_'.join([null_feature, 'mean', division])] = data[division].map(grouped.mean())
        data['_'.join([null_feature, 'std', division])] = data[division].map(grouped.std())

In [None]:
# 1st step: Impute missing values with random numbers generated by normal distribution based on mean, std by 'ward'
# 2nd step (only applied on remaining null values): Impute missing values with random numbers generated by normal distribution based on mean, std by 'region'
# 3rd step (only applied on remaining null values): Impute missing values with random numbers generated by normal distribution based on mean, std by 'overall'
#
# Only the rows that are still missing are touched, and all of their draws are generated
# in one vectorized np.random.normal call. A NaN mean or std yields a NaN draw, so such
# rows stay missing and are picked up by the next, coarser division.

for null_feature in null_features:
    imputed_col = '_'.join([null_feature, 'imp_normal'])
    for division in divisions_total:
        missing = data[imputed_col].isna()
        if missing.any():
            means = data.loc[missing, '_'.join([null_feature, 'mean', division])].to_numpy(dtype=float)
            stds = data.loc[missing, '_'.join([null_feature, 'std', division])].to_numpy(dtype=float)
            data.loc[missing, imputed_col] = np.random.normal(loc=means, scale=stds)
        display('Missing values after imputation in {} by {}: {}'.format(null_feature, division, data[imputed_col].isnull().sum()))

In [None]:
# Preview the frame after the normal-distribution imputation
data.head()

#### Imputation of numerical features by random choice

In [None]:
# Add columns with list of values in corresponding group of 'region' and 'ward', respectively
#
# Every row gets the list of observed (non-NaN) training values of its group, or NaN when
# the group is unknown in the training data or has no observed value. The per-group lists
# are built once and attached with Series.map, instead of re-computing
# train[division].unique() for every single row.

for null_feature in null_features:
    overall_list = train[null_feature].dropna().tolist()
    data['_'.join([null_feature, 'list', 'overall'])] = pd.Series([overall_list] * len(data), index=data.index, dtype=object)
    display(null_feature, 'overall list done')
    for division in divisions:
        feature_name = '_'.join([null_feature, 'list', division])
        lists = train.groupby(division)[null_feature].apply(lambda values: values.dropna().tolist())
        pools = data[division].map(lists)
        # Unknown groups come back as NaN from map; empty lists are turned into NaN as well
        data[feature_name] = pools.apply(lambda pool: pool if isinstance(pool, list) and pool else np.nan)
        display('List for {} by {} created'.format(null_feature, division))
        

In [None]:
# Step 1: fill missing values by drawing from the observed values of the same 'ward'
# Step 2 (remaining nulls only): draw from the observed values of the same 'region'
# Step 3 (remaining nulls only): draw from all observed training values ('overall')

for null_feature in null_features:
    imputed_col = '_'.join([null_feature, 'imp_random_choice'])
    for division in divisions_total:
        pool_col = '_'.join([null_feature, 'list', division])
        # Keep existing values; otherwise draw from the pool, unless the pool itself is NaN
        data[imputed_col] = data.apply(
            lambda row: (
                (np.nan if np.isnan(row[pool_col]).any() else np.random.choice(a=row[pool_col]))
                if np.isnan(row[imputed_col]).any()
                else row[imputed_col]
            ),
            axis=1,
        )
        display('Missing values after imputation in {} by {}: {}'.format(null_feature, division, data[imputed_col].isnull().sum()))
        

#### Imputation of numerical features by mean/median

In [None]:
# Add columns for median of critical integer features based on 'region', 'ward', 'overall'

# Latitude is imputed by the mean like the other float features. The guard keeps the
# list free of duplicates when this cell is executed more than once.
if 'latitude' not in float_var:
    float_var.append('latitude')

# Medians come from the training rows only; Series.map attaches the group median to each
# row (NaN for groups that do not occur in the training data)
for var in int_var:
    data['_'.join([var, 'median', 'overall'])] = train[var].median()
    for division in divisions:
        group_medians = train.groupby(division)[var].median()
        data['_'.join([var, 'median', division])] = data[division].map(group_medians)

In [None]:
# 1st step: Impute missing values with mean and median by 'ward'
# 2nd step (only applied on remaining null values): Impute missing values with mean and median by 'region'
# 3rd step (only applied on remaining null values): Impute missing values with overall mean and median
#
# Float features use the mean columns, integer features the median columns. np.where
# picks the statistic only where the value is missing; it works position-wise on whole
# columns, so no row-wise apply and no index alignment is involved.

for variables, statistic in ((float_var, 'mean'), (int_var, 'median')):
    for var in variables:
        for division in divisions_total:
            fallback = data['_'.join([var, statistic, division])]
            data[var] = np.where(data[var].isna(), fallback, data[var])
            display('Missing values after imputation in {} by {}: {}'.format(var, division, data[var].isnull().sum()))
        

In [None]:
# Number of missing values per column after imputation
data.isnull().sum()

### Grouping of train data (for problem-based preprocessing only)

In [None]:
# Split train and test data
#
# .copy() makes both frames independent of `data`: `train` is modified column-wise in the
# grouping step below, which would otherwise write to a slice (SettingWithCopyWarning).

train = data[data['train'] == 1].copy()
test = data[data['train'] == 0].copy()

In [None]:
# Show number of unique values in each categorical feature, if this number of unique values is larger than 125
# Show number of last 30% of unique values and number of levels with a frequency of only one
#
# features_to_group maps feature -> [number of levels in the last 30%, number of levels seen once]

categorical_features = 'funder installer wpt_name basin subvillage region lga ward recorded_by scheme_management scheme_name extraction_type extraction_type_group extraction_type_class management management_group payment payment_type water_quality quality_group quantity quantity_group source source_type source_class waterpoint_type waterpoint_type_group'.split()
features_to_group = dict()
for feature in categorical_features:
    num = train[feature].nunique()
    if num > 125:
        last_30_pct = int(num*0.3)
        print('{}: {}, 30%: {}'.format(feature, num, last_30_pct))
        # Count the levels that occur exactly once. The group sizes are indexed by level
        # name, so they are compared as a whole instead of being accessed by integer position.
        groups = train.groupby(feature).size()
        freq_one_counter = int((groups == 1).sum())
        features_to_group[feature] = [last_30_pct, freq_one_counter]
        print('   Levels with frequency of one: {}'.format(freq_one_counter))

In [None]:
# Group by aggregating by last 30% or levels with frequency of one - depending on which value of grouped levels is higher
#
# The least common levels of every selected feature are replaced by 'Others' (training rows only).

# Work on an independent frame so the column updates below do not write to a slice of `data`
train = train.copy()

for feature, values in features_to_group.items():
    values_30_pct, values_freq_one = values
    total_levels = train[feature].nunique()
    print('{} - Total levels: {}; 30%: {}, levels with frequency of one: {}'.format(feature, total_levels, values_30_pct, values_freq_one))
    num_to_replace = max(values_freq_one, values_30_pct)
    # value_counts sorts by descending frequency, so the tail holds the rarest levels.
    # Guard against 0, because index[-0:] would select every level.
    level_counts = train[feature].value_counts()
    least_common_values = level_counts.index[-num_to_replace:] if num_to_replace else level_counts.index[:0]
    # Vectorized membership test instead of a row-wise apply with list lookups
    train.loc[train[feature].isin(least_common_values), feature] = 'Others'
    print('{} - Number of grouped levels: {}, Total levels after grouping: {}'.format(feature, num_to_replace, train[feature].nunique()))
    

In [None]:
# Put the training rows and the test rows back into one frame
data = pd.concat(objs=[train, test])

### Balancing of train data (for problem-based preprocessing only)

In [None]:
# Separate the combined frame into training and test rows via the 'train' flag

train = data.loc[data['train'].eq(1)]
test = data.loc[data['train'].eq(0)]

In [None]:
# Attach the target labels to the training rows (joined on the 'id' column)
labels = pd.read_csv('./Data/training_set_labels.csv')
train = pd.merge(train, labels, on="id")
train.head()

In [None]:
# Show distribution of target class instances (number of training rows per 'status_group')
train.groupby('status_group').size()

In [None]:
# One frame per target class that gets oversampled below
needs_repair_df = train[train['status_group'] == 'functional needs repair']
non_functional_df = train[train['status_group'] == 'non functional']

#### Add additional 'non functional' rows to train data set

In [None]:
# Append randomly sampled 'non functional' rows until the class has as many rows as 'functional'
functional_count = len(train[train['status_group'] == 'functional'])
missing_non_functional = functional_count - len(non_functional_df)
additional_non_functionals = non_functional_df.sample(n=missing_non_functional, axis='index')
train = pd.concat(objs=[train, additional_non_functionals])

In [None]:
# Show distribution of target class instances after balancing 'non functional' target class
# (number of training rows per 'status_group')
train.groupby('status_group').size()

#### Add additional 'needs repair' rows to train data set

In [None]:
# Add additional full sets of 'needs repair' target class
#
# missing_need_repairs is the number of rows the class lacks compared to 'functional'.
# As many complete copies of the class as fit into that gap are appended here.
missing_need_repairs = len(train.loc[train['status_group'] == 'functional']) - len(needs_repair_df)
full_copies = missing_need_repairs // len(needs_repair_df)
# Single concat of all copies instead of re-copying the growing frame in every loop pass
train = pd.concat([train] + [needs_repair_df] * full_copies)

In [None]:
# Show distribution of target class instances after first step of balancing 'needs repair' target class
# (number of training rows per 'status_group')
train.groupby('status_group').size()

In [None]:
# Close the remaining gap of the 'needs repair' class with randomly sampled rows
remaining_rows = missing_need_repairs % len(needs_repair_df)
additional_needs_repairs = needs_repair_df.sample(n=remaining_rows, axis='index')
train = pd.concat(objs=[train, additional_needs_repairs])

In [None]:
# Show distribution of target class instances after the second (final) step of balancing the 'needs repair' target class
train.groupby('status_group').size()

In [None]:
# Put the balanced training rows and the test rows back into one frame
data = pd.concat(objs=[train, test])

### Feature generation

In [None]:
# Create new feature that gives information about operational time
# (year of recording minus year of construction), once per imputation variant

imputation_methods = ['normal', 'random_choice']

data['operation_years'] = data['date_recorded'] - data['construction_year']

for method in imputation_methods:
    target_col = '_'.join(['operation_years_imp', method])
    construction_col = '_'.join(['construction_year_imp', method])
    # 'date_recorded' was already reduced to an integer year during data preparation,
    # so it is used directly; calling .dt.year on it raises an AttributeError.
    data[target_col] = (data['date_recorded'] - data[construction_col]).astype(int)
    

In [None]:
# Preview the frame including the new 'operation_years*' columns
data.head()

### Drop irrelevant features

In [None]:
# Drop columns used for imputation and generation of random numbers

# '<feature>_<mean|std|list>_<division>' helper columns of every imputed feature
measures = 'mean std list'.split()
drop_columns = [
    '_'.join([null_feature, measure, division])
    for null_feature in null_features
    for division in divisions_total
    for measure in measures
]

# '<feature>_median_<division>' helper columns of the integer features.
# Each name is added exactly once (there is only one median column per feature and division).
drop_columns.extend(
    '_'.join([var, 'median', division])
    for var in int_var
    for division in divisions_total
)

data.drop(columns=drop_columns, inplace=True)

In [None]:
# Initial feature selection: remove the columns listed in features_to_drop

data.drop(features_to_drop, axis='columns', inplace=True)
data.head()

In [None]:
# (rows, columns) of the combined frame after dropping columns
data.shape

### Preparation of categorical features

#### Replace unique values that only appear in the test data set 
Either by 'Others' or by the most frequent value in the corresponding column of the train data set

In [None]:
# Number of distinct levels of every categorical feature in train, test and the combined frame
for cat in cat_features:
    for label, frame in (('train', train), ('test', test), ('data', data)):
        display('{}, {}: {}'.format(cat, label, frame[cat].nunique()))

In [None]:
# Categorical features with more distinct levels in the combined frame than in the
# training data, i.e. features that have levels which do not appear in the training data

unique_test_features = [
    cat for cat in cat_features
    if train[cat].nunique() < data[cat].nunique()
]
unique_test_features

In [None]:
# Set unique values that appear in the test dataset only to 'Others' if 'Others' appears in the training dataset, else set it to most frequent value in the corresponding column

for feature in unique_test_features:
    train_values = set(train[feature].dropna().unique())
    # Levels that occur in the test data but never in the training data (NaN is not a level)
    unique_test_values = set(test[feature].dropna().unique()) - train_values
    # Series.mode() returns a Series (there can be ties); take its first entry so that a
    # scalar is assigned. Assigning the Series itself would be aligned on the row index.
    replace_value = 'Others' if 'Others' in train_values else train[feature].mode().iloc[0]
    data.loc[data[feature].isin(list(unique_test_values)), feature] = replace_value
        

In [None]:
# Per categorical feature: missing values in the combined frame and
# distinct levels in train, test and the combined frame
for cat in cat_features:
    display('{}, {}: {}'.format(cat, 'NaNs', data[cat].isna().sum()))
    for label, frame in (('train', train), ('test', test), ('overall', data)):
        display('{}, {}: {}'.format(cat, label, frame[cat].nunique()))

#### Replace missing values by 'Missing'

In [None]:
# Replace missing categories by the explicit level 'Missing'.
# `selected_cat_features` has to be defined by an earlier cell.
# The result is assigned back to the column: the chained `data[col].replace(..., inplace=True)`
# works on a temporary Series and leaves `data` untouched under pandas copy-on-write.
for feature in selected_cat_features:
    data[feature] = data[feature].replace(np.nan, 'Missing')

#### Factorize categorical features

In [None]:
# Encode every selected categorical feature as integer codes (one code per distinct level)
for feature in selected_cat_features:
    codes, uniques = pd.factorize(data[feature])
    data[feature] = codes
data.head()

### Split data into train and test data set

In [None]:
# Split by the 'train' flag and remove the flag column.
# drop() returns new frames here, so nothing is modified in place on a slice of `data`
# (the in-place drop on a slice triggers SettingWithCopyWarning).
train_df = data[data["train"] == 1].drop(columns=["train"])
test_df = data[data["train"] == 0].drop(columns=["train"])

### Save data to csv files

In [None]:
# Write both frames to versioned csv files (without the index column)
for path_prefix, frame in (("./Data/train_cleaned_v", train_df), ("./Data/test_cleaned_v", test_df)):
    frame.to_csv(path_prefix + data_version + ".csv", index=False)