In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error,accuracy_score
from sklearn.metrics import roc_curve, auc, recall_score, precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import PolynomialFeatures
import warnings
import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel,SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression,f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
import sklearn
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

pd.options.display.max_columns = 100
pd.options.display.max_seq_items = 300
pd.options.display.max_rows = 300

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train = pd.read_csv('train_features.csv')
test_features = pd.read_csv('test_features.csv')
train_labels = pd.read_csv('train_labels.csv')
sample_submission = pd.read_csv('sample_submission.csv')

train.shape, test_features.shape, train_labels.shape, sample_submission.shape

((59400, 40), (14358, 40), (59400, 2), (14358, 2))

In [3]:
train['gps_height'].replace(0.0,np.nan,inplace=True)
test_features['gps_height'].replace(0.0,np.nan,inplace=True)
train['population'].replace(0.0,np.nan,inplace=True)
test_features['population'].replace(0.0,np.nan,inplace=True)
train['amount_tsh'].replace(0.0,np.nan,inplace=True)
test_features['amount_tsh'].replace(0.0,np.nan,inplace=True)
train['latitude'].replace(0.0,np.nan,inplace=True)
test_features['latitude'].replace(0.0,np.nan,inplace=True)
train['longitude'].replace(0.0,np.nan,inplace=True)
test_features['longitude'].replace(0.0,np.nan,inplace=True)
train['construction_year'].replace(0.0,np.nan,inplace=True)
test_features['construction_year'].replace(0.0,np.nan,inplace=True)

In [4]:
train['construction_year'].fillna(train.groupby(['region', 'district_code'])['construction_year'].transform('median'), inplace=True)
train['construction_year'].fillna(train.groupby(['region'])['construction_year'].transform('median'), inplace=True)
train['construction_year'].fillna(train.groupby(['district_code'])['construction_year'].transform('median'), inplace=True)
train['construction_year'].fillna(train['construction_year'].median(), inplace=True)
test_features['construction_year'].fillna(test_features.groupby(['region', 'district_code'])['construction_year'].transform('median'), inplace=True)
test_features['construction_year'].fillna(test_features.groupby(['region'])['construction_year'].transform('median'), inplace=True)
test_features['construction_year'].fillna(test_features.groupby(['district_code'])['construction_year'].transform('median'), inplace=True)
test_features['construction_year'].fillna(test_features['construction_year'].median(), inplace=True)


train['age'] = (2019 - train['construction_year']).astype(int)
test_features['age'] = (2019 - test_features['construction_year']).astype(int)

train_days_since = np.array(train['date_recorded'].values, dtype='datetime64')
test_days_since = np.array(test_features['date_recorded'].values,dtype='datetime64')

train_birth = round(train['construction_year'])
test_birth = round(test_features['construction_year'])

train_years_since = np.datetime_as_string(train_days_since, unit='Y')
test_years_since = np.datetime_as_string(test_days_since, unit='Y')

train_years_since = train_years_since.astype(int)
test_years_since = test_years_since.astype(int)

in_train_years = []
in_test_years = []

for i in range(0,len(train_years_since)):
    x = train_years_since[i] - train_birth[i]
    in_train_years.append(x)
    

for i in range(0,len(test_years_since)):
    x = test_years_since[i] - test_birth[i]
    in_test_years.append(x)
    
train['years_until_record'] = in_train_years
test_features['years_until_record'] = in_test_years

train_year_record = []
test_year_record = []

for i in range(0,len(train['construction_year'])):
    x = train['construction_year'][i] + train['years_until_record'][i]
    train_year_record.append(x)
    
for i in range(0,len(test_features['construction_year'])):    
    x = test_features['construction_year'][i] + test_features['years_until_record'][i]
    test_year_record.append(x)
    
train['year_recorded'] = train_year_record
test_features['year_recorded'] = test_year_record

train['month'] = train['date_recorded'].apply(lambda x: int(x.split('-')[1]))
test_features['month'] = test_features['date_recorded'].apply(lambda x: int(x.split('-')[1]))

# https://www.expertafrica.com/tanzania/weather-and-climate

def them_seasons(month):
    if month in [4,5]:
        return 'heavy rain'
    elif month in [10, 1, 2]:
        return 'humid dry'
    elif month == [3,11,12]:
        return 'sporadic rain'
    elif month in [6,7,8,9]:
        return 'cool dry'
    else:
        return 'some rain'

train['season'] = train['month'].apply(them_seasons)
test_features['season'] = test_features['month'].apply(them_seasons)

train['region_code'] = train['region_code'].astype(str)
test_features['region_code'] = test_features['region_code'].astype(str)

train['district_code'] = train['district_code'].astype(str)
test_features['district_code'] = test_features['district_code'].astype(str)

train['population_isnull'] = train['population'] == 0
train['subvillage_isnull'] = train['subvillage'].isnull()
train['permit_isnull'] = train['permit'].isnull()

test_features['population_isnull'] = test_features['population'] == 0
test_features['subvillage_isnull'] = test_features['subvillage'].isnull()
test_features['permit_isnull'] = test_features['permit'].isnull()

train['subvillage'] = train['subvillage'].str.lower()
most_sub = train['subvillage'].str.lower().value_counts().index[:30]
train['lga'] = train['lga'].str.lower()
most_lga = train['lga'].str.lower().value_counts().index[:30]
train['wpt_name'] = train['wpt_name'].str.lower()
most_wpt = train['wpt_name'].str.lower().value_counts().index[:30]

train['subvillage'] = ['other' if sub not in most_sub else sub for sub in 
                       train['subvillage'].str.lower()]

train['lga'] = ['other' if i not in most_lga else i for i in 
                       train['lga'].str.lower()]

train['wpt_name'] = ['other' if wpt not in most_wpt else wpt for wpt in 
                       train['wpt_name'].str.lower()]

train['is_rural'] = [lga.find('rural') != -1 for lga in train['lga']]
train['is_urban'] = [lga.find('urban') != -1 for lga in train['lga']]

test_features['subvillage'] = test_features['subvillage'].str.lower()
most_sub = test_features['subvillage'].str.lower().value_counts().index[:30]
test_features['lga'] = test_features['lga'].str.lower()
most_lga = test_features['lga'].str.lower().value_counts().index[:30]
test_features['wpt_name'] = test_features['wpt_name'].str.lower()
most_wpt = test_features['wpt_name'].str.lower().value_counts().index[:30]

test_features['subvillage'] = ['other' if sub not in most_sub else sub for sub in 
                       test_features['subvillage'].str.lower()]

test_features['lga'] = ['other' if i not in most_lga else i for i in 
                       test_features['lga'].str.lower()]

test_features['wpt_name'] = ['other' if wpt not in most_wpt else wpt for wpt in 
                       test_features['wpt_name'].str.lower()]

test_features['is_rural'] = [lga.find('rural') != -1 for lga in test_features['lga']]
test_features['is_urban'] = [lga.find('urban') != -1 for lga in test_features['lga']]

def top_installers(x):
    
    unknown = ['0', 'unknown'] 
    government = ['government ', 'government', 'dwe', 'hesawa', 'rwe', 'central government', 'lga',
                 'district council', 'gover', 'gove', 'gov', 'district water department',
                 'sengerema water department', 'distri', 'centr', 'distric water department',
                 'tasaf']
    community = ['community', 'commu', 'villagers', 'twesa']    
    religious = ['church of disciples', 'kkkt', 'world vision', 'rc church', 'rc', 'tcrs',
                'dmdd'] 
    international = ['norad', 'fini water', 'danida', 'danid', 'ces', 'kuwait',
                    'finw']  
    private = ['private', 'privat', 'kiliwater', 'wedeco']     
    aid = ['roman', 'amref', 'world bank', 'unicef', 'oxfam']
    
    if x in unknown:
        return 'unknown'
    
    if x in government:
        return 'government'

    if x in community:
        return 'community'

    if x in religious:
        return 'religious'
    
    if x in international:
        return 'international'

    if x in private:
        return 'private'

    if x in aid:
        return 'aid'
    
    return 'other'

def top_funders(x):
    
    unknown = ['0', 'unknown','no']  
    government = ['government ', 'government', 'dwe', 'hesawa', 'rwe', 'central government', 'lga',
                 'district council', 'gover', 'gove', 'gov', 'district water department',
                 'sengerema water department', 'distri', 'centr', 'distric water department',
                 'tasaf', 'government of tanzania','ministry Of water','water','lawatefuka water supply']
    community = ['community', 'commu', 'villagers', 'twesa']
    religious = ['church of disciples', 'kkkt', 'world vision', 'rc church', 'rc', 'tcrs',
                'dmdd','mission','kkkkt_makwale']
    international = ['norad', 'fini water', 'danida', 'danid', 'ces', 'kuwait',
                    'finw','netherlands','germany republi','jaica','hifab','dwsp',
                    'amref','jica','shipo','nethalan', 'swedish']
    private = ['private', 'privat', 'kiliwater', 'wedeco']
    aid = ['roman', 'amref', 'world bank', 'unicef', 'oxfam','rwssp','wateraid',
           'rural water supply and sanitat','adb','oxfarm','dh','rc','go','concern world wide']
    if x in unknown:
        return 'unknown'
    
    if x in government:
        return 'government'

    if x in community:
        return 'community'

    if x in religious:
        return 'religious'
    
    if x in international:
        return 'international'

    if x in private:
        return 'private'

    if x in aid:
        return 'aid'
    
    return 'other'

train['funder'] = train['funder'].str.lower().apply(lambda x: top_funders(x))
train['installer'] = train['installer'].str.lower().apply(lambda x: top_installers(x))
test_features['funder'] = test_features['funder'].str.lower().apply(lambda x: top_funders(x))
test_features['installer'] = test_features['installer'].str.lower().apply(lambda x: top_installers(x))

In [5]:
train['gps_height'].fillna(train.groupby(['region', 'district_code'])['gps_height'].transform('mean'), inplace=True)
train['gps_height'].fillna(train.groupby(['region'])['gps_height'].transform('mean'), inplace=True)
train['gps_height'].fillna(train['gps_height'].mean(), inplace=True)
train['population'].fillna(train.groupby(['region', 'district_code'])['population'].transform('median'), inplace=True)
train['population'].fillna(train.groupby(['region'])['population'].transform('median'), inplace=True)
train['population'].fillna(train['population'].median(), inplace=True)
train['amount_tsh'].fillna(train.groupby(['region', 'district_code'])['amount_tsh'].transform('median'), inplace=True)
train['amount_tsh'].fillna(train.groupby(['region'])['amount_tsh'].transform('median'), inplace=True)
train['amount_tsh'].fillna(train['amount_tsh'].median(), inplace=True)
train['latitude'].fillna(train.groupby(['region', 'district_code'])['latitude'].transform('mean'), inplace=True)
train['longitude'].fillna(train.groupby(['region', 'district_code'])['longitude'].transform('mean'), inplace=True)
train['longitude'].fillna(train.groupby(['region'])['longitude'].transform('mean'), inplace=True)

test_features['gps_height'].fillna(test_features.groupby(['region', 'district_code'])['gps_height'].transform('mean'), inplace=True)
test_features['gps_height'].fillna(test_features.groupby(['region'])['gps_height'].transform('mean'), inplace=True)
test_features['gps_height'].fillna(test_features['gps_height'].mean(), inplace=True)
test_features['population'].fillna(test_features.groupby(['region', 'district_code'])['population'].transform('median'), inplace=True)
test_features['population'].fillna(test_features.groupby(['region'])['population'].transform('median'), inplace=True)
test_features['population'].fillna(test_features['population'].median(), inplace=True)
test_features['amount_tsh'].fillna(test_features.groupby(['region', 'district_code'])['amount_tsh'].transform('median'), inplace=True)
test_features['amount_tsh'].fillna(test_features.groupby(['region'])['amount_tsh'].transform('median'), inplace=True)
test_features['amount_tsh'].fillna(test_features['amount_tsh'].median(), inplace=True)
test_features['latitude'].fillna(test_features.groupby(['region', 'district_code'])['latitude'].transform('mean'), inplace=True)
test_features['longitude'].fillna(test_features.groupby(['region', 'district_code'])['longitude'].transform('mean'), inplace=True)
test_features['longitude'].fillna(test_features.groupby(['region'])['longitude'].transform('mean'), inplace=True)

train['funder'].fillna('other',inplace=True)
train['installer'].fillna('other',inplace=True)
train['subvillage'].fillna('other',inplace=True)
train['public_meeting'].fillna(False,inplace=True)
train['scheme_management'].fillna('None',inplace=True)
train['scheme_name'].fillna('None',inplace=True)
train['permit'].fillna(False,inplace=True)
test_features['funder'].fillna('other',inplace=True)
test_features['installer'].fillna('other',inplace=True)
test_features['subvillage'].fillna('other',inplace=True)
test_features['public_meeting'].fillna(False,inplace=True)
test_features['scheme_management'].fillna('None',inplace=True)
test_features['scheme_name'].fillna('None',inplace=True)
test_features['permit'].fillna(False,inplace=True)

In [None]:
drop_these=[
    'date_recorded',
    'recorded_by',
    'ward',
    'num_private',
    'waterpoint_type_group',
    'extraction_type_group',
    'extraction_type_class',
    'payment_type',
    'quality_group',
    'quantity_group',
    'id',
    'source_type',
    'source_class',    
    'public_meeting',
    'scheme_name',
    'region',
    'management_group'
]
train.drop(columns=drop_these,inplace=True)
test_features.drop(columns=drop_these,inplace=True)

train.shape, test_features.shape

((59400, 33), (14358, 33))

In [None]:
X_train = train.copy()
X_test = test_features.copy()
y_train = train_labels['status_group']

ohe = ce.OneHotEncoder(use_cat_names=True)
ohe.fit(X_train,y_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)

In [None]:
X_train.shape, X_test.shape

((59400, 281), (14358, 281))

In [None]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train.values)

In [None]:
X_train,X_val,y_train,y_val = train_test_split(
    X_train, y_train_le, test_size = 0.2, stratify=y_train_le,shuffle=True,
    random_state= 369
)

train = xgb.DMatrix(X_train,label=y_train)
val = xgb.DMatrix(X_val,label=y_val)
test = xgb.DMatrix(X_test)

In [None]:
param_grid = {   
                 'num_class' : 3,
                 'scale_pos_weight' : 1,
                 'max_depth': 80, # very deep tree
                 'eta': 0.1,
                 'n_thread' : 8,
                 'colsample_bytree' : 0.5,
                 'subsample' : 0.3,
                 'silent': 1, 
                 'n_estimators' : 4000, # lots of estimators
                 'reg_alpha' : 0.3,
                 'gamma' : 1,
                 'objective': 'multi:softprob',
                 'eval_metric' : 'merror'
             } 

num_rounds = 200

evals = [(train, 'train'), (val, 'validation')]

boost = xgb.train(param_grid, train, num_rounds, evals, early_stopping_rounds=25)
boost.save_model('xgboost_model_1.model')

[0]	train-merror:0.213152	validation-merror:0.239057
Multiple eval metrics have been passed: 'validation-merror' will be used for early stopping.

Will train until validation-merror hasn't improved in 25 rounds.
[1]	train-merror:0.194613	validation-merror:0.226094
[2]	train-merror:0.186406	validation-merror:0.218687
[3]	train-merror:0.183249	validation-merror:0.212542
[4]	train-merror:0.180492	validation-merror:0.211111
[5]	train-merror:0.180429	validation-merror:0.210438
[6]	train-merror:0.177862	validation-merror:0.207323
[7]	train-merror:0.173569	validation-merror:0.206145
[8]	train-merror:0.173695	validation-merror:0.204966
[9]	train-merror:0.171928	validation-merror:0.206229
[10]	train-merror:0.170581	validation-merror:0.204798
[11]	train-merror:0.170013	validation-merror:0.205219
[12]	train-merror:0.16896	validation-merror:0.201936
[13]	train-merror:0.167003	validation-merror:0.201263
[14]	train-merror:0.165383	validation-merror:0.200253
[15]	train-merror:0.16452	validation-merro