In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import PolynomialFeatures
import warnings
import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression,f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb

pd.options.display.max_columns = 100

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train = pd.read_csv('train_features.csv')
test_features = pd.read_csv('test_features.csv')
train_labels = pd.read_csv('train_labels.csv')
sample_submission = pd.read_csv('sample_submission.csv')

train, test_features.shape, train_labels.shape, sample_submission.shape

(          id  amount_tsh date_recorded                    funder  gps_height  \
 0      69572      6000.0    2011-03-14                     Roman        1390   
 1       8776         0.0    2013-03-06                   Grumeti        1399   
 2      34310        25.0    2013-02-25              Lottery Club         686   
 3      67743         0.0    2013-01-28                    Unicef         263   
 4      19728         0.0    2011-07-13               Action In A           0   
 5       9944        20.0    2011-03-13       Mkinga Distric Coun           0   
 6      19816         0.0    2012-10-01                      Dwsp           0   
 7      54551         0.0    2012-10-09                     Rwssp           0   
 8      53934         0.0    2012-11-03                  Wateraid           0   
 9      46144         0.0    2011-08-03               Isingiro Ho           0   
 10     49056         0.0    2011-02-20                   Private          62   
 11     50409       200.0   

In [3]:
train['gps_height'].replace(0.0,np.nan,inplace=True)
test_features['gps_height'].replace(0.0,np.nan,inplace=True)
train['population'].replace(0.0,np.nan,inplace=True)
test_features['population'].replace(0.0,np.nan,inplace=True)
train['amount_tsh'].replace(0.0,np.nan,inplace=True)
test_features['amount_tsh'].replace(0.0,np.nan,inplace=True)
train['latitude'].replace(0.0,np.nan,inplace=True)
test_features['latitude'].replace(0.0,np.nan,inplace=True)
train['longitude'].replace(0.0,np.nan,inplace=True)
test_features['longitude'].replace(0.0,np.nan,inplace=True)
train['construction_year'].replace(0.0,np.nan,inplace=True)
test_features['construction_year'].replace(0.0,np.nan,inplace=True)

train.isna().sum()

id                           0
amount_tsh               41639
date_recorded                0
funder                    3635
gps_height               20438
installer                 3655
longitude                 1812
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population               21381
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year        20709
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [4]:
train['gps_height'].fillna(train.groupby(['region', 'district_code'])['gps_height'].transform('mean'), inplace=True)
train['gps_height'].fillna(train.groupby(['region'])['gps_height'].transform('mean'), inplace=True)
train['gps_height'].fillna(train['gps_height'].mean(), inplace=True)
train['population'].fillna(train.groupby(['region', 'district_code'])['population'].transform('median'), inplace=True)
train['population'].fillna(train.groupby(['region'])['population'].transform('median'), inplace=True)
train['population'].fillna(train['population'].median(), inplace=True)
train['amount_tsh'].fillna(train.groupby(['region', 'district_code'])['amount_tsh'].transform('median'), inplace=True)
train['amount_tsh'].fillna(train.groupby(['region'])['amount_tsh'].transform('median'), inplace=True)
train['amount_tsh'].fillna(train['amount_tsh'].median(), inplace=True)
train['latitude'].fillna(train.groupby(['region', 'district_code'])['latitude'].transform('mean'), inplace=True)
train['longitude'].fillna(train.groupby(['region', 'district_code'])['longitude'].transform('mean'), inplace=True)
train['longitude'].fillna(train.groupby(['region'])['longitude'].transform('mean'), inplace=True)
train['construction_year'].fillna(train.groupby(['region', 'district_code'])['construction_year'].transform('median'), inplace=True)
train['construction_year'].fillna(train.groupby(['region'])['construction_year'].transform('median'), inplace=True)
train['construction_year'].fillna(train.groupby(['district_code'])['construction_year'].transform('median'), inplace=True)
train['construction_year'].fillna(train['construction_year'].median(), inplace=True)

test_features['gps_height'].fillna(test_features.groupby(['region', 'district_code'])['gps_height'].transform('mean'), inplace=True)
test_features['gps_height'].fillna(test_features.groupby(['region'])['gps_height'].transform('mean'), inplace=True)
test_features['gps_height'].fillna(test_features['gps_height'].mean(), inplace=True)
test_features['population'].fillna(test_features.groupby(['region', 'district_code'])['population'].transform('median'), inplace=True)
test_features['population'].fillna(test_features.groupby(['region'])['population'].transform('median'), inplace=True)
test_features['population'].fillna(test_features['population'].median(), inplace=True)
test_features['amount_tsh'].fillna(test_features.groupby(['region', 'district_code'])['amount_tsh'].transform('median'), inplace=True)
test_features['amount_tsh'].fillna(test_features.groupby(['region'])['amount_tsh'].transform('median'), inplace=True)
test_features['amount_tsh'].fillna(test_features['amount_tsh'].median(), inplace=True)
test_features['latitude'].fillna(test_features.groupby(['region', 'district_code'])['latitude'].transform('mean'), inplace=True)
test_features['longitude'].fillna(test_features.groupby(['region', 'district_code'])['longitude'].transform('mean'), inplace=True)
test_features['longitude'].fillna(test_features.groupby(['region'])['longitude'].transform('mean'), inplace=True)
test_features['construction_year'].fillna(test_features.groupby(['region', 'district_code'])['construction_year'].transform('median'), inplace=True)
test_features['construction_year'].fillna(test_features.groupby(['region'])['construction_year'].transform('median'), inplace=True)
test_features['construction_year'].fillna(test_features.groupby(['district_code'])['construction_year'].transform('median'), inplace=True)
test_features['construction_year'].fillna(test_features['construction_year'].median(), inplace=True)



In [5]:
train['age'] = (2019 - train['construction_year']).astype(int)
test_features['age'] = (2019 - test_features['construction_year']).astype(int)

train_days_since = np.array(train['date_recorded'].values, dtype='datetime64')
test_days_since = np.array(test_features['date_recorded'].values,dtype='datetime64')

train_birth = round(train['construction_year'])
test_birth = round(test_features['construction_year'])

train_years_since = np.datetime_as_string(train_days_since, unit='Y')
test_years_since = np.datetime_as_string(test_days_since, unit='Y')

train_years_since = train_years_since.astype(int)
test_years_since = test_years_since.astype(int)

in_train_years = []
in_test_years = []

for i in range(0,len(train_years_since)):
    x = train_years_since[i] - train_birth[i]
    in_train_years.append(x)
    

for i in range(0,len(test_years_since)):
    x = test_years_since[i] - test_birth[i]
    in_test_years.append(x)
    
train['years_until_record'] = in_train_years
test_features['years_until_record'] = in_test_years



In [6]:
train['funder'].fillna('other',inplace=True)
train['installer'].fillna('other',inplace=True)
train['subvillage'].fillna('other',inplace=True)
train['public_meeting'].fillna('other',inplace=True)
train['scheme_management'].fillna('other',inplace=True)
train['scheme_name'].fillna('other',inplace=True)
train['permit'].fillna('other',inplace=True)
test_features['funder'].fillna('other',inplace=True)
test_features['installer'].fillna('other',inplace=True)
test_features['subvillage'].fillna('other',inplace=True)
test_features['public_meeting'].fillna('other',inplace=True)
test_features['scheme_management'].fillna('other',inplace=True)
test_features['scheme_name'].fillna('other',inplace=True)
test_features['permit'].fillna('other',inplace=True)

In [7]:
#functions for feature replacement
def top_funders(df):  
    if df['funder']=='Government Of Tanzania':
        return 'gov'
    elif df['funder']=='Danida':
        return 'danida'
    elif df['funder']=='Hesawa':
        return 'hesawa'
    elif df['funder']=='Rwssp':
        return 'rwssp'
    elif df['funder']=='World Bank':
        return 'world_bank'    
    elif df['funder']=='Kkkt':
        return 'Kkkt'
    elif df['funder']=='World Vision':
        return 'World Vision'
    elif df['funder']=='Unicef':
        return 'Unicef'
    elif df['funder']=='Tasaf':
        return 'Tasaf'
    elif df['funder']=='District Council':
        return 'District Council'
    elif df['funder']=='Dhv':
        return 'DHV_Engineering'
    elif df['funder']=='Norad':
        return 'NORAD'
    elif df['funder']=='Private Individual':
        return 'Private'
    elif df['funder']=='Dwsp':
        return 'DWSP'
    elif df['funder']=='Germany Republi':
        return 'Germany'
    elif df['funder']=='Ministry of Water':
        return 'Ministry of Water'
    elif df['funder']=='Water':
        return 'Ministry of Water'
    elif df['funder']=='Lga':
        return 'lga'
    else:
        return 'other'
    
def top_installers(df):
    if df['installer']=='DWE':
        return 'dwe'
    elif df['installer']=='Government':
        return 'gov'
    elif df['installer']=='RWE':
        return 'rwe'
    elif df['installer']=='Commu':
        return 'commu'
    elif df['installer']=='DANIDA':
        return 'danida'
    elif df['installer']=='KKKT':
        return 'kkkt'
    elif df['installer']=='Hesawa':
        return 'hesawa'
    elif df['installer']=='TCRS':
        return 'tcrs'
    elif df['installer']=='Central government':
        return 'Central government'
    elif df['installer']=='CES':
        return 'CES'
    elif df['installer']=='Community':
        return 'commu'
    elif df['installer']=='HESAWA':
        return 'hesawa'
    elif df['installer']=='District Council':
        return 'district_council'
    elif df['installer']=='World vision':
        return 'world_vision'
    elif df['installer']=='TASAF':
        return 'tasaf'
    elif df['installer']=='LGA':
        return 'lga'
    else:
        return 'other' 


In [8]:
train['funder'] = train.apply(lambda x: top_funders(x),axis=1)
train['installer'] = train.apply(lambda x: top_installers(x),axis=1)
test_features['funder'] = test_features.apply(lambda x: top_funders(x),axis=1)
test_features['installer'] = test_features.apply(lambda x: top_installers(x),axis=1)

In [9]:
drop_these=[
    'date_recorded',
    'wpt_name',
    'recorded_by',
    'lga',
    'ward',
    'num_private',
    'subvillage',
    'waterpoint_type_group',
    'extraction_type_group',
    'extraction_type_class',
    'management',
    'payment',
    'quantity_group',
    'id',
    'source',
    'permit',
    'public_meeting',
    'scheme_name'
]
train.drop(columns=drop_these,inplace=True)
test_features.drop(columns=drop_these,inplace=True)

train.shape, test_features.shape

((59400, 24), (14358, 24))

# OKAY so i'm going to go ahead and attempt to stack my best scores on the public LB for Stage 1 stacking

### submission-016.csv (dart)
### submission-021.csv (gbrt)
### submission-022c.csv (rf)