In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import PolynomialFeatures
import warnings
import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression,f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb

pd.options.display.max_columns = 100
pd.options.display.max_seq_items = 300
pd.options.display.max_rows = 300

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train = pd.read_csv('train_features.csv')
test_features = pd.read_csv('test_features.csv')
train_labels = pd.read_csv('train_labels.csv')
sample_submission = pd.read_csv('sample_submission.csv')

train.shape, test_features.shape, train_labels.shape, sample_submission.shape

((59400, 40), (14358, 40), (59400, 2), (14358, 2))

In [3]:
train['gps_height'].replace(0.0,np.nan,inplace=True)
test_features['gps_height'].replace(0.0,np.nan,inplace=True)
train['population'].replace(0.0,np.nan,inplace=True)
test_features['population'].replace(0.0,np.nan,inplace=True)
train['amount_tsh'].replace(0.0,np.nan,inplace=True)
test_features['amount_tsh'].replace(0.0,np.nan,inplace=True)
train['latitude'].replace(0.0,np.nan,inplace=True)
test_features['latitude'].replace(0.0,np.nan,inplace=True)
train['longitude'].replace(0.0,np.nan,inplace=True)
test_features['longitude'].replace(0.0,np.nan,inplace=True)
train['construction_year'].replace(0.0,np.nan,inplace=True)
test_features['construction_year'].replace(0.0,np.nan,inplace=True)

train.isna().sum()

id                           0
amount_tsh               41639
date_recorded                0
funder                    3635
gps_height               20438
installer                 3655
longitude                 1812
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population               21381
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year        20709
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [4]:
train['gps_height'].fillna(train.groupby(['region', 'district_code'])['gps_height'].transform('mean'), inplace=True)
train['gps_height'].fillna(train.groupby(['region'])['gps_height'].transform('mean'), inplace=True)
train['gps_height'].fillna(train['gps_height'].mean(), inplace=True)
train['population'].fillna(train.groupby(['region', 'district_code'])['population'].transform('median'), inplace=True)
train['population'].fillna(train.groupby(['region'])['population'].transform('median'), inplace=True)
train['population'].fillna(train['population'].median(), inplace=True)
train['amount_tsh'].fillna(train.groupby(['region', 'district_code'])['amount_tsh'].transform('median'), inplace=True)
train['amount_tsh'].fillna(train.groupby(['region'])['amount_tsh'].transform('median'), inplace=True)
train['amount_tsh'].fillna(train['amount_tsh'].median(), inplace=True)
train['latitude'].fillna(train.groupby(['region', 'district_code'])['latitude'].transform('mean'), inplace=True)
train['longitude'].fillna(train.groupby(['region', 'district_code'])['longitude'].transform('mean'), inplace=True)
train['longitude'].fillna(train.groupby(['region'])['longitude'].transform('mean'), inplace=True)
train['construction_year'].fillna(train.groupby(['region', 'district_code'])['construction_year'].transform('median'), inplace=True)
train['construction_year'].fillna(train.groupby(['region'])['construction_year'].transform('median'), inplace=True)
train['construction_year'].fillna(train.groupby(['district_code'])['construction_year'].transform('median'), inplace=True)
train['construction_year'].fillna(train['construction_year'].median(), inplace=True)

test_features['gps_height'].fillna(test_features.groupby(['region', 'district_code'])['gps_height'].transform('mean'), inplace=True)
test_features['gps_height'].fillna(test_features.groupby(['region'])['gps_height'].transform('mean'), inplace=True)
test_features['gps_height'].fillna(test_features['gps_height'].mean(), inplace=True)
test_features['population'].fillna(test_features.groupby(['region', 'district_code'])['population'].transform('median'), inplace=True)
test_features['population'].fillna(test_features.groupby(['region'])['population'].transform('median'), inplace=True)
test_features['population'].fillna(test_features['population'].median(), inplace=True)
test_features['amount_tsh'].fillna(test_features.groupby(['region', 'district_code'])['amount_tsh'].transform('median'), inplace=True)
test_features['amount_tsh'].fillna(test_features.groupby(['region'])['amount_tsh'].transform('median'), inplace=True)
test_features['amount_tsh'].fillna(test_features['amount_tsh'].median(), inplace=True)
test_features['latitude'].fillna(test_features.groupby(['region', 'district_code'])['latitude'].transform('mean'), inplace=True)
test_features['longitude'].fillna(test_features.groupby(['region', 'district_code'])['longitude'].transform('mean'), inplace=True)
test_features['longitude'].fillna(test_features.groupby(['region'])['longitude'].transform('mean'), inplace=True)
test_features['construction_year'].fillna(test_features.groupby(['region', 'district_code'])['construction_year'].transform('median'), inplace=True)
test_features['construction_year'].fillna(test_features.groupby(['region'])['construction_year'].transform('median'), inplace=True)
test_features['construction_year'].fillna(test_features.groupby(['district_code'])['construction_year'].transform('median'), inplace=True)
test_features['construction_year'].fillna(test_features['construction_year'].median(), inplace=True)



In [5]:
train['age'] = (2019 - train['construction_year']).astype(int)
test_features['age'] = (2019 - test_features['construction_year']).astype(int)

train_days_since = np.array(train['date_recorded'].values, dtype='datetime64')
test_days_since = np.array(test_features['date_recorded'].values,dtype='datetime64')

train_birth = round(train['construction_year'])
test_birth = round(test_features['construction_year'])

train_years_since = np.datetime_as_string(train_days_since, unit='Y')
test_years_since = np.datetime_as_string(test_days_since, unit='Y')

train_years_since = train_years_since.astype(int)
test_years_since = test_years_since.astype(int)

in_train_years = []
in_test_years = []

for i in range(0,len(train_years_since)):
    x = train_years_since[i] - train_birth[i]
    in_train_years.append(x)
    

for i in range(0,len(test_years_since)):
    x = test_years_since[i] - test_birth[i]
    in_test_years.append(x)
    
train['years_until_record'] = in_train_years
test_features['years_until_record'] = in_test_years



In [6]:
train['funder'].fillna('other',inplace=True)
train['installer'].fillna('other',inplace=True)
train['subvillage'].fillna('other',inplace=True)
train['public_meeting'].fillna('other',inplace=True)
train['scheme_management'].fillna('other',inplace=True)
train['scheme_name'].fillna('other',inplace=True)
train['permit'].fillna('other',inplace=True)
test_features['funder'].fillna('other',inplace=True)
test_features['installer'].fillna('other',inplace=True)
test_features['subvillage'].fillna('other',inplace=True)
test_features['public_meeting'].fillna('other',inplace=True)
test_features['scheme_management'].fillna('other',inplace=True)
test_features['scheme_name'].fillna('other',inplace=True)
test_features['permit'].fillna('other',inplace=True)

In [7]:
train = pd.merge(train,train_labels)

In [8]:
def top_installers(x):
    
    unknown = ['0', 'unknown']
     
    government = ['government ', 'government', 'dwe', 'hesawa', 'rwe', 'central government', 'lga',
                 'district council', 'gover', 'gove', 'gov', 'district water department',
                 'sengerema water department', 'distri', 'centr', 'distric water department',
                 'tasaf']

    community = ['community', 'commu', 'villagers', 'twesa']
    
    
    religious = ['church of disciples', 'kkkt', 'world vision', 'rc church', 'rc', 'tcrs',
                'dmdd']
    
    international = ['norad', 'fini water', 'danida', 'danid', 'ces', 'kuwait',
                    'finw']
    
    private = ['private', 'privat', 'kiliwater', 'wedeco']
        
    aid = ['roman', 'amref', 'world bank', 'unicef', 'oxfam']

    if x in unknown:
        return 'unknown'
    
    if x in government:
        return 'government'

    if x in community:
        return 'community'

    if x in religious:
        return 'religious'
    
    if x in international:
        return 'international'

    if x in private:
        return 'private'

    if x in aid:
        return 'aid'
    
    return 'other'

def top_funders(x):
    
    unknown = ['0', 'unknown','no']
     
    government = ['government ', 'government', 'dwe', 'hesawa', 'rwe', 'central government', 'lga',
                 'district council', 'gover', 'gove', 'gov', 'district water department',
                 'sengerema water department', 'distri', 'centr', 'distric water department',
                 'tasaf', 'government of tanzania','ministry Of water','water','lawatefuka water supply']

    community = ['community', 'commu', 'villagers', 'twesa']
    
    
    religious = ['church of disciples', 'kkkt', 'world vision', 'rc church', 'rc', 'tcrs',
                'dmdd','mission','kkkkt_makwale']
    
    international = ['norad', 'fini water', 'danida', 'danid', 'ces', 'kuwait',
                    'finw','netherlands','germany republi','jaica','hifab','dwsp',
                    'amref','jica','shipo','nethalan', 'swedish']
    
    private = ['private', 'privat', 'kiliwater', 'wedeco']
        
    aid = ['roman', 'amref', 'world bank', 'unicef', 'oxfam','rwssp','wateraid',
           'rural water supply and sanitat','adb','oxfarm','dh','rc','go','concern world wide']

    if x in unknown:
        return 'unknown'
    
    if x in government:
        return 'government'

    if x in community:
        return 'community'

    if x in religious:
        return 'religious'
    
    if x in international:
        return 'international'

    if x in private:
        return 'private'

    if x in aid:
        return 'aid'
    
    return 'other'


In [9]:
train['funder'] = train['funder'].str.lower().apply(lambda x: top_funders(x))
train['installer'] = train['installer'].str.lower().apply(lambda x: top_installers(x))
test_features['funder'] = test_features['funder'].str.lower().apply(lambda x: top_funders(x))
test_features['installer'] = test_features['installer'].str.lower().apply(lambda x: top_installers(x))

In [10]:
train.describe(exclude=np.number)

Unnamed: 0,date_recorded,funder,installer,wpt_name,basin,subvillage,region,lga,ward,public_meeting,recorded_by,scheme_management,scheme_name,permit,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
count,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400
unique,356,8,8,37400,9,19288,21,125,2092,3,1,13,2697,3,18,13,7,12,5,7,7,8,6,5,5,10,7,3,7,6,3
top,2011-03-15,other,government,none,Lake Victoria,Madukani,Iringa,Njombe,Igosi,True,GeoData Consultants Ltd,VWC,other,True,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
freq,572,24208,25907,3563,10248,508,5294,2503,307,51011,59400,36793,28166,38852,26780,26780,26780,40507,52490,25348,25348,50818,50818,33186,33186,17021,17021,45794,28522,34625,32259


In [11]:
drop_these=[
    'date_recorded',
    'wpt_name',
    'recorded_by',
    'lga',
    'ward',
    'num_private',
    'subvillage',
    'waterpoint_type_group',
    'extraction_type_group',
    'extraction_type_class',
    'payment',
    'quality_group',
    'quantity_group',
    'id',
    'source_type',
    'source_class',    
    'public_meeting',
    'scheme_name',
    'age',
    'region',
    'scheme_management',
    'management_group'
]
train.drop(columns=drop_these,inplace=True)
test_features.drop(columns=drop_these,inplace=True)

train.shape, test_features.shape

((59400, 21), (14358, 20))

In [12]:
train.head()

Unnamed: 0,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,region_code,district_code,population,construction_year,extraction_type,management,management_group,payment_type,water_quality,quantity,source,waterpoint_type,years_until_record,status_group
0,6000.0,aid,1390.0,aid,34.938093,-9.856322,Lake Nyasa,11,5,109.0,1999.0,gravity,vwc,user-group,annually,soft,enough,spring,communal standpipe,12.0,functional
1,200.0,other,1399.0,other,34.698766,-2.147466,Lake Victoria,20,2,280.0,2010.0,gravity,wug,user-group,never pay,soft,insufficient,rainwater harvesting,communal standpipe,3.0,functional
2,25.0,other,686.0,religious,37.460664,-3.821329,Pangani,21,4,250.0,2009.0,gravity,vwc,user-group,per bucket,soft,enough,dam,communal standpipe multiple,4.0,functional
3,50.0,aid,263.0,aid,38.486161,-11.155298,Ruvuma / Southern Coast,90,63,58.0,1986.0,submersible,vwc,user-group,never pay,soft,dry,machine dbh,communal standpipe multiple,27.0,non functional
4,250.0,other,1057.545585,other,31.130847,-1.825359,Lake Victoria,18,1,200.0,2003.0,gravity,other,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,8.0,functional


# OKAY so i'm going to go ahead and attempt to stack my best scores on the public LB for Stage 1 stacking

### submission-016.csv (dart)
### submission-021.csv (gbrt)
### submission-022c.csv (rf)

In [13]:
train.drop(columns='status_group',inplace=True)

In [15]:
oe = ce.OrdinalEncoder(verbose=1,mapping=None,cols=None,drop_invariant=True,
                       return_df=True, impute_missing=True,handle_unknown='ignore')

train = oe.fit_transform(train)
test_features = oe.fit_transform(test_features)

In [16]:
model=RandomForestClassifier(n_estimators=1000)
scaler=RobustScaler()

X_train = scaler.fit_transform(train)
X_test = scaler.fit_transform(test_features)
y_train = train_labels['status_group']

model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [22]:
model.score(X_train,y_train)

0.9966498316498317

In [19]:
submission=sample_submission.copy()
submission['status_group'] = model.predict(X_test)
submission.head()

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [18]:
submission.to_csv('submission-024.csv',index=False)

### per Mac's suggestion, just doing a simple n_estimators=1000 gets me a better score than anything lol...

In [40]:
scaler=RobustScaler()
X_train = scaler.fit_transform(train)
X_test = scaler.fit_transform(test_features)
y_train = train_labels['status_group']

params = {'boosting_type': 'rf',
          'max_depth' : -1,
          'num_threads': -1,
          'objective': 'multiclass',
          'num_leaves': 1000,
          'num_iterations': 6000,
          'learning_rate': 0.001,
          'max_bin': 1042,
          'subsample_for_bin': 200000,
          'subsample': 0.5,
          'subsample_freq': 2,
          'colsample_bytree': 1,
          'min_split_gain': 0.5,
          'min_child_weight': 5,
          'min_child_samples': 50,
          'num_class' : 3,
          'metric' : 'multi_logloss'}

model = lgb.LGBMClassifier(
    boosting_type = params['boosting_type'],
    num_leaves = params['num_leaves'],
    objective = params['objective'],
    num_threads = params['num_threads'],
    max_depth = params['max_depth'],
    max_bin = params['max_bin'],
    colsample_bytree = params['colsample_bytree'],
    subsample_for_bin = params['subsample_for_bin'],
    subsample = params['subsample'],
    subsample_freq = params['subsample_freq'],
    min_split_gain = params['min_split_gain'],
    min_child_weight = params['min_child_weight'],
    min_child_samples = params['min_child_samples'],
    num_class = params['num_class'],
    metric = params['metric']
)

# param_grid = {
#     'num_leaves': [500,1000],
#     'reg_alpha': [1,3],
#     'reg_lambda': [3,5],
# }

# gridsearch = GridSearchCV(model,param_grid=param_grid,cv=5,
#                           verbose=0, n_jobs=-1, scoring='roc_auc')

model.fit(X_train,y_train)

submission=sample_submission.copy()
submission['status_group'] = model.predict(X_test)
submission.head()

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,non functional
3,45559,non functional
4,49871,functional


In [41]:
submission.to_csv('submission-026c.csv',index=False)

In [42]:
s24 = pd.read_csv('submission-024.csv')
s25 = pd.read_csv('submission-025.csv')
s26 = pd.read_csv('submission-026.csv')
s26a = pd.read_csv('submission-026a.csv')
s26b = pd.read_csv('submission-026b.csv')
s26c = pd.read_csv('submission-026c.csv')

#rf
print(s24['status_group'].value_counts())
#dart
print(s25['status_group'].value_counts())
#gbrt
print(s26['status_group'].value_counts())
#gbrt larger leaves + iters
print(s26a['status_group'].value_counts())
#gbrt smaller leaves + iters
print(s26b['status_group'].value_counts())
#rf lots of leaves lots of iters
print(s26c['status_group'].value_counts())

functional                 8667
non functional             5263
functional needs repair     428
Name: status_group, dtype: int64
functional                 9278
non functional             4869
functional needs repair     211
Name: status_group, dtype: int64
functional                 8925
non functional             5084
functional needs repair     349
Name: status_group, dtype: int64
functional                 8978
non functional             5060
functional needs repair     320
Name: status_group, dtype: int64
functional                 8978
non functional             5060
functional needs repair     320
Name: status_group, dtype: int64
functional                 9573
non functional             4553
functional needs repair     232
Name: status_group, dtype: int64
