In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import PolynomialFeatures
import warnings
import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression,f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb

pd.options.display.max_columns = 100

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train_features = pd.read_csv('train_features.csv')
test_features = pd.read_csv('test_features.csv')
train_labels = pd.read_csv('train_labels.csv')
sample_submission = pd.read_csv('sample_submission.csv')

train_features.shape, test_features.shape, train_labels.shape, sample_submission.shape

((59400, 40), (14358, 40), (59400, 2), (14358, 2))

In [3]:
train = train_features.copy()

In [4]:
train.describe(exclude=np.number)

Unnamed: 0,date_recorded,funder,installer,wpt_name,basin,subvillage,region,lga,ward,public_meeting,recorded_by,scheme_management,scheme_name,permit,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
count,59400,55765,55745,59400,59400,59029,59400,59400,59400,56066,59400,55523,31234,56344,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400
unique,356,1897,2145,37400,9,19287,21,125,2092,2,1,12,2696,2,18,13,7,12,5,7,7,8,6,5,5,10,7,3,7,6
top,2011-03-15,Government Of Tanzania,DWE,none,Lake Victoria,Madukani,Iringa,Njombe,Igosi,True,GeoData Consultants Ltd,VWC,K,True,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
freq,572,9084,17402,3563,10248,508,5294,2503,307,51011,59400,36793,682,38852,26780,26780,26780,40507,52490,25348,25348,50818,50818,33186,33186,17021,17021,45794,28522,34625


In [5]:
test_features.describe(exclude=np.number)

Unnamed: 0,date_recorded,funder,installer,wpt_name,basin,subvillage,region,lga,ward,public_meeting,recorded_by,scheme_management,scheme_name,permit,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
count,14358,13575,13570,14358,14358,14264,14358,14358,14358,13573,14358,13419,7519,13695,14358,14358,14358,14358,14358,14358,14358,14358,14358,14358,14358,14358,14358,14358,14358,14358
unique,331,960,1075,10615,9,8253,21,124,1934,2,1,11,1772,2,17,13,7,12,5,7,7,8,6,5,5,10,7,3,7,6
top,2011-03-16,Government Of Tanzania,DWE,none,Lake Victoria,Shuleni,Shinyanga,Njombe,Igosi,True,GeoData Consultants Ltd,VWC,Borehole,True,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,communal standpipe,communal standpipe
freq,137,2117,4162,822,2535,136,1258,611,79,12308,14358,8807,158,9442,6168,6168,6168,9780,12639,6098,6098,12237,12237,7997,7997,4211,4211,11127,6790,8260


In [6]:
train['gps_height'].replace(0.0,np.nan,inplace=True)
test_features['gps_height'].replace(0.0,np.nan,inplace=True)
train['population'].replace(0.0,np.nan,inplace=True)
test_features['population'].replace(0.0,np.nan,inplace=True)
train['amount_tsh'].replace(0.0,np.nan,inplace=True)
test_features['amount_tsh'].replace(0.0,np.nan,inplace=True)
train['latitude'].replace(0.0,np.nan,inplace=True)
test_features['latitude'].replace(0.0,np.nan,inplace=True)
train['longitude'].replace(0.0,np.nan,inplace=True)
test_features['longitude'].replace(0.0,np.nan,inplace=True)
train['construction_year'].replace(0.0,np.nan,inplace=True)
test_features['construction_year'].replace(0.0,np.nan,inplace=True)

train.isna().sum()

id                           0
amount_tsh               41639
date_recorded                0
funder                    3635
gps_height               20438
installer                 3655
longitude                 1812
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population               21381
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year        20709
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [7]:
train['gps_height'].fillna(train.groupby(['region', 'district_code'])['gps_height'].transform('mean'), inplace=True)
train['gps_height'].fillna(train.groupby(['region'])['gps_height'].transform('mean'), inplace=True)
train['gps_height'].fillna(train['gps_height'].mean(), inplace=True)
train['population'].fillna(train.groupby(['region', 'district_code'])['population'].transform('median'), inplace=True)
train['population'].fillna(train.groupby(['region'])['population'].transform('median'), inplace=True)
train['population'].fillna(train['population'].median(), inplace=True)
train['amount_tsh'].fillna(train.groupby(['region', 'district_code'])['amount_tsh'].transform('median'), inplace=True)
train['amount_tsh'].fillna(train.groupby(['region'])['amount_tsh'].transform('median'), inplace=True)
train['amount_tsh'].fillna(train['amount_tsh'].median(), inplace=True)
train['latitude'].fillna(train.groupby(['region', 'district_code'])['latitude'].transform('mean'), inplace=True)
train['longitude'].fillna(train.groupby(['region', 'district_code'])['longitude'].transform('mean'), inplace=True)
train['longitude'].fillna(train.groupby(['region'])['longitude'].transform('mean'), inplace=True)
train['construction_year'].fillna(train.groupby(['region', 'district_code'])['construction_year'].transform('median'), inplace=True)
train['construction_year'].fillna(train.groupby(['region'])['construction_year'].transform('median'), inplace=True)
train['construction_year'].fillna(train.groupby(['district_code'])['construction_year'].transform('median'), inplace=True)
train['construction_year'].fillna(train['construction_year'].median(), inplace=True)

test_features['gps_height'].fillna(test_features.groupby(['region', 'district_code'])['gps_height'].transform('mean'), inplace=True)
test_features['gps_height'].fillna(test_features.groupby(['region'])['gps_height'].transform('mean'), inplace=True)
test_features['gps_height'].fillna(test_features['gps_height'].mean(), inplace=True)
test_features['population'].fillna(test_features.groupby(['region', 'district_code'])['population'].transform('median'), inplace=True)
test_features['population'].fillna(test_features.groupby(['region'])['population'].transform('median'), inplace=True)
test_features['population'].fillna(test_features['population'].median(), inplace=True)
test_features['amount_tsh'].fillna(test_features.groupby(['region', 'district_code'])['amount_tsh'].transform('median'), inplace=True)
test_features['amount_tsh'].fillna(test_features.groupby(['region'])['amount_tsh'].transform('median'), inplace=True)
test_features['amount_tsh'].fillna(test_features['amount_tsh'].median(), inplace=True)
test_features['latitude'].fillna(test_features.groupby(['region', 'district_code'])['latitude'].transform('mean'), inplace=True)
test_features['longitude'].fillna(test_features.groupby(['region', 'district_code'])['longitude'].transform('mean'), inplace=True)
test_features['longitude'].fillna(test_features.groupby(['region'])['longitude'].transform('mean'), inplace=True)
test_features['construction_year'].fillna(test_features.groupby(['region', 'district_code'])['construction_year'].transform('median'), inplace=True)
test_features['construction_year'].fillna(test_features.groupby(['region'])['construction_year'].transform('median'), inplace=True)
test_features['construction_year'].fillna(test_features.groupby(['district_code'])['construction_year'].transform('median'), inplace=True)
test_features['construction_year'].fillna(test_features['construction_year'].median(), inplace=True)



In [8]:
print(np.min(train['amount_tsh']))
print(np.max(train['amount_tsh']))
print(np.min(train['gps_height']))
print(np.max(train['gps_height']))
print(np.min(train['population']))
print(np.max(train['population']))

0.2
350000.0
-90.0
2770.0
1.0
30500.0


In [9]:
print(np.min(test_features['amount_tsh']))
print(np.max(test_features['amount_tsh']))
print(np.min(test_features['gps_height']))
print(np.max(test_features['gps_height']))
print(np.min(test_features['population']))
print(np.max(test_features['population']))

0.2
200000.0
-57.0
2777.0
1.0
11469.0


In [10]:
train_tsh_scaler = MinMaxScaler(feature_range=(0,200))
train_gps_scaler = MinMaxScaler(feature_range=(0,3))
train_pop_scaler = MinMaxScaler(feature_range=(0,30))
test_tsh_scaler = MinMaxScaler(feature_range=(0,200))
test_gps_scaler = MinMaxScaler(feature_range=(0,3))
test_pop_scaler = MinMaxScaler(feature_range=(0,30))

train['amount_tsh'] = train_tsh_scaler.fit_transform(train['amount_tsh'].values.reshape(-1,1))
train['gps_height'] = train_gps_scaler.fit_transform(train['gps_height'].values.reshape(-1,1))
train['population'] = train_gps_scaler.fit_transform(train['population'].values.reshape(-1,1))

test_features['amount_tsh'] = test_tsh_scaler.fit_transform(test_features['amount_tsh'].values.reshape(-1,1))
test_features['gps_height'] = test_gps_scaler.fit_transform(test_features['gps_height'].values.reshape(-1,1))
test_features['population'] = test_gps_scaler.fit_transform(test_features['population'].values.reshape(-1,1))

In [11]:
train['age'] = (2019 - train['construction_year']).astype(int)
test_features['age'] = (2019 - test_features['construction_year']).astype(int)

In [12]:
train_days_since = np.array(train['date_recorded'].values, dtype='datetime64')
test_days_since = np.array(test_features['date_recorded'].values,dtype='datetime64')

train_birth = round(train['construction_year'])
test_birth = round(test_features['construction_year'])

train_years_since = np.datetime_as_string(train_days_since, unit='Y')
test_years_since = np.datetime_as_string(test_days_since, unit='Y')

train_years_since = train_years_since.astype(int)
test_years_since = test_years_since.astype(int)

in_train_years = []
in_test_years = []

for i in range(0,len(train_years_since)):
    x = train_years_since[i] - train_birth[i]
    in_train_years.append(x)
    

for i in range(0,len(test_years_since)):
    x = test_years_since[i] - test_birth[i]
    in_test_years.append(x)
    
train['years_until_record'] = in_train_years
test_features['years_until_record'] = in_test_years

In [13]:
train['funder'].fillna('other',inplace=True)
train['installer'].fillna('other',inplace=True)
train['subvillage'].fillna('other',inplace=True)
train['public_meeting'].fillna('other',inplace=True)
train['scheme_management'].fillna('other',inplace=True)
train['scheme_name'].fillna('other',inplace=True)
train['permit'].fillna('other',inplace=True)
test_features['funder'].fillna('other',inplace=True)
test_features['installer'].fillna('other',inplace=True)
test_features['subvillage'].fillna('other',inplace=True)
test_features['public_meeting'].fillna('other',inplace=True)
test_features['scheme_management'].fillna('other',inplace=True)
test_features['scheme_name'].fillna('other',inplace=True)
test_features['permit'].fillna('other',inplace=True)

In [14]:
train.describe(exclude=np.number)

Unnamed: 0,date_recorded,funder,installer,wpt_name,basin,subvillage,region,lga,ward,public_meeting,recorded_by,scheme_management,scheme_name,permit,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
count,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400
unique,356,1898,2146,37400,9,19288,21,125,2092,3,1,13,2697,3,18,13,7,12,5,7,7,8,6,5,5,10,7,3,7,6
top,2011-03-15,Government Of Tanzania,DWE,none,Lake Victoria,Madukani,Iringa,Njombe,Igosi,True,GeoData Consultants Ltd,VWC,other,True,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
freq,572,9084,17402,3563,10248,508,5294,2503,307,51011,59400,36793,28166,38852,26780,26780,26780,40507,52490,25348,25348,50818,50818,33186,33186,17021,17021,45794,28522,34625


In [16]:
training = pd.merge(train,train_labels)
training.shape

(59400, 43)

In [19]:
#going to call it train again for simplicity

train = training.copy()

train['funder'].value_counts().head(30)

Government Of Tanzania    9084
other                     3635
Danida                    3114
Hesawa                    2202
Rwssp                     1374
World Bank                1349
Kkkt                      1287
World Vision              1246
Unicef                    1057
Tasaf                      877
District Council           843
Dhv                        829
Private Individual         826
Dwsp                       811
0                          777
Norad                      765
Germany Republi            610
Tcrs                       602
Ministry Of Water          590
Water                      583
Dwe                        484
Netherlands                470
Hifab                      450
Adb                        448
Lga                        442
Amref                      425
Fini Water                 393
Oxfam                      359
Wateraid                   333
Rc Church                  321
Name: funder, dtype: int64

In [21]:
train['installer'].value_counts().head(20)

DWE                   17402
other                  3655
Government             1825
RWE                    1206
Commu                  1060
DANIDA                 1050
KKKT                    898
Hesawa                  840
0                       777
TCRS                    707
Central government      622
CES                     610
Community               553
DANID                   552
District Council        551
HESAWA                  539
LGA                     408
World vision            408
WEDECO                  397
TASAF                   396
Name: installer, dtype: int64

In [22]:
train['scheme_management'].value_counts().head(10)

VWC                 36793
WUG                  5206
other                3877
Water authority      3153
WUA                  2883
Water Board          2748
Parastatal           1680
Private operator     1063
Company              1061
Other                 766
Name: scheme_management, dtype: int64

In [20]:
#functions for feature replacement
def top_funders(df):  
    if df['funder']=='Government Of Tanzania':
        return 'gov'
    elif df['funder']=='Danida':
        return 'danida'
    elif df['funder']=='Hesawa':
        return 'hesawa'
    elif df['funder']=='Rwssp':
        return 'rwssp'
    elif df['funder']=='World Bank':
        return 'world_bank'    
    elif df['funder']=='Kkkt':
        return 'Kkkt'
    elif df['funder']=='World Vision':
        return 'World Vision'
    elif df['funder']=='Unicef':
        return 'Unicef'
    elif df['funder']=='Tasaf':
        return 'Tasaf'
    elif df['funder']=='District Council':
        return 'District Council'
    elif df['funder']=='Dhv':
        return 'DHV_Engineering'
    elif df['funder']=='Norad':
        return 'NORAD'
    elif df['funder']=='Private Individual':
        return 'Private'
    elif df['funder']=='Dwsp':
        return 'DWSP'
    elif df['funder']=='Germany Republi':
        return 'Germany'
    elif df['funder']=='Ministry of Water':
        return 'Ministry of Water'
    elif df['funder']=='Water':
        return 'Ministry of Water'
    elif df['funder']=='Lga':
        return 'lga'
    else:
        return 'other'
    
def top_installers(df):
    if df['installer']=='DWE':
        return 'dwe'
    elif df['installer']=='Government':
        return 'gov'
    elif df['installer']=='RWE':
        return 'rwe'
    elif df['installer']=='Commu':
        return 'commu'
    elif df['installer']=='DANIDA':
        return 'danida'
    elif df['installer']=='KKKT':
        return 'kkkt'
    elif df['installer']=='Hesawa':
        return 'hesawa'
    elif df['installer']=='TCRS':
        return 'tcrs'
    elif df['installer']=='Central government':
        return 'Central government'
    elif df['installer']=='CES':
        return 'CES'
    elif df['installer']=='Community':
        return 'commu'
    elif df['installer']=='HESAWA':
        return 'hesawa'
    elif df['installer']=='District Council':
        return 'district_council'
    elif df['installer']=='World vision':
        return 'world_vision'
    elif df['installer']=='TASAF':
        return 'tasaf'
    elif df['installer']=='LGA':
        return 'lga'
    else:
        return 'other' 


In [23]:
train['funder'] = train.apply(lambda x: top_funders(x),axis=1)
train['installer'] = train.apply(lambda x: top_installers(x),axis=1)
test_features['funder'] = test_features.apply(lambda x: top_funders(x),axis=1)
test_features['installer'] = test_features.apply(lambda x: top_installers(x),axis=1)

In [29]:
train.describe(exclude=np.number)

Unnamed: 0,date_recorded,funder,installer,wpt_name,basin,subvillage,region,lga,ward,public_meeting,recorded_by,scheme_management,scheme_name,permit,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
count,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400
unique,356,18,15,37400,9,19288,21,125,2092,3,1,13,2697,3,18,13,7,12,5,7,7,8,6,5,5,10,7,3,7,6,3
top,2011-03-15,other,other,none,Lake Victoria,Madukani,Iringa,Njombe,Igosi,True,GeoData Consultants Ltd,VWC,other,True,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
freq,572,32101,30325,3563,10248,508,5294,2503,307,51011,59400,36793,28166,38852,26780,26780,26780,40507,52490,25348,25348,50818,50818,33186,33186,17021,17021,45794,28522,34625,32259


In [30]:
train['extraction_type'].value_counts()

gravity                      26780
nira/tanira                   8154
other                         6430
submersible                   4764
swn 80                        3670
mono                          2865
india mark ii                 2400
afridev                       1770
ksb                           1415
other - rope pump              451
other - swn 81                 229
windmill                       117
india mark iii                  98
cemo                            90
other - play pump               85
walimi                          48
climax                          32
other - mkulima/shinyanga        2
Name: extraction_type, dtype: int64

In [32]:
train['source'].value_counts()

spring                  17021
shallow well            16824
machine dbh             11075
river                    9612
rainwater harvesting     2295
hand dtw                  874
lake                      765
dam                       656
other                     212
unknown                    66
Name: source, dtype: int64

In [37]:
train['waterpoint_type'].value_counts()

communal standpipe             28522
hand pump                      17488
other                           6380
communal standpipe multiple     6103
improved spring                  784
cattle trough                    116
dam                                7
Name: waterpoint_type, dtype: int64

In [39]:
train['waterpoint_type_group'].value_counts()

communal standpipe    34625
hand pump             17488
other                  6380
improved spring         784
cattle trough           116
dam                       7
Name: waterpoint_type_group, dtype: int64

In [43]:
train['quality_group'].value_counts()

good        50818
salty        5195
unknown      1876
milky         804
colored       490
fluoride      217
Name: quality_group, dtype: int64

In [44]:
train['source'].value_counts()

spring                  17021
shallow well            16824
machine dbh             11075
river                    9612
rainwater harvesting     2295
hand dtw                  874
lake                      765
dam                       656
other                     212
unknown                    66
Name: source, dtype: int64

In [45]:
drop_these=[
    'date_recorded',
    'wpt_name',
    'recorded_by',
    'lga',
    'ward',
    'num_private',
    'subvillage',
    'basin',
    'waterpoint_type_group',
    'extraction_type_group',
    'extraction_type_class',
    'management',
    'payment',
    'quantity',
    'id',
    'source',
]
train.drop(columns=drop_these,inplace=True)
test_features.drop(columns=drop_these,inplace=True)

train.shape, test_features.shape

((59400, 27), (14358, 26))

In [46]:
train.drop(columns=['status_group'],inplace=True)

In [47]:
train.shape, test_features.shape

((59400, 26), (14358, 26))

In [48]:
oe = ce.OrdinalEncoder(verbose=1,mapping=None,cols=None,drop_invariant=True,
                       return_df=True, impute_missing=True,handle_unknown='ignore')

train_features = oe.fit_transform(train)
test_features = oe.fit_transform(test_features)

train_features.shape, test_features.shape

((59400, 26), (14358, 26))

In [76]:
X_train = train_features
X_test = test_features
y_train = train_labels['status_group']
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

lgb_train = lgb.Dataset(X_train, label=y_train)

params = {'boosting_type': 'gbrt',
          'max_depth' : -1,
          'num_threads': -1,
          'objective': 'multiclass',
          'num_leaves': 400,
          'num_iterations':2000,
          'learning_rate': 0.01,
          'max_bin': 888,
          'subsample_for_bin': 200000,
          'subsample': 0.8,
          'subsample_freq': 1,
          'colsample_bytree': 1,
          'reg_alpha': 2,
          'reg_lambda': 5,
          'min_split_gain': 0.85,
          'min_child_weight': 5,
          'min_child_samples': 50,
          'num_class' : 3,
          'metric' : 'multi_logloss'}

model = lgb.LGBMClassifier(
    boosting_type = params['boosting_type'],
    num_leaves = params['num_leaves'],
    objective = params['objective'],
    num_threads = params['num_threads'],
    max_depth = params['max_depth'],
    max_bin = params['max_bin'],
    colsample_bytree = params['colsample_bytree'],
    subsample_for_bin = params['subsample_for_bin'],
    subsample = params['subsample'],
    subsample_freq = params['subsample_freq'],
    reg_alpha = params['reg_alpha'],
    reg_lambda = params['reg_lambda'],
    min_split_gain = params['min_split_gain'],
    min_child_weight = params['min_child_weight'],
    min_child_samples = params['min_child_samples'],
    num_class = params['num_class'],
    metric = params['metric']
)

model.fit(X_train,y_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


LGBMClassifier(boosting_type='gbrt', class_weight=None, colsample_bytree=1,
        importance_type='split', learning_rate=0.1, max_bin=888,
        max_depth=-1, metric='multi_logloss', min_child_samples=50,
        min_child_weight=5, min_split_gain=0.85, n_estimators=100,
        n_jobs=-1, num_class=3, num_leaves=400, num_threads=-1,
        objective='multiclass', random_state=None, reg_alpha=2,
        reg_lambda=5, silent=True, subsample=0.8, subsample_for_bin=200000,
        subsample_freq=1)

In [77]:
submission=sample_submission.copy()
submission['status_group'] = model.predict(X_test)
submission['status_group'].value_counts()

functional                 9251
non functional             4795
functional needs repair     312
Name: status_group, dtype: int64

In [78]:
submission.head()

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [73]:
submission.to_csv('submission-022.csv',index=False)

In [79]:
X_train = train_features
X_test = test_features
y_train = train_labels['status_group']
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

lgb_train = lgb.Dataset(X_train, label=y_train)

params = {'boosting_type': 'rf',
          'max_depth' : -1,
          'num_threads': -1,
          'objective': 'multiclass',
          'num_leaves': 800,
          'num_iterations':4000,
          'learning_rate': 0.001,
          'max_bin': 369,
          'subsample_for_bin': 200000,
          'subsample': 0.8,
          'subsample_freq': 1,
          'colsample_bytree': 1,
          'reg_alpha': 2,
          'reg_lambda': 5,
          'min_split_gain': 0.85,
          'min_child_weight': 5,
          'min_child_samples': 50,
          'num_class' : 3,
          'metric' : 'multi_logloss'}

model = lgb.LGBMClassifier(
    boosting_type = params['boosting_type'],
    num_leaves = params['num_leaves'],
    objective = params['objective'],
    num_threads = params['num_threads'],
    max_depth = params['max_depth'],
    max_bin = params['max_bin'],
    colsample_bytree = params['colsample_bytree'],
    subsample_for_bin = params['subsample_for_bin'],
    subsample = params['subsample'],
    subsample_freq = params['subsample_freq'],
    reg_alpha = params['reg_alpha'],
    reg_lambda = params['reg_lambda'],
    min_split_gain = params['min_split_gain'],
    min_child_weight = params['min_child_weight'],
    min_child_samples = params['min_child_samples'],
    num_class = params['num_class'],
    metric = params['metric']
)

model.fit(X_train,y_train)

submission=sample_submission.copy()
submission['status_group'] = model.predict(X_test)
submission['status_group'].value_counts()


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


functional                 9736
non functional             4270
functional needs repair     352
Name: status_group, dtype: int64

In [80]:
submission.to_csv('submission-022a.csv',index=False)

In [None]:
X_train = train_features
X_test = test_features
y_train = train_labels['status_group']
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

lgb_train = lgb.Dataset(X_train, label=y_train)

params = {'boosting_type': 'dart',
          'max_depth' : -1,
          'num_threads': -1,
          'objective': 'multiclass',
          'num_leaves': 800,
          'num_iterations':2000,
          'learning_rate': 0.001,
          'max_bin': 369,
          'subsample_for_bin': 200000,
          'subsample': 0.8,
          'subsample_freq': 1,
          'colsample_bytree': 1,
          'reg_alpha': 2,
          'reg_lambda': 5,
          'min_split_gain': 0.85,
          'min_child_weight': 5,
          'min_child_samples': 50,
          'num_class' : 3,
          'metric' : 'multi_logloss'}

model = lgb.LGBMClassifier(
    boosting_type = params['boosting_type'],
    num_leaves = params['num_leaves'],
    objective = params['objective'],
    num_threads = params['num_threads'],
    max_depth = params['max_depth'],
    max_bin = params['max_bin'],
    colsample_bytree = params['colsample_bytree'],
    subsample_for_bin = params['subsample_for_bin'],
    subsample = params['subsample'],
    subsample_freq = params['subsample_freq'],
    reg_alpha = params['reg_alpha'],
    reg_lambda = params['reg_lambda'],
    min_split_gain = params['min_split_gain'],
    min_child_weight = params['min_child_weight'],
    min_child_samples = params['min_child_samples'],
    num_class = params['num_class'],
    metric = params['metric']
)

model.fit(X_train,y_train)

submission=sample_submission.copy()
submission['status_group'] = model.predict(X_test)
submission['status_group'].value_counts()
