In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import PolynomialFeatures
import warnings
import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression,f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train_features = pd.read_csv('train_features.csv')
test_features = pd.read_csv('test_features.csv')
train_labels = pd.read_csv('train_labels.csv')
sample_submission = pd.read_csv('sample_submission.csv')

train_features.shape, test_features.shape, train_labels.shape, sample_submission.shape

((59400, 40), (14358, 40), (59400, 2), (14358, 2))

In [3]:
train = train_features.copy()

In [4]:
year_mean = train[train['construction_year']>0]['construction_year'].mean()
year_mean = round(year_mean)

test_year_mean = test_features[test_features['construction_year']>0]['construction_year'].mean()
test_year_mean = round(test_year_mean)


train.loc[train['construction_year']==0, 'construction_year'] = int(year_mean)
test_features.loc[test_features['construction_year']==0,'construction_year'] = int(test_year_mean)

In [5]:
import random

def random_std(year):
    
    std = year - random.randint(-10,10)
    
    return std    

def random_tsh(amount):
    
    std = amount + np.random.uniform(1062.35, 2957.82)
    return std

In [6]:
train_tsh_mean = train[train['amount_tsh']>0]['amount_tsh'].mean()
test_tsh_mean = test_features[test_features['amount_tsh']>0]['amount_tsh'].mean()

In [7]:
train.loc[train['construction_year']==1997, 'construction_year'].apply(random_std)
test_features.loc[test_features['construction_year']==1997, 'construction_year'].apply(random_std)
train.loc[train['amount_tsh']==0, 'amount_tsh'] = train_tsh_mean
test_features.loc[test_features['amount_tsh']==0, 'amount_tsh'] = test_tsh_mean

train.shape, test_features.shape

((59400, 40), (14358, 40))

In [8]:
train['funder'].fillna('?',inplace=True)
train['installer'].fillna('?',inplace=True)
train['subvillage'].fillna('?',inplace=True)
train['public_meeting'].fillna('?',inplace=True)
train['scheme_management'].fillna('?',inplace=True)
train['scheme_name'].fillna('?',inplace=True)
train['permit'].fillna('?',inplace=True)
test_features['funder'].fillna('?',inplace=True)
test_features['installer'].fillna('?',inplace=True)
test_features['subvillage'].fillna('?',inplace=True)
test_features['public_meeting'].fillna('?',inplace=True)
test_features['scheme_management'].fillna('?',inplace=True)
test_features['scheme_name'].fillna('?',inplace=True)
test_features['permit'].fillna('?',inplace=True)

In [9]:
train['age'] = (2019 - train['construction_year']).astype(int)
test_features['age'] = (2019 - test_features['construction_year']).astype(int)

In [9]:
train['management'].unique()

array(['vwc', 'wug', 'other', 'private operator', 'water board', 'wua',
       'company', 'water authority', 'parastatal', 'unknown',
       'other - school', 'trust'], dtype=object)

In [10]:
train['region_code'].unique()

array([11, 20, 21, 90, 18,  4, 17, 14, 60, 10,  3, 15, 19, 16, 80,  1,  6,
        2, 12, 13,  5,  7, 99, 24,  9,  8, 40])

In [11]:
train['date_recorded'].nunique()

356

In [12]:
type(train['date_recorded'][2])

str

In [13]:
train_days_since = np.array(train['date_recorded'].values, dtype='datetime64')
test_days_since = np.array(test_features['date_recorded'].values,dtype='datetime64')

train_birth = train['construction_year'].astype(str)
test_birth = test_features['construction_year'].astype(str)

In [14]:
train_years_since = np.datetime_as_string(train_days_since, unit='Y')
test_years_since = np.datetime_as_string(test_days_since, unit='Y')


In [15]:
len(train_years_since)

59400

In [16]:
len(test_years_since)

14358

In [17]:
train_years_since = train_years_since.astype(int)
test_years_since = test_years_since.astype(int)
train_birth = train_birth.astype(int)
test_birth = test_birth.astype(int)

in_train_years = []
in_test_years = []

for i in range(0,len(train_years_since)):
    x = train_years_since[i] - train_birth[i]
    in_train_years.append(x)
    

for i in range(0,len(test_years_since)):
    x = test_years_since[i] - test_birth[i]
    in_test_years.append(x)

In [18]:
train['years_until_record'] = in_train_years
test_features['years_until_record'] = in_test_years

In [19]:
drop_these=[
    'date_recorded',
    'wpt_name',
    'recorded_by',
    'lga',
    'ward',
    'scheme_name', 
    'scheme_management',
    'funder',
    'installer',
    'num_private',
    'subvillage',
    'basin',
    'longitude',
    'latitude',
    'region',
    'waterpoint_type_group',
    'extraction_type_group',
    'extraction_type_class',
    'management_group', 
]
train.drop(columns=drop_these,inplace=True)
test_features.drop(columns=drop_these,inplace=True)

train.shape, test_features.shape

((59400, 22), (14358, 22))

In [20]:
ohe = ce.OneHotEncoder(use_cat_names=True,handle_unknown='ignore',impute_missing=False)

train_features = ohe.fit_transform(train)
test_features = ohe.fit_transform(test_features)

train_features.shape, test_features.shape

((59400, 109), (14358, 108))

In [21]:
columns = train_features.columns
test_cols = test_features.columns
for col in columns:
    if col not in test_cols:
        print(col)

extraction_type_other - mkulima/shinyanga


In [22]:
train_features.drop(columns=['extraction_type_other - mkulima/shinyanga'],inplace=True)

In [23]:
X_train = train_features
X_test = test_features
y_train = train_labels['status_group']
y_train.shape

(59400,)

In [24]:
# pipeline = make_pipeline(
#     StandardScaler(),
#     SelectKBest(f_classif),
#     LogisticRegression()
# )

# param_grid = {
#     'selectkbest__k': range(1,X_train.shape[1]+1),
#     'logisticregression__solver':['lbfgs'],
#     'logisticregression__multi_class':['multinomial'],
#     'logisticregression__warm_start':[True],
#     'logisticregression__max_iter':[500,1000,1500],
#     'logisticregression__class_weight':[None, 'balanced'],
#     'logisticregression__random_state':[42,101,369]
# }

# gridsearch = GridSearchCV(pipeline, param_grid = param_grid, cv=5, 
#                           scoring='accuracy', iid=False, verbose=10,
#                           refit=True)
# gridsearch.fit(X_train,y_train)

In [25]:
# print(gridsearch.best_estimator_)
# print(gridsearch.best_score_)
# print(gridsearch.best_params_)


In [26]:
# submission=sample_submission.copy()
# submission['status_group'] = gridsearch.predict(X_test)
# submission.head()

In [27]:
# submission.to_csv('submission-015.csv',index=False)

In [None]:
X_train = train_features
X_test = test_features
y_train = train_labels['status_group']
scaler = RobustScaler()
X_train = X_train.drop(columns='id')
X_test = X_test.drop(columns='id')
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

lgb_train = lgb.Dataset(X_train, label=y_train)

params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'multiclass',
          'num_leaves': 42,
          'learning_rate': 0.01,
          'max_bin': 512,
          'subsample_for_bin': 100000,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 2,
          'reg_lambda': 5,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 25,
          'num_class' : 3,
          'metric' : 'multi_logloss'}

model = lgb.LGBMClassifier(
    boosting_type = 'gbdt',
    objective = 'multiclass',
    n_jobs = -1,
    max_depth = params['max_depth'],
    max_bin = params['max_bin'],
    subsample_for_bin = params['subsample_for_bin'],
    subsample = params['subsample'],
    subsample_freq = params['subsample_freq'],
    min_split_gain = params['min_split_gain'],
    min_child_weight = params['min_child_weight'],
    min_child_samples = params['min_child_samples'],
    
)

param_grid = {
    'boosting_type': ['gbdt', 'dart'],
    'learning_rate': [0.001,0.01],
    'num_iterations': [1000,2000],
    'num_leaves': [100,200],
    'min_child_samples': [25,50],
    'max_bin': [200000],    
    'reg_alpha': [1.2],
    'reg_lambda': [1.2],
    'min_split_gain': [0.5]
}
# hopefully this works lol
gridsearch = GridSearchCV(model, param_grid = param_grid,
                          verbose=1,cv=3,n_jobs=,scoring='accuracy')

gridsearch.fit(X_train,y_train)

In [18]:
submission=sample_submission.copy()
submission['status_group'] = gridsearch.predict(X_test)
submission.head()

  Xt = transform.transform(Xt)


Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [19]:
submission.to_csv('submission-016.csv',index=False)

In [20]:
X_train.columns

Index(['public_meeting_True', 'public_meeting_?', 'public_meeting_False',
       'permit_False', 'permit_True', 'permit_?', 'management_vwc',
       'management_wug', 'management_other', 'management_private operator',
       'management_water board', 'management_wua', 'management_company',
       'management_water authority', 'management_parastatal',
       'management_unknown', 'management_other - school', 'management_trust',
       'management_group_user-group', 'management_group_other',
       'management_group_commercial', 'management_group_parastatal',
       'management_group_unknown', 'payment_pay annually', 'payment_never pay',
       'payment_pay per bucket', 'payment_unknown',
       'payment_pay when scheme fails', 'payment_other', 'payment_pay monthly',
       'payment_type_annually', 'payment_type_never pay',
       'payment_type_per bucket', 'payment_type_unknown',
       'payment_type_on failure', 'payment_type_other', 'payment_type_monthly',
       'water_quality_soft',

In [24]:
# scaler = RobustScaler()
# scaler.fit(train_features)
# X_train = scaler.transform(train_features)
# X_test = scaler.transform(test_features)

In [25]:
# pipeline = make_pipeline(
#     DecisionTreeClassifier()
# )
# param_grid={
#     'decisiontreeclassifier__criterion': ['gini'], 
#     'decisiontreeclassifier__min_samples_split': [20,30,40],
#     'decisiontreeclassifier__max_depth': [25,50,100],
#     'decisiontreeclassifier__min_samples_leaf': [3,5,7]
# }

# gridsearch = GridSearchCV(pipeline,param_grid=param_grid,cv=5,scoring='neg_mean_absolute_error',
#                           verbose=1,return_train_score=True)

# gridsearch.fit(train_features, y_train)

In [26]:
scaler = RobustScaler()
scaler.fit(train_features)
X_train = scaler.transform(train_features)
model = RandomForestClassifier()
model.set_params(n_estimators=200,min_samples_leaf=5 ,n_jobs=-1,max_features=0.5)
model.fit(X_train,y_train)

submission=sample_submission.copy()
submission['status_group'] = model.predict(X_test)
submission.head()

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [27]:
submission.to_csv('submission-011.csv',index=False)

In [35]:
X_train = train_features.astype(float)
X_test = test_features.astype(float)

In [36]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [37]:
from sklearn.neural_network import MLPClassifier

In [38]:
model = MLPClassifier()

In [45]:
model.set_params(hidden_layer_sizes=(20,40,60,40,20), max_iter=500,
                 solver='sgd')
model.fit(X_train,y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 40, 60, 40, 20), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='sgd', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [46]:
model.score(X_train, y_train)

0.7957575757575758

In [50]:
submission=sample_submission.copy()
submission['status_group'] = model.predict(X_test)
submission.head()

Unnamed: 0,id,status_group
0,50785,functional
1,51630,non functional
2,17168,functional
3,45559,non functional
4,49871,non functional


In [51]:
submission.to_csv('submission-013.csv',index=False)