In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import PolynomialFeatures
import warnings
import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression,f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb

pd.options.display.max_columns = 100

In [43]:
train_features = pd.read_csv('train_features.csv')
test_features = pd.read_csv('test_features.csv')
train_labels = pd.read_csv('train_labels.csv')
sample_submission = pd.read_csv('sample_submission.csv')

train_features.shape, test_features.shape, train_labels.shape, sample_submission.shape

((59400, 40), (14358, 40), (59400, 2), (14358, 2))

In [44]:
train = train_features.copy()

In [45]:
year_mean = train[train['construction_year']>0]['construction_year'].mean()
year_mean = round(year_mean)

test_year_mean = test_features[test_features['construction_year']>0]['construction_year'].mean()
test_year_mean = round(test_year_mean)


train.loc[train['construction_year']==0, 'construction_year'] = int(year_mean)
test_features.loc[test_features['construction_year']==0,'construction_year'] = int(test_year_mean)

In [46]:
import random

def random_std(year):

    return year - random.randint(-10,10)

def random_tsh(amount):

    return amount + random.uniform(-1062.35, 1957.82)

In [47]:
random_tsh(200)

1748.407980002377

In [48]:
train_tsh_mean = train[train['amount_tsh']>0]['amount_tsh'].mean()
test_tsh_mean = test_features[test_features['amount_tsh']>0]['amount_tsh'].mean()

train.loc[train['amount_tsh']==0,'amount_tsh']=float(train_tsh_mean)
test_features.loc[test_features['amount_tsh']==0,'amount_tsh']=float(test_tsh_mean)

In [49]:
# train.loc[train['construction_year']==1997,'construction_year'].apply(random_std(),axis=1)
# test_features.loc[test_features['construction_year']==1997,'construction_year'].apply(random_std(),axis=1)
# train.loc[train['amount_tsh']==train_tsh_mean, 'amount_tsh'].apply(random_tsh(),axis=1)
# test_features.loc[test_features['amount_tsh']==test_tsh_mean, 'amount_tsh'].apply(random_tsh(),axis=1)

# train.shape, test_features.shape

In [50]:
# train.loc[train['construction_year']==1997,'construction_year'].apply(random_std)
# test_features.loc[test_features['construction_year']==1997,'construction_year'].transform(random_std)
# train.loc[train['amount_tsh']==train_tsh_mean, 'amount_tsh'].transform(random_tsh)
# test_features.loc[test_features['amount_tsh']==test_tsh_mean, 'amount_tsh'].transform(random_tsh)

# train.shape, test_features.shape

In [51]:
train.describe(exclude=np.number)

Unnamed: 0,date_recorded,funder,installer,wpt_name,basin,subvillage,region,lga,ward,public_meeting,recorded_by,scheme_management,scheme_name,permit,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
count,59400,55765,55745,59400,59400,59029,59400,59400,59400,56066,59400,55523,31234,56344,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400
unique,356,1897,2145,37400,9,19287,21,125,2092,2,1,12,2696,2,18,13,7,12,5,7,7,8,6,5,5,10,7,3,7,6
top,2011-03-15,Government Of Tanzania,DWE,none,Lake Victoria,Madukani,Iringa,Njombe,Igosi,True,GeoData Consultants Ltd,VWC,K,True,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
freq,572,9084,17402,3563,10248,508,5294,2503,307,51011,59400,36793,682,38852,26780,26780,26780,40507,52490,25348,25348,50818,50818,33186,33186,17021,17021,45794,28522,34625


In [52]:
train['funder'].fillna('?',inplace=True)
train['installer'].fillna('?',inplace=True)
train['subvillage'].fillna('?',inplace=True)
train['public_meeting'].fillna('?',inplace=True)
train['scheme_management'].fillna('?',inplace=True)
train['scheme_name'].fillna('?',inplace=True)
train['permit'].fillna('?',inplace=True)
test_features['funder'].fillna('?',inplace=True)
test_features['installer'].fillna('?',inplace=True)
test_features['subvillage'].fillna('?',inplace=True)
test_features['public_meeting'].fillna('?',inplace=True)
test_features['scheme_management'].fillna('?',inplace=True)
test_features['scheme_name'].fillna('?',inplace=True)
test_features['permit'].fillna('?',inplace=True)

In [53]:
train['age'] = (2019 - train['construction_year']).astype(int)
test_features['age'] = (2019 - test_features['construction_year']).astype(int)

In [54]:
train_days_since = np.array(train['date_recorded'].values, dtype='datetime64')
test_days_since = np.array(test_features['date_recorded'].values,dtype='datetime64')

train_birth = train['construction_year'].astype(str)
test_birth = test_features['construction_year'].astype(str)

In [55]:
train_years_since = np.datetime_as_string(train_days_since, unit='Y')
test_years_since = np.datetime_as_string(test_days_since, unit='Y')


In [56]:
train_years_since = train_years_since.astype(int)
test_years_since = test_years_since.astype(int)
train_birth = train_birth.astype(int)
test_birth = test_birth.astype(int)

in_train_years = []
in_test_years = []

for i in range(0,len(train_years_since)):
    x = train_years_since[i] - train_birth[i]
    in_train_years.append(x)
    

for i in range(0,len(test_years_since)):
    x = test_years_since[i] - test_birth[i]
    in_test_years.append(x)

In [57]:
train['years_until_record'] = in_train_years
test_features['years_until_record'] = in_test_years

In [58]:
drop_these=[
    'date_recorded',
    'wpt_name',
    'recorded_by',
    'lga',
    'ward',
    'scheme_name', 
    'scheme_management',
    'funder',
    'installer',
    'num_private',
    'subvillage',
    'basin',
    'longitude',
    'latitude',
    'waterpoint_type_group',
    'extraction_type_group',
    'extraction_type_class',
    'management_group', 
]
train.drop(columns=drop_these,inplace=True)
test_features.drop(columns=drop_these,inplace=True)

train.shape, test_features.shape

((59400, 24), (14358, 24))

In [59]:
str_cols = train.select_dtypes(include=[object])

In [60]:
# hashing = ce.HashingEncoder(
#     cols = list(str_cols.columns),
#     verbose=1,
#     drop_invariant=True,
#     return_df=True,
# )

# train_features = hashing.fit_transform(train)
# test_features = hashing.fit_transform(test_features)

# train_features.shape, test_features.shape

In [61]:
# train_features

In [62]:
oe = ce.OrdinalEncoder(verbose=1,mapping=None,cols=None,drop_invariant=True,
                       return_df=True, impute_missing=True,handle_unknown='ignore')

train_features = oe.fit_transform(train)
test_features = oe.fit_transform(test_features)

train_features.shape, test_features.shape

((59400, 24), (14358, 24))

In [63]:
from sklearn.metrics import log_loss, f1_score

In [None]:
X_train = train_features
X_test = test_features
y_train = train_labels['status_group']
scaler = RobustScaler()
X_train = X_train.drop(columns='id')
X_test = X_test.drop(columns='id')
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

lgb_train = lgb.Dataset(X_train, label=y_train)

params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'multiclass',
          'num_leaves': 42,
          'learning_rate': 0.01,
          'max_bin': 255,
          'subsample_for_bin': 100000,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 2,
          'reg_lambda': 5,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 25,
          'num_class' : 3,
          'metric' : 'multi_logloss'}

model = lgb.LGBMClassifier(
    boosting_type = 'gbdt',
    objective = 'multiclass',
    n_jobs = -1,
    max_depth = params['max_depth'],
    max_bin = params['max_bin'],
    subsample_for_bin = params['subsample_for_bin'],
    subsample = params['subsample'],
    subsample_freq = params['subsample_freq'],
    min_split_gain = params['min_split_gain'],
    min_child_weight = params['min_child_weight'],
    min_child_samples = params['min_child_samples'],
    
)


param_grid = {
    'boosting_type': ['dart'],
    'learning_rate': [0.01],
    'num_iterations': [2000,4000],
    'num_leaves': [400,800],
    'min_child_samples': [20],
    'subsample_for_bin': [100000],    
    'reg_alpha': [1],
    'reg_lambda': [1],
    'min_split_gain': [0.5],
    'max_bin': [128,255]
}


# param_grid = {
#     'boosting_type': ['gbdt', 'dart'],
#     'learning_rate': [0.001,0.01],
#     'num_iterations': [1000,2000],
#     'num_leaves': [100,200],
#     'min_child_samples': [25,50],
#     'subsample_for_bin': [200000],    
#     'reg_alpha': [1.2],
#     'reg_lambda': [1.2],
#     'min_split_gain': [0.5]
# }
# hopefully this works lol
gridsearch = GridSearchCV(model, param_grid = param_grid,
                          verbose=0,cv=3,n_jobs=-1,error_score='log_loss')

gridsearch.fit(X_train,y_train)

In [None]:
gridsearch.best_params_

In [None]:
gridsearch.best_score_

In [None]:
params['learning_rate'] = gridsearch.best_params_['learning_rate']
params['num_leaves'] = gridsearch.best_params_['num_leaves']
params['num_iterations'] = gridsearch.best_params_['num_iterations']
params['subsample_for_bin'] = gridsearch.best_params_['subsample_for_bin']
params['min_child_samples'] = gridsearch.best_params_['min_child_samples']
params['reg_alpha'] = gridsearch.best_params_['reg_alpha']
params['reg_lambda'] = gridsearch.best_params_['reg_lambda']
params['min_split_gain'] = gridsearch.best_params_['min_split_gain']


In [None]:
submission=sample_submission.copy()
submission['status_group'] = gridsearch.predict(X_test)
submission.head()

In [None]:
submission.to_csv('submission-017.csv',index=False)

In [30]:
train_features.head()

Unnamed: 0,id,amount_tsh,gps_height,region,region_code,district_code,population,public_meeting,permit,construction_year,extraction_type,management,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,age,years_until_record
0,69572,6000.0,1390,1,11,5,109,1,1,1999,1,1,1,1,1,1,1,1,1,1,1,1,20,12
1,8776,1062.351942,1399,2,20,2,280,2,2,2010,1,2,2,2,1,1,2,2,2,2,2,1,9,3
2,34310,25.0,686,3,21,4,250,1,2,2009,1,1,3,3,1,1,1,1,3,3,2,2,10,4
3,67743,1062.351942,263,4,90,63,58,1,2,1986,2,1,2,2,1,1,3,3,4,4,1,2,33,27
4,19728,1062.351942,0,5,18,1,0,1,2,1997,1,3,2,2,1,1,4,4,2,2,2,1,22,14


In [31]:
test_features.head()

Unnamed: 0,id,amount_tsh,gps_height,region,region_code,district_code,population,public_meeting,permit,construction_year,extraction_type,management,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,age,years_until_record
0,50785,1070.88813,1996,3,21,3,321,1,2,2012,6,9,2,2,1,1,4,4,2,2,2,4,7,1
1,51630,1070.88813,1569,17,2,2,300,1,2,2000,1,1,2,2,1,1,2,2,1,1,1,1,19,13
2,17168,1070.88813,1567,19,13,2,500,1,3,2010,6,1,2,2,1,1,2,2,2,2,2,4,9,3
3,45559,1070.88813,267,15,80,43,250,2,2,1987,6,1,4,4,1,1,3,3,6,6,1,4,32,26
4,49871,500.0,1260,10,10,3,60,2,2,2000,1,5,7,7,1,1,1,1,1,1,1,1,19,13


In [None]:
#TODOs:
#get the std function to work
#get the apply method to work for our amount and years
#probably go to office hours at 1


In [33]:
X_train = train_features
X_test = test_features
y_train = train_labels['status_group']
scaler = StandardScaler()
X_train = X_train.drop(columns='id')
X_test = X_test.drop(columns='id')
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

lgb_train = lgb.Dataset(X_train, label=y_train)

params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'multiclass',
          'num_leaves': 42,
          'learning_rate': 0.01,
          'max_bin': 512,
          'subsample_for_bin': 100000,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 2,
          'reg_lambda': 5,
          'min_split_gain': 0.5,
          'min_child_weight': 3,
          'min_child_samples': 25,
          'num_class' : 3,
          'metric' : 'multi_logloss'}

model = lgb.LGBMClassifier(
    boosting_type = 'gbdt',
    objective = 'multiclass',
    n_jobs = -1,
    max_depth = params['max_depth'],
    max_bin = params['max_bin'],
    subsample_for_bin = params['subsample_for_bin'],
    subsample = params['subsample'],
    subsample_freq = params['subsample_freq'],
    min_split_gain = params['min_split_gain'],
    min_child_weight = params['min_child_weight'],
    min_child_samples = params['min_child_samples'],
    
)


param_grid = {
    'boosting_type': ['dart'],
    'learning_rate': [0.001,0.01],
    'num_iterations': [1000,2000],
    'num_leaves': [200,400],
    'min_child_samples': [24,42],
    'subsample_for_bin': [200000,400000],    
    'reg_alpha': [1.2],
    'reg_lambda': [1.2],
    'min_split_gain': [0.95]
}


# param_grid = {
#     'boosting_type': ['gbdt', 'dart'],
#     'learning_rate': [0.001,0.01],
#     'num_iterations': [1000,2000],
#     'num_leaves': [100,200],
#     'min_child_samples': [25,50],
#     'subsample_for_bin': [200000],    
#     'reg_alpha': [1.2],
#     'reg_lambda': [1.2],
#     'min_split_gain': [0.5]
# }
# hopefully this works lol
gridsearch = GridSearchCV(model, param_grid = param_grid,
                          verbose=1,cv=3,n_jobs=-1,scoring='accuracy')

gridsearch.fit(X_train,y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 78.8min
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed: 234.0min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_bin=512,
        max_depth=-1, min_child_samples=25, min_child_weight=3,
        min_split_gain=0.5, n_estimators=100, n_jobs=-1, num_leaves=31,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1, subsample_for_bin=100000,
        subsample_freq=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'boosting_type': ['dart'], 'learning_rate': [0.001, 0.01], 'num_iterations': [1000, 2000], 'num_leaves': [200, 400], 'min_child_samples': [24, 42], 'subsample_for_bin': [200000, 400000], 'reg_alpha': [1.2], 'reg_lambda': [1.2], 'min_split_gain': [0.95]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [34]:
gridsearch.best_params_

{'boosting_type': 'dart',
 'learning_rate': 0.01,
 'min_child_samples': 24,
 'min_split_gain': 0.95,
 'num_iterations': 2000,
 'num_leaves': 400,
 'reg_alpha': 1.2,
 'reg_lambda': 1.2,
 'subsample_for_bin': 200000}

In [35]:
gridsearch.best_score_

0.7808249158249159

In [36]:
params['learning_rate'] = gridsearch.best_params_['learning_rate']
params['num_leaves'] = gridsearch.best_params_['num_leaves']
params['num_iterations'] = gridsearch.best_params_['num_iterations']
params['subsample_for_bin'] = gridsearch.best_params_['subsample_for_bin']
params['min_child_samples'] = gridsearch.best_params_['min_child_samples']
params['reg_alpha'] = gridsearch.best_params_['reg_alpha']
params['reg_lambda'] = gridsearch.best_params_['reg_lambda']
params['min_split_gain'] = gridsearch.best_params_['min_split_gain']
submission=sample_submission.copy()
submission['status_group'] = gridsearch.predict(X_test)
submission.head()

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [37]:
submission.to_csv('submission-016.csv',index=False)

In [None]:
X_train = train_features
X_test = test_features
y_train = train_labels['status_group']
scaler = StandardScaler()
X_train = X_train.drop(columns='id')
X_test = X_test.drop(columns='id')
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

lgb_train = lgb.Dataset(X_train, label=y_train)

params = {'boosting_type': 'dart',
          'max_depth' : -1,
          'num_threads': -1,
          'objective': 'multiclass',
          'num_leaves': 800,
          'num_iterations':4000,
          'learning_rate': 0.01,
          'max_bin': 190,
          'subsample_for_bin': 100000,
          'subsample': 0.85,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 2,
          'reg_lambda': 5,
          'min_split_gain': 0.5,
          'min_child_weight': 3,
          'min_child_samples': 25,
          'num_class' : 3,
          'metric' : 'multi_logloss'}

model = lgb.LGBMClassifier(
    boosting_type = params['boosting_type'],
    objective = params['objective'],
    num_threads = params['num_threads'],
    max_depth = params['max_depth'],
    max_bin = params['max_bin'],
    colsample_bytree = params['colsample_bytree'],
    subsample_for_bin = params['subsample_for_bin'],
    subsample = params['subsample'],
    subsample_freq = params['subsample_freq'],
    reg_alpha = params['reg_alpha'],
    reg_lambda = params['reg_lambda'],
    min_split_gain = params['min_split_gain'],
    min_child_weight = params['min_child_weight'],
    min_child_samples = params['min_child_samples'],
    num_class = params['num_class'],
    metric = params['metric']
)


