In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Show what is available under ../input so the data paths used below can be checked against the cell output.
print(os.listdir("../input"))

from contextlib import contextmanager
import time

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import warnings
import gc
# Suppress FutureWarning messages for the rest of the session to keep cell output readable.
warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
from hyperopt import hp, tpe, Trials, fmin, STATUS_OK
from hyperopt.pyll.stochastic import sample
from sklearn.metrics import f1_score, make_scorer
import csv
import ast
from timeit import default_timer as timer

In [None]:
def readCSV():
    """Load the prepared train and test tables and strip non-feature columns.

    Each CSV loses its first column (removed by position) together with the
    ``idhogar`` and ``Id`` columns.  The test table additionally loses
    ``Target``; the training table keeps it.

    Returns:
        tuple: ``(train, test)`` as pandas DataFrames.
    """
    raw_train = pd.read_csv('../input/traintest-set/final_training_set.csv')
    raw_test = pd.read_csv('../input/traintest-set/final_testing_set.csv')

    # The leading column is dropped positionally -- presumably a saved row
    # index; TODO confirm against the files.
    train = (raw_train
             .drop(raw_train.columns[0], axis=1)
             .drop(columns=['idhogar', 'Id']))
    test = (raw_test
            .drop(raw_test.columns[0], axis=1)
            .drop(columns=['idhogar', 'Id', 'Target']))
    return train, test

# Bayesian Optimisation

In [None]:
# Load the prepared frames once; the hyperparameter search reads these module-level names.
train, test = readCSV()
# NOTE(review): `train` still contains the 'Target' column after this line
# (readCSV only removes it from `test`), so any code that turns `train` into a
# feature matrix has to exclude that column itself.
train_labels = train['Target']

In [None]:
def objective(hyperparameters, nfolds=5):
    """Score one hyperopt sample with stratified K-fold LightGBM training.

    Args:
        hyperparameters (dict): One sample drawn from the search space.  The
            nested ``'boosting_type'`` entry is flattened in place and the
            averaged best iteration is stored under ``'n_estimators'``, so the
            dict that is passed in is modified.
        nfolds (int): Number of stratified folds.

    Returns:
        dict: ``loss`` (1 - mean validation macro F1), the flattened
        ``hyperparameters``, ``iteration``, mean and std of the per-fold fit
        ``time``, ``status`` and the mean/std validation ``score``.

    Relies on the module-level names ``train``, ``train_labels``,
    ``macro_f1_score``, ``ITERATION``, ``OUT_FILE`` and ``PROGRESS``; they must
    be defined before the first call.
    """
    # Keep track of evals
    global ITERATION
    ITERATION += 1

    # The search space nests the boosting-specific settings under
    # 'boosting_type'; pull them out and promote them to top-level keys.
    boosting_config = hyperparameters['boosting_type']
    boosting_type = boosting_config['boosting_type']
    if boosting_type == 'dart':
        hyperparameters['drop_rate'] = boosting_config['drop_rate']
    hyperparameters['subsample'] = boosting_config.get('subsample', 1.0)
    hyperparameters['subsample_freq'] = boosting_config.get('subsample_freq', 0)
    hyperparameters['boosting_type'] = boosting_type

    # 'limit_max_depth' is a switch of the search space: when it is off the
    # sampled depth is replaced by -1.
    if not hyperparameters['limit_max_depth']:
        hyperparameters['max_depth'] = -1

    # quniform samples come back as floats; these parameters must be integers.
    for parameter_name in ('max_depth', 'num_leaves', 'subsample_for_bin',
                           'min_child_samples', 'subsample_freq'):
        hyperparameters[parameter_name] = int(hyperparameters[parameter_name])

    # n_estimators is fixed below and decided by early stopping.
    hyperparameters.pop('n_estimators', None)

    # 'limit_max_depth' only steers the search, so it is not forwarded to the
    # classifier. It stays in `hyperparameters` so the logged/returned record
    # is unchanged.
    model_params = {name: value for name, value in hyperparameters.items()
                    if name != 'limit_max_depth'}

    # Using stratified kfold cross validation
    strkfold = StratifiedKFold(n_splits=nfolds, shuffle=True)

    # `train` still carries the label column; it must not be part of the
    # feature matrix, otherwise the model is trained and validated on its own
    # target. errors='ignore' keeps this working if the column was already
    # removed from the global frame.
    features = np.array(train.drop(columns=['Target'], errors='ignore'))
    labels = np.asarray(train_labels).reshape(-1)

    valid_scores = []
    best_estimators = []
    run_times = []

    model = lgb.LGBMClassifier(**model_params, class_weight='balanced',
                               n_jobs=-1, metric='None',
                               n_estimators=10000)

    # Iterate through the folds
    for train_indices, valid_indices in strkfold.split(features, labels):
        X_train, y_train = features[train_indices], labels[train_indices]
        X_valid, y_valid = features[valid_indices], labels[valid_indices]

        start = timer()
        # Train with early stopping on the custom macro F1 metric
        model.fit(X_train, y_train, early_stopping_rounds=100,
                  eval_metric=macro_f1_score,
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  eval_names=['train', 'valid'],
                  verbose=400)
        end = timer()

        # Record the validation fold score, the stopping point and the fit time
        valid_scores.append(model.best_score_['valid']['macro_f1'])
        best_estimators.append(model.best_iteration_)
        run_times.append(end - start)

    score = np.mean(valid_scores)
    score_std = np.std(valid_scores)
    loss = 1 - score

    run_time = np.mean(run_times)
    run_time_std = np.std(run_times)

    # Store the average early-stopping point as the tuned number of trees.
    hyperparameters['n_estimators'] = int(np.mean(best_estimators))

    # Append this evaluation to the results file. The context manager closes
    # the handle even if writing fails; newline='' is what the csv module
    # expects for files it writes.
    with open(OUT_FILE, 'a', newline='') as of_connection:
        csv.writer(of_connection).writerow(
            [loss, hyperparameters, ITERATION, run_time, score, score_std])

    # Display progress
    if ITERATION % PROGRESS == 0:
        display(f'Iteration: {ITERATION}, Current Score: {round(score, 4)}.')

    return {'loss': loss, 'hyperparameters': hyperparameters, 'iteration': ITERATION,
            'time': run_time, 'time_std': run_time_std, 'status': STATUS_OK,
            'score': score, 'score_std': score_std}

In [None]:
# Search space sampled by hyperopt. The first argument of every hp.* call is
# its label; the label strings are reproduced verbatim (including the
# 'gdbt_subsample' / 'colsample_by_tree' spellings).
space = dict(
    # Each boosting type carries its own sub-space; goss uses fixed
    # subsample settings instead of sampled ones.
    boosting_type=hp.choice('boosting_type', [
        dict(boosting_type='gbdt',
             subsample=hp.uniform('gdbt_subsample', 0.5, 1),
             subsample_freq=hp.quniform('gbdt_subsample_freq', 1, 10, 1)),
        dict(boosting_type='dart',
             subsample=hp.uniform('dart_subsample', 0.5, 1),
             subsample_freq=hp.quniform('dart_subsample_freq', 1, 10, 1),
             drop_rate=hp.uniform('dart_drop_rate', 0.1, 0.5)),
        dict(boosting_type='goss',
             subsample=1.0,
             subsample_freq=0),
    ]),
    # When False, objective() overrides the sampled max_depth with -1.
    limit_max_depth=hp.choice('limit_max_depth', [True, False]),
    max_depth=hp.quniform('max_depth', 1, 40, 1),
    num_leaves=hp.quniform('num_leaves', 3, 50, 1),
    # Log-uniform between 0.025 and 0.25.
    learning_rate=hp.loguniform('learning_rate', np.log(0.025), np.log(0.25)),
    subsample_for_bin=hp.quniform('subsample_for_bin', 2000, 100000, 2000),
    min_child_samples=hp.quniform('min_child_samples', 5, 80, 5),
    reg_alpha=hp.uniform('reg_alpha', 0.0, 1.0),
    reg_lambda=hp.uniform('reg_lambda', 0.0, 1.0),
    colsample_bytree=hp.uniform('colsample_by_tree', 0.5, 1.0),
)

In [None]:
def macro_f1_score(labels, predictions):
    """Custom evaluation metric: macro-averaged F1 for multiclass scores.

    Args:
        labels: True class labels of the evaluated rows.
        predictions: Per-class scores, either as a flat array grouped by
            class first (``n_classes * n_samples`` values) or as a 2-D array
            of shape ``(n_samples, n_classes)``.  Which layout arrives depends
            on the LightGBM version.

    Returns:
        tuple: ``('macro_f1', value, True)`` -- name, score and the
        "higher is better" flag.

    The predicted class is the argmax index, so this presumes ``labels`` are
    encoded as 0..n_classes-1 by the caller -- TODO confirm.
    """
    predictions = np.asarray(predictions)

    if predictions.ndim == 2:
        # Already one row per sample: reshaping this layout to
        # (n_classes, -1) would silently scramble the scores.
        predicted_classes = predictions.argmax(axis=1)
    else:
        # Flat, class-major layout: one row per class after the reshape.
        n_classes = len(np.unique(labels))
        predicted_classes = predictions.reshape(n_classes, -1).argmax(axis=0)

    metric_value = f1_score(labels, predicted_classes, average='macro')

    # Return is name, value, is_higher_better
    return 'macro_f1', metric_value, True

In [None]:
# algo = tpe.suggest
# # Record results
# trials = Trials()

# # Create a file and open a connection
# OUT_FILE = 'optimization.csv'
# of_connection = open(OUT_FILE, 'w')
# writer = csv.writer(of_connection)

# MAX_EVALS = 100
# PROGRESS = 10
# N_FOLDS = 5
# ITERATION = 0

# # Write column names
# headers = ['loss', 'hyperparameters', 'iteration', 'runtime', 'score', 'std']
# writer.writerow(headers)
# of_connection.close()

# best = fmin(fn = objective, space = space, algo = tpe.suggest, trials = trials,
#             max_evals = MAX_EVALS)

In [None]:
# import json

# # Save the trial results
# with open('trials.json', 'w') as f:
#     f.write(json.dumps(str(trials)))

# Model Training

In [None]:
def model_training(train, test, n_splits=7, random_state=None):
    """Fit LightGBM on stratified folds and predict the test set once per fold.

    Args:
        train (pd.DataFrame): Training features plus the ``Target`` column.
            The frame is not modified.
        test (pd.DataFrame): Test features with the same columns as the
            training features.
        n_splits (int): Number of stratified folds (default 7).
        random_state: Optional seed forwarded to the fold splitter and the
            classifier; ``None`` (default) keeps the run unseeded.

    Returns:
        list: One array of predicted class labels for ``test`` per fold.
    """
    # Hyperparameter values were copied from an external source that the
    # notebook does not record.
    y = train['Target']
    # Build the feature frame without touching the caller's DataFrame; an
    # in-place drop would make a second call fail with KeyError on 'Target'.
    X = train.drop(columns=['Target'])

    clf = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.1, objective='multiclass',
                             random_state=random_state, silent=True, metric='None',
                             n_jobs=4, n_estimators=5500, class_weight='balanced',
                             colsample_bytree=0.89, min_child_samples=90,
                             num_leaves=56, subsample=0.96)

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    predicts_result = []
    for train_index, valid_index in kf.split(X, y):
        print("###")
        X_train, X_val = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_val = y.iloc[train_index], y.iloc[valid_index]
        # metric='None' registers no built-in metric, so early stopping needs
        # an explicit eval_metric to monitor. Macro F1 is the metric used by
        # the hyperparameter search in this notebook.
        clf.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                eval_metric=macro_f1_score, early_stopping_rounds=100)
        predicts_result.append(clf.predict(test))

    return predicts_result
       

# Load Main Process Function at Once

In [None]:
def readCSV():
    """Read the prepared train/test CSVs and return them without id columns.

    NOTE: this cell redefines ``readCSV``; an earlier cell of the notebook
    contains a definition with the same behaviour.

    For both files the first column (by position), ``idhogar`` and ``Id`` are
    removed.  ``Target`` is removed from the test frame only.

    Returns:
        tuple: ``(train, test)`` as pandas DataFrames.
    """
    train_source = pd.read_csv('../input/traintest-set/final_training_set.csv')
    test_source = pd.read_csv('../input/traintest-set/final_testing_set.csv')

    # Positional drop of the leading column, followed by the named columns.
    train_trimmed = train_source.drop(train_source.columns[0], axis=1)
    test_trimmed = test_source.drop(test_source.columns[0], axis=1)

    train = train_trimmed.drop(columns=['idhogar', 'Id'])
    test = test_trimmed.drop(columns=['idhogar', 'Id', 'Target'])
    return train, test

In [None]:
def main(debug = False):
    """Train the model, predict per household and write the submission files.

    Writes ``intermediate.csv`` (one prediction per row of the prepared test
    set, keyed by ``idhogar``) and ``submission.csv`` (``Id``/``Target`` for
    every row of the competition test file).

    Args:
        debug (bool): Currently unused; kept so existing calls keep working.
    """
    train, test = readCSV()

    predicts_result = model_training(train, test)

    # Prepare results. The prepared test file is read again because readCSV
    # strips the 'idhogar' key that is needed for the merge below.
    results = pd.read_csv('../input/traintest-set/final_testing_set.csv')
    # Fold predictions are combined by averaging the predicted labels and
    # rounding to the nearest integer.
    # NOTE(review): the average can yield a class no fold predicted; a
    # majority vote may be preferable -- confirm the intent.
    results['Target'] = np.array(predicts_result).mean(axis=0).round().astype(int)
    results = results[['idhogar', 'Target']].copy()
    results.to_csv('intermediate.csv', index=False)

    individuals = pd.read_csv('../input/costa-rican-household-poverty-prediction/test.csv')
    individuals = individuals[['Id', 'idhogar']].copy()

    # Prepare submission. A left merge keeps exactly the rows of the
    # competition test file. An outer merge would also emit a row for every
    # household that exists only in `results`, and those rows have no Id.
    submission = pd.merge(individuals, results, on='idhogar', how='left')
    # Households without a prediction fall back to class 4. Only 'Target' is
    # filled so a missing value in another column is never replaced by 4.
    submission['Target'] = submission['Target'].fillna(4).astype(int)
    submission = submission.drop(columns='idhogar')
    submission.to_csv('submission.csv', index=False)

In [None]:
# Run the whole train -> predict -> write-submission pipeline when this code is
# executed as the main module.
if __name__ == "__main__":
    main(debug= False)

In [None]:
from IPython.display import FileLink
# Last expression of the cell: shows a link to the submission file written by main().
FileLink('submission.csv')


In [None]:
# Spot-check the written submission file.
df = pd.read_csv('./submission.csv')
# Only the last expression of a cell is rendered automatically, so the lookup
# for this one Id has to be displayed explicitly or its result is discarded.
display(df.loc[df['Id'] == 'ID_f09603838'])
df.head()

In [None]:
# Number of distinct household ids in the competition test file.
test = pd.read_csv('../input/costa-rican-household-poverty-prediction/test.csv')
# dropna=False counts a missing id as its own value, like len(unique()) does.
test['idhogar'].nunique(dropna=False)

In [None]:
# Number of distinct household ids in the prepared test set.
test1 = pd.read_csv('../input/traintest-set/final_testing_set.csv')
# dropna=False counts a missing id as its own value, like len(unique()) does.
test1['idhogar'].nunique(dropna=False)

In [None]:
# Compare household ids between the competition test file (`test`) and the
# prepared test set (`test1`).
set1, set2 = (set(frame['idhogar']) for frame in (test, test1))
# Ids present in the competition file but absent from the prepared set ...
missing = sorted(set1.difference(set2))
# ... and ids that only the prepared set contains.
added = sorted(set2.difference(set1))
print(missing)
print(added)

In [None]:
# FileLink is imported again here; an earlier cell already imports it.
from IPython.display import FileLink
# Last expression of the cell: shows a link to the per-household predictions written by main().
FileLink('intermediate.csv')
