In [None]:
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.metrics import r2_score
import lightgbm as lgb

In [None]:
# check the installed pandas version (and, optionally, its dependencies)
# I run this from time to time to make sure everything is up to date
pd.__version__
#pd.show_versions()

In [None]:
# Paths of the competition input files.
# INPUT_DIR is the single place to change when the data lives somewhere else.
# file_train and file_test deliberately resolve to the same data.csv: the rows
# are split into training / prediction sets per target column later on.
INPUT_DIR = '/kaggle/input/tabular-playground-series-jun-2022'
file_train = INPUT_DIR + '/data.csv'
file_test = INPUT_DIR + '/data.csv'
file_sampleSubmission = INPUT_DIR + '/sample_submission.csv'

In [None]:
# standard loading cell: keeps the original training and test data in memory
# df_train holds the training data, df_test the test data,
# and df_sampleSubmission the sample submission for quick reference
df_train, df_test, df_sampleSubmission = (
    pd.read_csv(csv_path)
    for csv_path in (file_train, file_test, file_sampleSubmission)
)

In [None]:
# show the loaded training data as this cell's output
df_train

In [None]:
# My plan here is to go through each column that has null data and
# make it the target column; that column then defines the test data set,
# i.e. all the rows with null values in that column.
# The rest of the rows are used as training data for that column,
# so this will be many models trained and executed in one program.
# I will use LightGBM because it accepts missing values in the features.

In [None]:
# the targets are all columns of df_train that contain at least one null value
# the training loop takes each of them as the target in turn

has_missing = df_train.isnull().any()
col_targets = df_train.columns[has_missing].tolist()

print('number of targets', len(col_targets), col_targets)

In [None]:
# seed the submission dataframe with a single placeholder row
# (the placeholder is removed again once all predictions are collected)

submission = pd.DataFrame({'row-col': ['remove me'], 'value': [0.00]})
submission

In [None]:
# Train one LightGBM regressor per target column and collect the imputed values.
#
# For every column in col_targets:
#   * rows where the column is null form the prediction ("test") set,
#   * rows where the column is present form the training set,
#   * a k-fold cross-validation is run; each fold's model predicts the
#     prediction set and the fold predictions are averaged,
#   * the averaged values are added to `submission`, keyed "<row_id>-<column>".

# the fold configuration is the same for every target, so it is built once;
# with an integer random_state every split() call shuffles from the same seed
fold_no = 5
kf = model_selection.KFold(n_splits=fold_no, shuffle=True, random_state=0)

# per-target submission pieces, concatenated once after the loop instead of
# re-copying the growing submission frame on every iteration
submission_parts = []

for i, target in enumerate(col_targets):

    # keep a printed record of where we are
    print('starting', target, i, 'of', len(col_targets)-1)

    final_predictions = []
    # out-of-fold predictions keyed by row_id; filled below but not read in this cell
    final_valid_predictions = {}
    scores = []

    # prediction set: rows where the target is missing
    # (boolean indexing already yields a new frame, so no up-front copy is needed)
    df_test_new = df_test[df_test[target].isnull()].reset_index()

    # training set: rows where the target is present
    df_train_new = df_train[~df_train[target].isnull()].reset_index()

    # assign every training row to exactly one validation fold
    df_train_new['kfolds'] = -1
    for fold, (_, valid_indicies) in enumerate(kf.split(X=df_train_new)):
        df_train_new.loc[valid_indicies, "kfolds"] = fold

    # features are all columns except the bookkeeping ones and the target itself;
    # the original column order is preserved
    excluded_columns = {'index', 'row_id', 'kfolds', target}
    useful_features = [col for col in df_train_new.columns
                       if col not in excluded_columns]

    # the prediction set does not change between folds, so prepare it once;
    # the row ids are needed later to build the submission keys
    xtest_ids = df_test_new.row_id.values.tolist()
    xtest = df_test_new[useful_features]

    # go through the folds running the model on each
    for fold in range(fold_no):

        # split the training rows into this fold's training and validation parts
        in_valid_fold = df_train_new['kfolds'] == fold
        train_part = df_train_new[~in_valid_fold].reset_index(drop=True)
        valid_part = df_train_new[in_valid_fold].reset_index(drop=True)

        # row ids of the validation rows, used to key the out-of-fold predictions
        valid_ids = valid_part.row_id.values.tolist()

        # targets and feature matrices
        ytrain = train_part[target]
        yvalid = valid_part[target]
        xtrain = train_part[useful_features]
        xvalid = valid_part[useful_features]

        # run the model
        # n_estimators limited by processing time allowed
        # NOTE(review): stopping_rounds equals n_estimators, so training can never
        # stop before the last round - confirm whether a smaller patience was intended
        model = lgb.LGBMRegressor(random_state=0, n_jobs=-1,
                                  n_estimators=1000)
        model.fit(xtrain, ytrain, eval_set=[(xvalid, yvalid)],
                  callbacks=[lgb.early_stopping(stopping_rounds=1000)])

        # get the validation and test predictions
        preds_valid = model.predict(xvalid)
        test_preds = model.predict(xtest)
        final_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

        # print the validation score
        model_score = r2_score(yvalid, preds_valid)
        print(fold, model_score)
        scores.append(model_score)

    print(np.mean(scores), np.std(scores))

    # one column per fold, one row per prediction-set row; average across the folds
    df_final_predictions = pd.DataFrame(final_predictions).T
    df_final_predictions['mean_preds'] = df_final_predictions.mean(axis=1)

    # build this target's slice of the submission: "<row_id>-<column>" -> value
    row_col_ids = [str(row_id) + '-' + target for row_id in xtest_ids]
    submission_add = pd.DataFrame(row_col_ids, columns=['row-col'])
    submission_add['value'] = df_final_predictions['mean_preds'].tolist()
    submission_parts.append(submission_add)

# add all per-target pieces to the submission frame in a single concat
# (the existing rows of `submission` stay first, in their original order)
submission = pd.concat([submission] + submission_parts)

In [None]:
# remove the dummy placeholder row that was used to seed the frame;
# filtering on the placeholder label (rather than slicing off the first row)
# keeps this cell safe to re-run: a second run no longer drops a real prediction
submission = submission[submission['row-col'] != 'remove me']

In [None]:
# show the assembled submission frame as this cell's output
submission

In [None]:
# write the submission to disk as a csv with the two expected header names
# and without the dataframe index
column_names = ["row-col", "value"]
submission.to_csv(
    "submission.csv",
    index=False,
    header=column_names,
)