In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm import tqdm

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this would also hide pandas chained-assignment warnings from the
# imputation cell further down - consider narrowing the filter.
warnings.filterwarnings("ignore")
# Apply matplotlib's 'ggplot' style sheet to all figures drawn below.
plt.style.use('ggplot')

import gc

from sklearn.impute import SimpleImputer
import xgboost
from lightgbm import LGBMRegressor as lgbm

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Main data table, read from the Kaggle input directory of the competition.
df = pd.read_csv("../input/tabular-playground-series-jun-2022/data.csv")
# Submission template; the final cell reads this file again with index_col='row-col'.
submission = pd.read_csv("../input/tabular-playground-series-jun-2022/sample_submission.csv")

In [None]:
# Preview the first rows of the raw table.
df.head()

In [None]:
# Print the column dtypes and the memory footprint before any downcasting.
df.info()

In [None]:
# Shrink the numeric columns to reduce memory use.
#   float64 -> float32: float16 keeps only about three significant decimal digits,
#                       which needlessly degrades the observed feature values.
#   int64   -> the smallest signed integer type able to hold every value of the column;
#              a blind cast to int16 silently wraps values outside [-32768, 32767]
#              (e.g. a large running id).
for col_name in df.columns:
    col_dtype = df[col_name].dtype
    if col_dtype == 'float64':
        df[col_name] = df[col_name].astype('float32')
    elif col_dtype == 'int64':
        df[col_name] = pd.to_numeric(df[col_name], downcast='integer')

In [None]:
%%capture
# Rebuild LightGBM with GPU support, following
# https://www.kaggle.com/code/abhishek/running-lightgbm-on-gpu/notebook
# Remove the preinstalled package so the source build replaces it.
!pip uninstall -y lightgbm
# Boost development headers are needed by the GPU (OpenCL) build.
!apt-get install -y libboost-all-dev
# Pin the checkout to a release tag: the install cell below runs python-package/setup.py,
# which newer LightGBM releases no longer ship, and an unpinned clone of the default
# branch is not reproducible.
!git clone --recursive --branch v3.3.2 https://github.com/Microsoft/LightGBM

In [None]:
%%bash
# Configure and compile LightGBM with the OpenCL GPU backend.
# Stop at the first failing command so a broken step is not masked by the ones after it.
set -e
cd LightGBM
# -f / -p keep the cell re-runnable: the build directory may be absent (first run)
# or already present (re-run).
rm -rf build
mkdir -p build
cd build
# Point CMake at the OpenCL library and headers that ship with the CUDA toolkit.
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
# Compile with one job per available CPU.
make -j$(nproc)

In [None]:
%%capture
# Install the Python package from the cloned sources; --precompile presumably makes
# setup.py reuse the library built in the previous cell - confirm against the LightGBM docs.
# NOTE(review): lightgbm is already imported in the first cell of this notebook, so the
# running kernel may keep using the previously loaded build until it is restarted - confirm.
!cd LightGBM/python-package/;python setup.py install --precompile
# Write an OpenCL ICD (vendor) file that names the NVIDIA OpenCL library.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# Delete the source checkout once the package is installed.
!rm -r LightGBM

In [None]:
def percent_missing(df):
    """Return the percentage of missing values per column, largest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name. Holds ``100 * n_missing / len(df)`` for every
        column that has at least one missing value, sorted in descending order;
        columns without missing values are left out.
    """
    missing_counts = df.isna().sum()
    missing_pct = 100 * missing_counts / len(df)
    has_missing = missing_pct > 0
    return missing_pct[has_missing].sort_values(ascending=False)

In [None]:
# Share of missing values (in percent) for every column that has any, largest first.
percent_nan = percent_missing(df)

In [None]:
# Bar chart: one bar per column that has missing values, height = % missing.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=percent_nan.index, y=percent_nan, ax=ax)
# Rotate the column names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90);
ax.set_xlabel('Columns')
ax.set_ylabel('% missing values')

In [None]:
# Group the columns by the subset number embedded in their names (second '_'-separated
# token, '1'..'4') and draw one correlation heatmap per subset.
features = list(df.columns)
# Never filled or read in this notebook; kept only so the names stay defined.
features_1, features_2, features_3, features_4 = [], [], [], []
# F[k] lists the columns of subset k; slot 0 is a placeholder so indices equal subset numbers.
F = [[], [], [], [], []]
for feature in features:
    name_parts = str(feature).split('_')
    # A column name without an underscore has no subset token; skip it instead of
    # failing with an IndexError.
    if len(name_parts) < 2:
        continue
    if name_parts[1] in ('1', '2', '3', '4'):
        F[int(name_parts[1])].append(feature)
# data[k] will hold the sub-frame of subset k (slot 0 unused, as for F).
data = [[], [], [], [], []]

fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(18, 30))

for i in [1, 2, 3, 4]:
    # Explicit copy: later cells write imputed values into data[i] in place, which must
    # neither depend on nor propagate to df.
    data[i] = df[F[i]].copy()
    corr = data[i].corr()
    sns.heatmap(corr, ax=axs[i-1], annot=True)
    axs[i-1].set_title(f'Correlation between the columns of subset {i}')

In [None]:
import lightgbm as lgb

# Shared LightGBM settings for the per-column imputation models trained further down.
lgb_params = dict(
    boosting_type='gbdt',
    metric='mae',            # validation metric reported and used for early stopping
    learning_rate=0.01,
    num_leaves=16,
    max_depth=4,
    max_bin=63,
    # NOTE(review): LightGBM only performs bagging when subsample_freq / bagging_freq > 0;
    # none is set here, so this value is presumably ignored - confirm the intent.
    subsample=0.6,
    colsample_bytree=0.4,
    verbose=1,
    seed=1983,
    device_type="gpu",       # needs a GPU-enabled LightGBM build
)

# Print the evaluation result every 250 rounds and stop after 20 rounds without
# improvement on the validation set.
# NOTE(review): this one list is passed to every lgb.train call in the imputation loop;
# the early-stopping callback keeps internal state, so verify that reuse across fits is safe.
callbacks = [lgb.log_evaluation(250), lgb.early_stopping(20)]

In [None]:
# 1% of the row count of the current training matrix.
# X is first assigned by the imputation cell further down, so on a fresh kernel
# (Restart & Run All) this cell displays nothing instead of stopping with a NameError.
X.shape[0] * 0.01 if 'X' in globals() else None

In [None]:
# Shape of the current training target.
# y is first assigned by the imputation cell further down, so on a fresh kernel
# (Restart & Run All) this cell displays nothing instead of stopping with a NameError.
y.shape if 'y' in globals() else None

In [None]:
%%time
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Model-based imputation. For every column of the subset that has gaps, a LightGBM
# regressor is fitted on the rows where that column is observed (features = the other
# columns of the same subset, which may themselves contain NaN) and then predicts the
# missing entries. Predictions are written to a separate copy, so every model is trained
# on original values only, never on earlier predictions.
for i in [4]:
    subset_raw = data[i]            # only read from; still contains the NaNs
    subset_filled = data[i].copy()  # receives the predictions
    for column in subset_raw.columns:
        print('Processing Colunm Name : ', column)
        missing_mask = subset_raw[column].isnull()
        n_missing = missing_mask.sum()
        if n_missing == 0:
            print(n_missing)
            continue    # nothing to impute in this column

        observed_rows = subset_raw.loc[~missing_mask]  # target known -> training data
        rows_to_fill = subset_raw.loc[missing_mask]    # target missing -> to be predicted

        X = observed_rows.drop(columns=[column])
        y = observed_rows[column]
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.9, random_state=42)
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

        # Fresh callbacks for every fit (same settings as the shared `callbacks` list):
        # the early-stopping callback stores the best score/iteration internally, and
        # reusing one instance across fits can carry that state over to the next column.
        fit_callbacks = [lgb.log_evaluation(250), lgb.early_stopping(20)]
        # NOTE(review): num_boost_round is left at the lgb.train default (100), so the
        # 250-round logging period is never reached and, with learning_rate=0.01, the
        # models are likely underfitted - consider setting it explicitly.
        model = lgb.train(lgb_params, lgb_train, valid_sets=[lgb_valid], callbacks=fit_callbacks)

        predictions = model.predict(rows_to_fill.drop(columns=[column]))
        # Single .loc write instead of chained indexing (frame[col][rows] = ...), which
        # is not guaranteed to modify the frame. Cast to the column dtype so the column
        # keeps its type.
        subset_filled.loc[missing_mask, column] = predictions.astype(subset_filled[column].dtype)
    data[i] = subset_filled

In [None]:
# Preview subset 4 after the imputation cell above has run.
data[4].head()

In [None]:
from sklearn.impute import SimpleImputer

# Mean-impute subsets 1 and 3: every NaN is replaced by the mean of its column.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
for subset_no in (1, 3):
    subset = data[subset_no]
    # Slice assignment writes the imputed array back into the existing frame, keeping
    # its index and column labels.
    subset[:] = imp.fit_transform(subset)

In [None]:
# Build the submission file. Every index label of the sample submission has the form
# "<row>-<column>"; its value is the entry found at that row / column of the table
# obtained by putting the four imputed subsets side by side.
Merged_Subsets = pd.concat([data[1], data[2], data[3], data[4]], axis=1)
submission = pd.read_csv('../input/tabular-playground-series-jun-2022/sample_submission.csv', index_col='row-col')

# Vectorised lookup: split all labels at once and fetch the values by position, instead
# of one .loc read plus one .loc write per submission row in a Python loop.
label_parts = submission.index.to_series().str.split('-', expand=True)
row_pos = Merged_Subsets.index.get_indexer(label_parts[0].astype(int))
col_pos = Merged_Subsets.columns.get_indexer(label_parts[1])
# get_indexer marks unknown labels with -1; fail loudly (as a .loc lookup would) rather
# than silently picking the last row / column.
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError('submission references a row or column that is missing from Merged_Subsets')
# float64 matches what scalar assignment into the sample file's value column produced.
submission['value'] = Merged_Subsets.to_numpy()[row_pos, col_pos].astype('float64')

submission.to_csv('submission.csv')