# Load Competition Dataset

The competition dataset is located in "/kaggle/input", the path Kaggle defines for accessing the competition files. We will locate two files under this path, the training file and the test file, and use them as input files.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Scan every file below the Kaggle input directory and remember the paths of
# the training and test files. The substring match is applied to the full
# path, 'train' is checked first (so a path matching both is recorded as the
# training path only), and the last matching file wins.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        if 'train' in path:
            __training_path = path
            continue
        if 'test' in path:
            __test_path = path

## Check training and test path

In [None]:
# Show the discovered file paths so they can be checked by eye.
# A NameError here means no path containing 'train' (or 'test') was found.
print(f'Training path:{__training_path}\nTest path:{__test_path}')

In [None]:
# Kaggle Environment Preparation
# Optionally update the Kaggle environment.
import sys
# Uncomment the next line to pin the scikit-learn version if the notebook does not run as-is.
#!{sys.executable} -m pip install --upgrade scikit-learn=="0.24.2" 
# Print the installed versions; record this information if you need to run the kernel internally.
import sklearn; sklearn.show_versions() 

# Input Dataset

In [None]:
def __load__data(__training_path, __test_path, concat=False):
    """Read the training and test CSV files and build the working dataset.

    params: __training_path: path of the training CSV file
    params: __test_path: path of the test CSV file
    params: concat: when True, the working dataset is the training rows
        followed by the test rows with a fresh 0..n-1 index; when False it
        is an independent copy of the training data
    returns: tuple (dataset, train_dataset, test_dataset)
    """
    # LOAD DATA
    import pandas as pd
    __train_dataset = pd.read_csv(__training_path, delimiter=',')
    __test_dataset = pd.read_csv(__test_path, delimiter=',')
    if concat:
        # Stack train on top of test, then discard the per-file row labels.
        stacked = pd.concat([__train_dataset, __test_dataset], axis=0)
        __dataset = stacked.reset_index(drop=True)
    else:
        __dataset = __train_dataset.copy()
    return __dataset, __train_dataset, __test_dataset
# Load with concat=True so __dataset holds the training rows followed by the test rows.
__dataset, __train_dataset, __test_dataset = __load__data(__training_path, __test_path, concat=True)
# Display the combined dataset.
__dataset

# Generate Submission file
We have to keep the following column(s) in submission.csv.
After saving them, we can drop the column(s) from the original dataset, because an ID is unique per row and not useful for training a model.
In some cases, however, an ID may carry useful information; for example, a student ID may encode the admission year and other related information.

In [None]:
# Columns copied from the test set into the submission file.
submission_columns = ['id']

In [None]:
# Start the submission frame from a copy of the test set's submission columns.
submission=pd.DataFrame(__test_dataset[submission_columns].copy())

In [None]:
# DISCARD IRRELEVANT COLUMNS
# Drop exactly the columns that were set aside for the submission file,
# instead of repeating the literal column name here. The drop is done in
# place on the combined dataset.
__dataset.drop(submission_columns, axis=1, inplace=True)

## Remove Missing Data in Numerical Columns

In the given input dataset there are <b>118 columns</b> with missing data, as follows:

f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15, f16, f17, f18, f19, f20, f21, f22, f23, f24, f25, f26, f27, f28, f29, f30, f31, f32, f33, f34, f35, f36, f37, f38, f39, f40, f41, f42, f43, f44, f45, f46, f47, f48, f49, f50, f51, f52, f53, f54, f55, f56, f57, f58, f59, f60, f61, f62, f63, f64, f65, f66, f67, f68, f69, f70, f71, f72, f73, f74, f75, f76, f77, f78, f79, f80, f81, f82, f83, f84, f85, f86, f87, f88, f89, f90, f91, f92, f93, f94, f95, f96, f97, f98, f99, f100, f101, f102, f103, f104, f105, f106, f107, f108, f109, f110, f111, f112, f113, f114, f115, f116, f117, f118

The following code fills in the missing values in those columns. SML replaces the null values in each column with that column's median.

In [None]:
def __fillna__(__dataset):
    """Replace nulls in the numeric feature columns f1..f118 with each column's median.

    The frame passed in is modified in place and the same object is returned.
    Columns other than f1..f118 are left untouched.
    """
    # PREPROCESSING-1
    # Names of the numeric columns that contain missing values: 'f1' .. 'f118'.
    feature_names = [f'f{number}' for number in range(1, 119)]
    for name in feature_names:
        column_median = __dataset[name].median()
        __dataset[name] = __dataset[name].fillna(column_median, axis=0)
    return __dataset
# Impute the missing feature values on the working dataset.
__dataset = __fillna__(__dataset)

## Drop Target Column
We need to drop the target column from the training dataset.

Now let's drop the target column <b>claim</b> from the dataset.

### Set Target Column
The target column is the value which we need to predict.
Therefore, we need to detach the target column from the features before prediction.
Please note that if we don't drop this field, the model will show high accuracy on the training data and poor accuracy on the test data (because the value in the test dataset is null).

In [None]:
# Name of the column to predict; also used as the prediction column of the submission.
target_column_pred = "claim"

In [None]:
# DETACH TARGET
# Split the working dataset into the feature matrix and the target series.
# The column name comes from target_column_pred so that the detached target
# and the prediction column written to the submission always agree.
__feature = __dataset.drop([target_column_pred], axis=1)
__target = __dataset[target_column_pred]

## Split Train/Test
We have to separate the training and test rows before we start training a model.

In [None]:
# TRAIN TEST SPLIT
# The number of rows in the training file is the boundary: rows before it
# go to the training partition, the remaining rows to the test partition.
# NOTE(review): this assumes __feature/__target still list the training rows
# first, i.e. the dataset was loaded with concat=True.
__num_of_training_instances = len(__train_dataset)
__feature_train = __feature.iloc[:__num_of_training_instances]
__feature_test = __feature.iloc[__num_of_training_instances:]
__target_train = __target.iloc[:__num_of_training_instances]
__target_test = __target.iloc[__num_of_training_instances:]
# Display the training features.
__feature_train

# Training Model and Prediction
First, we will train a model, then predict test values based on the trained model.

In [None]:
# MODEL
from lightgbm import LGBMRegressor
# LightGBM regressor constructed with no arguments, i.e. the library defaults.
__lgbmregressor=LGBMRegressor()
# Fit on the training partition only.
__lgbmregressor.fit(__feature_train, __target_train)
# Predict the target for the test partition.
__y_pred = __lgbmregressor.predict(__feature_test)

In [None]:
# Store the predictions in the submission frame under the target column name,
# then display the frame.
submission[target_column_pred]=__y_pred
submission

In [None]:
# Write the submission to 'submission.csv'; index=False keeps the DataFrame index out of the file.
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the final submission DataFrame for review.
submission