In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook follows the steps from the popular book *Hands-On Machine Learning*, which focuses on how machine learning is used in practice.

**Run the cell below and skip the rest of the notebook to produce the preprocessed data and proceed to modelling. Read on past that cell if you want to see why I made the choices in its code.**

In [None]:
# To preprocess data, run this code
# X_train, y_train will be training input data and target labels
# X_test will be test input data
# y_test: store predictions here

%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler  # use min max scaler (x- min)/(max - min)


path_to_data = "/kaggle/input/tabular-playground-series-sep-2021/"

sample_solution = pd.read_csv(path_to_data + "sample_solution.csv")
test_data = pd.read_csv(path_to_data + "test.csv")
train_data = pd.read_csv(path_to_data + "train.csv")

# Unpack the training set: every column except the last is an input,
# the last column holds the labels.
X, y = train_data.iloc[:, :-1], train_data.iloc[:, -1]
X_test = test_data

# Fill missing values with the per-column median of the TRAINING data.
# The same medians are reused for the test set, so both sets are imputed
# identically and the test inputs no longer carry NaNs into the model.
# NOTE(review): assumes test.csv has the same input columns as train.csv
# minus the label column -- confirm against the data.
train_medians = X.median()
X = X.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Fit the min-max scaler on the training data only and apply that same
# fitted transform to the test data. Fitting a second scaler on the test set
# would map the two sets onto different scales, so a model trained on X_train
# would see shifted features at prediction time. Scaled test values may fall
# slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X))
X_scaled2 = pd.DataFrame(scaler.transform(X_test))
scaler2 = scaler  # alias kept so any later reference to `scaler2` still works

# Column 0 is the scaled first CSV column (presumably the row `id`), which is
# not a feature; drop it from both sets. Remaining columns keep their
# positional integer labels.
X_train = X_scaled.drop(columns=[0])
X_test = X_scaled2.drop(columns=[0])
y_train = y

The rest of this notebook explains, step by step, why the code above looks the way it does.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
path_to_data = "/kaggle/input/tabular-playground-series-sep-2021/"

# Load the three competition files. The test inputs live in test.csv;
# sample_solution.csv is a separate file and is kept in its own variable.
sample_solution = pd.read_csv(path_to_data + "sample_solution.csv")
test_data = pd.read_csv(path_to_data + "test.csv")
train_data = pd.read_csv(path_to_data + "train.csv")

In [None]:
# Preview the first rows of the training frame to check its column layout.
train_data.head()

In [None]:
# Preview the first rows of the frame loaded into `test_data`.
test_data.head()

In [None]:
# Unpack the training set: every column except the last is an input,
# the last column holds the labels.
X, y = train_data.iloc[:, :-1], train_data.iloc[:, -1]
# Keep the whole frame as test input (not just its first column), matching
# the preprocessing cell at the top of the notebook.
X_test = test_data

In [None]:
# Print the column dtypes, non-null counts and memory usage of the inputs.
X.info()

All columns are numerical, there are no categorical columns. Had there been categorical columns, we would have used the .value_counts() method, which is applied to a column and outputs what categories exist and how many entries belong to each category.

In [None]:
# Draw one histogram per column of X to compare the features' value ranges.
X.hist(figsize=(25, 20), bins=50)
plt.show()

In [None]:
# Display per-column summary statistics (count, mean, std, min, quartiles, max).
X.describe()  # NaN values are skipped when these statistics are computed

In [None]:
# Quantify the missing values: per column, per row, and in the label column.
# The NaN mask is computed once and reused for every statistic below.
is_missing = X.isna()
missing_per_column = is_missing.sum(axis=0)
nan_row_counts = is_missing.sum(axis=1)

print("Count of nans per column")
print(missing_per_column)
print("Count of nans per row")
print(nan_row_counts)
print(y.isna().sum(axis=0))  # number of missing labels (the original run reported none)
print("Percentage of nan values per column is: ")
print((missing_per_column / len(X) * 100).round(2))
print("The proportion of rows that contain nan values is ")
# Count the rows with at least one NaN; int() keeps the arithmetic in plain
# Python numbers so the printed text is formatted exactly as before.
rows_with_nan = int((nan_row_counts > 0).sum())
print(str(round(rows_with_nan / len(X) * 100, 2)) + "%")

There are quite a few missing values per feature. There are three options for dealing with them:

**1.** Get rid of the rows in which that feature is missing.

**2.** Get rid of entire feature.

**3.** Set the values to some value (zero, mean, median, etc.).

I wouldn't remove features for now, and I would also not remove the rows that contain nan values, because that would discard 62% of our training data. Therefore, for me, the most sensible approach is to replace the missing values with zero, the mean, the median, or some other value.

It doesn't make sense to me to replace the nans by the mean, because one can take a look at the features 51, 62 say, which have most of their values concentrated around 0 or around 1, so the mean will be in-between, possibly an outlier for that feature. 

In conclusion, in the following I will fill in any missing value in any feature column with the median of the column it belongs to. Recall that we have no categorical columns; all columns are numerical.

In [None]:
# Replace every NaN with the median of the column it belongs to.
X = X.fillna(X.median())

# Sanity check: total NaN count over the whole frame, which should now print 0.
print(X.isna().sum().sum())

**We are now ready to begin analysing the data. It has no missing values now.**

Feature scaling:

We can see from the histogram plots that the features have quite different ranges. 

In [None]:
from sklearn.preprocessing import MinMaxScaler  # use min max scaler (x- min)/(max - min)

# Min-max scale every column of X and wrap the resulting array back into a
# DataFrame (its columns become positional integers).
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X))

In [None]:
# Plot per-column histograms before (X) and after (X_scaled) scaling so the
# effect of the scaler on each feature's range can be compared.
for frame in (X, X_scaled):
    frame.hist(bins=50, figsize=(25, 20))
plt.show()

In [None]:
# Drop column 0 (the scaled first CSV column, presumably the row `id`) so that
# X_train has the same feature columns as the X_train / X_test produced by the
# preprocessing cell at the top of the notebook. Without this, running the
# whole notebook leaves X_train one column wider than X_test.
X_train = X_scaled.drop(columns=[0])

**We are now ready to fit a model.**