<h1 align="center"> EDA + Modelling (using tabular data only) </h1>

<div align="justify">I made this notebook for beginners, after receiving several questions about how to participate in live competitions. It uses the tabular data only and computes its predictions from that data alone. I predicted on test data. You can use this notebook as a starting point and move forward from it. I wrote it with some help from other Kagglers' notebooks and tried to keep it as simple as possible.</div><br> Thanks

In [None]:
import os
import tempfile

import numpy as np
import pandas as pd
from scipy import stats
from IPython.display import display

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
sns.set_palette('deep')
mpl.rcParams['figure.figsize'] = (9,12)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Path of the training table inside the Kaggle input directory.
train_csv = "../input/osic-pulmonary-fibrosis-progression/train.csv"

# Load the table once, indexed by patient id, with an explicit dtype for every
# column, and keep it in a dict that later cells extend with derived frames.
data = dict(
    train_data=pd.read_csv(
        train_csv,
        index_col='Patient',
        dtype=dict(
            Patient='object',
            Weeks=np.int16,
            FVC=np.int32,
            Percent=np.float64,
            Age=np.int8,
            Sex='category',
            SmokingStatus='category',
        ),
    )
)

In [None]:
# data['boolean']: one-hot / binary encoded columns (filled in by a later cell)
# data['numeric']: the training table without its two categorical columns
data['numeric'] = data['train_data'].drop(['Sex', 'SmokingStatus'], axis=1)

In [None]:
# **OneHotEncoding** applied to column SmokingStatus
# **Binarizing** column Sex: True if male, False otherwise
#
# `dtype=bool` replaces the `np.bool` alias, which no longer exists in current
# NumPy. The dense array is obtained with `.toarray()` instead of the
# `sparse=` constructor flag, because that flag was renamed in newer
# scikit-learn releases while the default (sparse output) is stable.
ohe = sklearn.preprocessing.OneHotEncoder(dtype=bool)
smoker_array = ohe.fit_transform(data['train_data'][['SmokingStatus']]).toarray()
# Transpose to (n_categories, n_rows) so each encoded column is addressable by
# position; this is what np.stack(smoker_array, 1) produced.
smoker_stack = smoker_array.T
# NOTE(review): the positional names below assume ohe.categories_ is ordered
# 'Currently smokes', 'Ex-smoker', 'Never smoked' -- confirm against the
# categories displayed in the summary cell.
data['boolean'] = pd.DataFrame().assign(isMale = data['train_data'].Sex=='Male',
                                    currently_smokes = smoker_stack[0],
                                    ex_smoker = smoker_stack[1],
                                    never_smoked = smoker_stack[2])

In [None]:
# Normalization
# data['normalized']: the numeric columns after RobustScaler, wrapped back into
# a DataFrame that reuses the index and column labels of data['numeric'].
scaler = sklearn.preprocessing.RobustScaler()
data['normalized'] = pd.DataFrame(
    scaler.fit_transform(data['numeric']),
    index=data['numeric'].index,
    columns=data['numeric'].columns,
)

In [None]:
# Dataframe summaries, rendered one after another:
#   1. the categories learned by the smoker one-hot encoder
#   2. describe() of the scaled numeric variables
#   3. describe() of the boolean frame as booleans
#   4. describe() of the boolean frame cast to int8 (numeric statistics)
display(
    ohe.categories_,
    data['normalized'].describe(),
    data['boolean'].describe(),
    data['boolean'].astype(np.int8).describe(),
)

> **Questions to answer:**
<br>

*Shape of data*<br>
*Is it balanced?*<br>
*Correlation: Weeks - FVC*<br>
*Prevalence: Age - Weeks*<br>
*Prevalence: Age - FVC*<br>
*Prevalence: SmokingStatus - Percent*<br>

In [None]:
# FVC distribution per smoking status, drawn as split violins (one half per sex)
# with quartile lines inside.
# `ci` and `height` were dropped: they are not violinplot options. Older seaborn
# silently ignored them, while newer releases forward unknown keywords to
# matplotlib, where they raise.
sns.violinplot(x='SmokingStatus', y='FVC',
                hue='Sex', split=True,
                data=data['train_data'],
                inner="quart", linewidth=1,
                palette={'Male': ".75", 'Female': "pink"},
                alpha=.6)

In [None]:
# Pairwise relationships between the columns of the training table, coloured by Sex.
sns.pairplot(data=data['train_data'], hue='Sex')

In [None]:
# Statistics
# Bootstrap sampling function: multiple_balanced_sampling
def multiple_balanced_sampling(data, group_by, epochs=100, samples=1_000,
                               random_state=None):
    """Bootstrap per-group means so that every group is equally represented.

    For each of ``epochs`` rounds and for each distinct value of
    ``data[group_by]``, ``samples`` rows are drawn with replacement from that
    group and the mean of every other column is recorded.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the grouping column plus the columns to average; the
        columns other than ``group_by`` must support ``mean``.
    group_by : column label
        Column whose distinct values define the groups.
    epochs : int, default 100
        Number of bootstrap rounds.
    samples : int, default 1000
        Number of rows drawn (with replacement) per group and round.
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator for the draws. ``None`` keeps the previous
        behaviour of using NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        One row per (round, group): ``group_by`` first, then the remaining
        columns of ``data`` in their original order holding the sampled
        means. Rows containing NaN are dropped. Value columns come back with
        numeric dtypes.

    Raises
    ------
    KeyError
        If ``group_by`` is not a column of ``data``.
    """
    if group_by not in data.columns:
        raise KeyError(group_by)

    value_cols = [col for col in data.columns if col != group_by]
    col_names = [group_by] + value_cols

    # A single generator is shared by all draws so that successive samples
    # differ while the run as a whole is reproducible from one seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Groups in order of first appearance: deterministic, unlike iterating a
    # set. Missing labels are skipped because they cannot be matched with ==.
    categories = list(data[group_by].dropna().unique())

    # The per-group subsets do not change between rounds, so build them once.
    subsets = {category: data.loc[data[group_by] == category, value_cols]
               for category in categories}

    records = []
    for _ in range(epochs):
        for category, subset in subsets.items():
            means = subset.sample(samples, replace=True, random_state=rng).mean()
            records.append({group_by: category, **means.to_dict()})
    return pd.DataFrame(records, columns=col_names).dropna()

In [None]:
samples = 1500      # {type:"slider", min:100, max:10000, step:100}
epochs = 100        # {type:"slider", min:10, max:1000, step:10}
category = "SmokingStatus" # ["Sex", "SmokingStatus"] {allow-input: true}
#
key_ = "bs_" + category
# Take the numeric columns plus the grouping column straight from the source
# table. Joining the two frames on the non-unique 'Patient' index can turn into
# a many-to-many join that repeats rows (depending on pandas version and index
# ordering), which would skew the bootstrap.
data[key_] = data['train_data'][[*data['numeric'].columns, category]]
# Pass the settings by keyword: the function's positional order is
# (epochs, samples), so the positional call used before swapped the two values.
data[key_] = multiple_balanced_sampling(data[key_], category,
                                        epochs=epochs, samples=samples)
# # correcting dtypes
data[key_] = data[key_].infer_objects()
data[key_][category] = data[key_][category].astype('category')
# viz
sns.pairplot(data[key_], hue=category)

In [None]:
samples = 1300      # {type:"slider", min:100, max:10000, step:100}
epochs = 110        # {type:"slider", min:10, max:1000, step:10}
category = "Sex"    # ["Sex", "SmokingStatus"] {allow-input: true}
#
key_ = "bs_" + category
# Take the numeric columns plus the grouping column straight from the source
# table. Joining the two frames on the non-unique 'Patient' index can turn into
# a many-to-many join that repeats rows (depending on pandas version and index
# ordering), which would skew the bootstrap.
data[key_] = data['train_data'][[*data['numeric'].columns, category]]
# Pass the settings by keyword: the function's positional order is
# (epochs, samples), so the positional call used before swapped the two values.
data[key_] = multiple_balanced_sampling(data[key_], category,
                                        epochs=epochs, samples=samples)
# # correcting dtypes
data[key_] = data[key_].infer_objects()
data[key_][category] = data[key_][category].astype('category')
# viz
sns.pairplot(data[key_], hue=category)

In [None]:
normalized_data = True # {type:"boolean"}
if normalized_data:
    data_source = "normalized"
else:
    data_source = "train_data"

# Rank / linear correlation only applies to numeric columns; selecting them
# explicitly keeps the "train_data" option (which carries categorical columns)
# working on pandas versions that no longer drop such columns silently.
corr_input = data[data_source].select_dtypes(include='number')

method_1 = "kendall" #["spearman", "kendall", "pearson"]
corr_bs_1 = corr_input.corr(method_1)
corr_bs_1 = corr_bs_1.abs()

method_2 = "spearman" #["spearman", "kendall", "pearson"]
corr_bs_2 = corr_input.corr(method_2)
# Absolute value of the *second* matrix: previously corr_bs_1 was processed
# twice and corr_bs_2 kept its sign, so the two panels were not comparable.
corr_bs_2 = corr_bs_2.abs()

f, axis = plt.subplots(1, 2, figsize=(10, 4))
# Titles describe what is actually drawn: both panels use the same frame and
# differ only in the correlation method.
axis[0].set_title(f"|{method_1}| correlation ({data_source})")
sns.heatmap(corr_bs_1, cmap='gray', ax = axis[0])
axis[1].set_title(f"|{method_2}| correlation ({data_source})")
sns.heatmap(corr_bs_2, cmap='gray', ax = axis[1])

In [None]:
# Overview of every frame collected so far: name -> (rows, columns).
# Shown instead of the dict itself, whose repr prints every DataFrame in full.
{name: frame.shape for name, frame in data.items()}

In [None]:
# Load the training CSV again for the modelling part, this time with pandas'
# defaults: no index column and no explicit dtypes, so 'Patient' stays a
# regular column (the first load used it as the index).
traindf = pd.read_csv(train_csv)
traindf

In [None]:
# Remove the 'Percent' column and every row whose (Patient, Weeks) pair occurs
# more than once (keep=False discards all copies, not just the extras).
# Re-assigning instead of mutating in place, together with errors='ignore',
# makes the cell safe to run more than once.
traindf = (
    traindf
    .drop(columns=['Percent'], errors='ignore')
    .drop_duplicates(subset=['Patient', 'Weeks'], keep=False)
)

# Create a list of unique patients
patients = traindf['Patient'].unique().tolist()
len(patients)

In [None]:
# Find and append the baseline FVC for each patient.
# The baseline is the FVC of the visit whose week is closest to 0 (smallest
# |Weeks|); on ties the first such row in the table is used, which is what
# min(..., key=abs) selected as well.
# Done with one groupby instead of re-scanning the frame per patient, and
# without int(np.asarray(frame)), a conversion NumPy has deprecated for arrays
# that are not 0-dimensional.
# Relies on traindf having unique index labels so each idxmin label selects
# exactly one row.
closest_visit = traindf['Weeks'].abs().groupby(traindf['Patient']).idxmin()
baseline_by_patient = traindf.loc[closest_visit].set_index('Patient')['FVC']
traindf['base_fvc'] = traindf['Patient'].map(baseline_by_patient)

In [None]:

# Model inputs: patient id, one-hot encoded Sex / SmokingStatus, and min-max
# scaled age, weeks and baseline FVC; the target 'fvc' is kept unscaled.
features = traindf[['Patient', 'Sex', 'SmokingStatus']]
# Explicit integer dtype so the dummy columns are numeric on every pandas
# version (newer releases default to bool, which does not mix into a single
# numeric array with the float columns).
features = pd.get_dummies(features, columns=['Sex', 'SmokingStatus'], dtype=np.uint8)

# Min-max scale each column with its OWN minimum and range. The 'weeks' range
# previously subtracted the minimum Age instead of the minimum Weeks.
# Column order (age, weeks, base_fvc) matters: a later cell slices up to 'weeks'.
for feature_name, source_name in (('age', 'Age'),
                                  ('weeks', 'Weeks'),
                                  ('base_fvc', 'base_fvc')):
    source_col = traindf[source_name]
    features[feature_name] = (source_col - source_col.min()) / \
                             (source_col.max() - source_col.min())

features['fvc'] = traindf['FVC']
features

In [None]:
# Write the feature table to 'tabular_features.pkl' in the current working directory.
features.to_pickle('tabular_features.pkl')

In [None]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split
# Hold out 20% of the patients (not 20% of the rows): grouping by 'Patient'
# keeps all visits of one patient on the same side of the split.
# Only one split is consumed, so a single one is requested.
splitter = GroupShuffleSplit(test_size=.20, n_splits=1, random_state=42)
train_inds, test_inds = next(splitter.split(features, groups=features['Patient']))
# .drop() returns new frames; dropping in place on an iloc slice triggers
# pandas' SettingWithCopyWarning.
train = features.iloc[train_inds].drop(columns='Patient')
test = features.iloc[test_inds].drop(columns='Patient')
# Inputs are all columns up to and including 'weeks'; the columns after it
# ('base_fvc' and the target 'fvc') are left out of X.
X_train = train.loc[:,:'weeks']
y_train = train['fvc']
X_test = test.loc[:,:'weeks']
y_test = test['fvc']

In [None]:
# Display the training inputs produced by the split above.
X_train

In [None]:
X = features[['Sex_Female', 'Sex_Male', 'SmokingStatus_Currently smokes', 'SmokingStatus_Ex-smoker', 'SmokingStatus_Never smoked', 'age', 'weeks']]
y = features['fvc']

# Reuse the patient-grouped row positions computed by GroupShuffleSplit.
# A row-level random split here would overwrite that split and put visits of
# the same patient into both the training and the validation set.
X_train, X_test = X.iloc[train_inds], X.iloc[test_inds]
y_train, y_test = y.iloc[train_inds], y.iloc[test_inds]

y_train

In [None]:
# Build the model
import tensorflow as tf
from tensorflow.keras import datasets, layers, models

# Fully connected regressor declared as one layer list:
# 7 inputs -> Dense(64, relu) -> Dense(32, relu) -> Dense(1, linear).
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(7,)),
    layers.Dense(32, activation='relu'),
    #layers.Dropout(0.3),
    #layers.Dense(16, activation='relu', kernel_regularizer=tf.keras.regularizers.l1(0)),
    layers.Dense(1, activation='linear'),
])

# kernel_regularizer=tf.keras.regularizers.l2(0.001)
model.summary()

In [None]:
# Adam optimiser with mean absolute percentage error as the training loss;
# MAE, MSE and MAPE are additionally reported as metrics.
# `learning_rate` is the supported argument name; the `lr` alias is deprecated
# and rejected by newer Keras releases.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
             loss='mape',
             metrics=['mae', 'mse', 'mape'])

In [None]:
# Specify callbacks for tensorboard
from datetime import datetime

# Each run logs into its own timestamped directory under logs/fit/.
log_dir = f"logs/fit/{datetime.now():%Y%m%d-%H%M%S}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

# Specify early stopping callback: watches val_loss, waits 10 epochs without
# improvement and restores the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='auto',
    patience=10,
    restore_best_weights=True,
)

In [None]:
# Train for at most 300 epochs, validating on the held-out set each epoch.
# The early-stopping callback is now actually passed to fit(): it was created
# but never used, so training always ran all 300 epochs and kept the last
# (not the best) weights.
history = model.fit(X_train, y_train, epochs=300,
                    validation_data=(X_test, y_test),
                    callbacks=[tensorboard_callback, early_stopping],
                    verbose=False)

In [None]:
# Predict on the held-out inputs and compare against the true FVC values.
predictions = model.predict(X_test, batch_size=1)
a = plt.axes(aspect='equal')
# True values go on x and predictions on y so the data matches the axis
# labels; the two arguments were previously passed the other way round.
plt.scatter(y_test, predictions.ravel(), edgecolors=(0, 0, 0))
plt.xlabel('True Values')
plt.ylabel('Predictions')
# Fixed window for both axes; points outside 1500-4000 are not shown.
lims = [1500, 4000]
plt.xlim(lims)
plt.ylim(lims)
# Identity line: a perfect prediction would fall on it.
_ = plt.plot(lims, lims)