# To do 
* Forward imputation (iterate forward through null cols and impute)
* Backward and forward imputation (iterate forward, then iterate backward) => take the mean of both results

# Import libraries and data 

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, power_transform

from sklearn.ensemble import HistGradientBoostingRegressor
from lightgbm import LGBMRegressor

import gc 

In [None]:
# Regressor used for model-based imputation; one of "hist", "ridge" or "LGB".
MODEL = "hist" # "hist" "ridge" "LGB"
# Column treated as the prediction target in the single-column validation run.
validation_col = 'F_1_7'   
# Number of boosting rounds (passed as n_estimators to the LGB model).
EPOCHS = 5000

In [None]:
# Load the dataset and the sample submission; the first CSV column becomes the index of each frame.
data = pd.read_csv("../input/tabular-playground-series-jun-2022/data.csv", index_col = 0)
sub = pd.read_csv("../input/tabular-playground-series-jun-2022/sample_submission.csv", index_col = 0)

# Basic data exploration

In [None]:
# Show the first rows of the loaded frame.
data.head()

In [None]:
# Report the frame's dimensions: column count first, then row count.
n_rows, n_cols = data.shape
print("Number of columns", n_cols)
print("Number of rows", n_rows)

In [None]:
# Print dtypes, non-null counts and memory usage per column.
data.info()

In [None]:
# Look at the first rows of two F_2 columns.
data[["F_2_1","F_2_2"]].head()

In [None]:
# Summary statistics for every column.
data.describe()

There are definite groupings of columns, corresponding to the initial underscore value i.e. F_1, F_2, F_3, F_4

## F_ groupings 

In [None]:
# Bucket the column names by feature-group tag (substring match on the name).
F_1_cols, F_2_cols, F_3_cols, F_4_cols = (
    [name for name in data.columns if tag in name]
    for tag in ("F_1", "F_2", "F_3", "F_4")
)

In [None]:
# Summary statistics for the F_1 group.
data[F_1_cols].describe()

In [None]:
# One 100-bin histogram per F_1 column, laid out on a 3x5 grid sharing the y axis.
fig, ax = plt.subplots(3, 5, figsize=(25, 15), sharey=True)
ax = ax.ravel()

for position, name in enumerate(F_1_cols):
    panel = ax[position]
    data[name].plot.hist(ax=panel, bins=100)
    panel.set_title(f"{name}")

fig.suptitle("Histograms of F_1 (float columns)", fontsize=15)
plt.tight_layout()
plt.show()

#### Notes:
Most have a normal distribution: 
* Skewed distributions: F_1_7, F_1_12, F_1_13 ----> candidates for a skew-correcting transform, or for additional features that flag the skewness

In [None]:
# Summary statistics for the F_2 group.
data[F_2_cols].describe()

In [None]:
# One count plot per F_2 column on a 5x5 grid (y axes are independent here).
fig, ax = plt.subplots(5, 5, figsize=(25, 15))
ax = ax.ravel()

for position, name in enumerate(F_2_cols):
    panel = ax[position]
    sns.countplot(x=data[name], ax=panel)
    panel.set_title(f"{name}")

fig.suptitle("Countplots of F_2 columns (interger columns)", fontsize=15)
plt.tight_layout()
plt.show()

#### Notes: 
Not much to discuss here, as we aren't sure where these values originate. However:
* Each column starts at 0, so we can potentially assume these are encoded categorical columns
* Potential for grouping float columns according to integer columns (for imputation)

In [None]:
# Summary statistics for the F_3 group.
data[F_3_cols].describe()

In [None]:
# One 100-bin histogram per F_3 column, laid out on a 5x5 grid sharing the y axis.
fig, ax = plt.subplots(5, 5, figsize=(25, 20), sharey=True)
ax = ax.ravel()

for position, name in enumerate(F_3_cols):
    panel = ax[position]
    data[name].plot.hist(ax=panel, bins=100)
    panel.set_title(f"{name}")

fig.suptitle("Histograms of F_3 (float columns)", fontsize=15)
plt.tight_layout()
plt.show()

#### Notes:
* Skew distributions = F_3_19 ,F_3_21

In [None]:
# Summary statistics for the F_4 group.
data[F_4_cols].describe()

In [None]:
# Number of F_4 columns (the plotting grid below has 3x5 = 15 panels).
len(F_4_cols)

In [None]:
# One 100-bin histogram per F_4 column, laid out on a 3x5 grid sharing the y axis.
fig, ax = plt.subplots(3, 5, figsize=(25, 15), sharey=True)
ax = ax.ravel()

for position, name in enumerate(F_4_cols):
    panel = ax[position]
    data[name].plot.hist(ax=panel, bins=100)
    panel.set_title(f"{name}")

fig.suptitle("Histograms of F_4 (float columns)", fontsize=15)
plt.tight_layout()
plt.show()

#### Notes:
* Skew distributions = F_4_2 ,F_4_3, F_4_8, F_4_9, F_4_14

# Missing values

In [None]:
# Per-column count of missing entries, restricted to columns that have any.
null_counts = data.isnull().sum()
null_vals = null_counts[null_counts > 0]
null_vals

In [None]:
# Bar chart of the missing-value count per affected column.
null_vals.plot.bar(figsize=(25, 7))

#### Notes: 
* F_2 columns don't have missing values

In [None]:
# Heatmap of the missing-value mask with columns on the y axis and rows on the x axis.
missing_mask = data.isna().T
plt.figure(figsize=(20, 10))
sns.heatmap(missing_mask, cmap="viridis")
plt.title("Missing Values")
plt.show()

# Relationships

## Correlation

In [None]:
# Pairwise correlation matrix of all columns.
data.corr()

In [None]:
# Correlation heatmap over every column, colour scale pinned to [-1, 1].
corr_all = data.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(corr_all, vmin=-1, vmax=1, cmap="Spectral")
plt.title("Correlation of all columns")
plt.show()

Correlation in F_2 and F_4 columns 

In [None]:
# Correlation heatmap restricted to the F_2 group, colour scale pinned to [-1, 1].
corr_f2 = data[F_2_cols].corr()
plt.figure(figsize=(20, 10))
sns.heatmap(corr_f2, vmin=-1, vmax=1, cmap="Spectral")
plt.title("Correlation of F_2 columns")
plt.show()

In [None]:
# Correlation heatmap restricted to the F_4 group, colour scale pinned to [-1, 1].
corr_f4 = data[F_4_cols].corr()
plt.figure(figsize=(20, 10))
sns.heatmap(corr_f4, vmin=-1, vmax=1, cmap="Spectral")
plt.title("Correlation of F_4 columns")
plt.show()

# Imputation

### Mean Imputation

In [None]:
# Mean imputer used for the baseline submission.
# The `verbose` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises TypeError.
impute = SimpleImputer(
    strategy='mean',
    copy=False,  # impute in place where possible to avoid an extra copy
    add_indicator=False,
)

In [None]:
# Mean-impute every column and rebuild a labelled frame.
# fit_transform returns a bare array, so both the column names and the
# original row index are restored; later cells look rows up by label.
data_mean = pd.DataFrame(
    impute.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
data_mean

In [None]:
# Fill the submission from the mean-imputed frame.
# Each submission id is split on "-" into a row label and a column name.
# Values are fetched with one vectorised lookup per column instead of one
# scalar .loc write per submission row, which is far slower on large files.
sub_mean = sub.copy(deep=True)
if len(sub_mean):
    id_parts = sub_mean.index.to_series().str.split("-", expand=True)
    if id_parts.shape[1] < 2 or id_parts[1].isnull().any():
        raise ValueError("every submission id must contain a '-' separator")
    row_labels = id_parts[0].astype(int)
    for col, rows_for_col in row_labels.groupby(id_parts[1]):
        # .loc raises KeyError for an unknown row or column, as the scalar lookup did.
        sub_mean.loc[rows_for_col.index, "value"] = (
            data_mean.loc[rows_for_col.to_numpy(), col].to_numpy()
        )
sub_mean.to_csv("sub_mean.csv")
sub_mean

In [None]:
# Sanity check: list any columns that still contain missing values after imputation.
remaining_nulls = data_mean.isnull().sum()
remaining_nulls[remaining_nulls > 0]

# Validation  

In [None]:
# Mean imputer used to fill the feature columns before fitting a model.
# The `verbose` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises TypeError.
impute = SimpleImputer(
    strategy='mean',
    copy=False,  # impute in place where possible to avoid an extra copy
    add_indicator=False,
)

##### Option: as seen above, some columns aren't normally distributed
We can try to normalise these values 

In [None]:
# boxcox_data= power_transform(data.drop(validation_col,axis =1))
# boxcox_data = pd.DataFrame(boxcox_data, columns = data.columns )
#boxcox_data[validation_col] = data[validation_col] 

In [None]:
# Mean-impute every column except the validation target, then re-attach the
# target with its missing values intact.
# The original row index is restored on the imputed frame so that the target
# column is re-attached to the right rows (column assignment aligns on index).
features = data.drop(validation_col, axis=1)
data_mean = pd.DataFrame(
    impute.fit_transform(features),
    columns=features.columns,
    index=features.index,
)
del features  # only needed to build data_mean; release it
data_mean[validation_col] = data[validation_col]
data_mean

In [None]:
# Rows with a known target form the training set; rows where it is missing
# are the ones to predict (target column removed).
target_known = data_mean[validation_col].notna()
train = data_mean[target_known]
test = data_mean[~target_known].drop(columns=validation_col)

y = train[validation_col]
X = train.drop(columns=validation_col)

# Ordered (unshuffled) hold-out split of the labelled rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

## Models

In [None]:
# Build the regressor selected by MODEL.
# The branches are mutually exclusive, and an unknown MODEL value fails here
# rather than as a NameError on `model` in a later cell.
if MODEL == "LGB":
    model = LGBMRegressor(n_estimators=EPOCHS, metric='rmse')
elif MODEL == "ridge":
    model = Ridge()
elif MODEL == "hist":
    # Use the shared EPOCHS setting instead of a second hard-coded 5000.
    model = HistGradientBoostingRegressor(max_iter=EPOCHS)
else:
    raise ValueError(f"Unknown MODEL {MODEL!r}; expected 'hist', 'ridge' or 'LGB'")

In [None]:
# Hold-out check: fit on the first part of the labelled rows and score on the
# unseen remainder, so the reported error is not purely in-sample.
model.fit(X_train, y_train)
print("holdout: ", mean_squared_error(y_test, model.predict(X_test)))

# Refit on every labelled row; these are the predictions used downstream.
cv_out = model.fit(X, y)
preds = model.predict(X)
test_preds = model.predict(test)

# In-sample error of the final fit (optimistic by construction).
print("intrinsic: ", mean_squared_error(y, preds))

In [None]:
# Left: actual vs. predicted target on the labelled (training) rows.
# Right: predictions for the rows whose target is missing.
fig, ax = plt.subplots(1, 2, figsize=(20, 7))
ax[0].scatter(X.index, y, label="actual")
ax[0].scatter(X.index, preds, label="predicted")
# This panel shows training rows, so it is titled accordingly.
ax[0].set_title("train predictions")
ax[0].legend()

ax[1].scatter(test.index, test_preds)
ax[1].set_title("test predictions")
plt.show()

# Full prediction

In [None]:
# Force a garbage-collection pass before the full imputation run.
gc.collect()

In [None]:
# feat_importance = pd.DataFrame(index= data.columns)
# feats_col = pd.DataFrame(data = model.feature_importances_, index= train.drop(col,axis =1).columns, columns = [col])
# feat_importance.merge(feats_col, right_index = True, left_index = True)

In [None]:
def impute_cols(model_in, data_in, impute_object):
    """Fill every column that has missing values with model predictions.

    Columns are processed in frame order. For each target column, all other
    columns are filled by ``impute_object``, ``model_in`` is fitted on the rows
    where the target is known, and its predictions replace the missing target
    entries. Predictions written for earlier columns are therefore part of
    the inputs for later columns.

    Parameters
    ----------
    model_in : estimator with ``fit(X, y)`` and ``predict(X)``
        Refitted once per target column. If it exposes
        ``feature_importances_`` after fitting, importances are recorded and
        plotted.
    data_in : pandas.DataFrame
        Frame with missing values. It is copied; the caller's frame is not
        modified by this function.
    impute_object : transformer with ``fit_transform(X)``
        Used to fill the non-target columns before each fit.

    Returns
    -------
    tuple
        ``(imputed_frame, total_mse, feat_importance)``: the filled copy of
        ``data_in``; the list of in-sample MSE values, one per imputed column
        in processing order; and a frame indexed by ``data_in.columns`` with
        one importance column per imputed target (no columns if the model
        exposes no importances).
    """
    data_in = data_in.copy(deep=True)

    # Fix the list of target columns up front, from the frame actually passed
    # in rather than from a notebook-level variable.
    null_cols = data_in.columns[data_in.isnull().any()]
    feat_importance = pd.DataFrame(index=data_in.columns)

    total_mse = []
    for col in null_cols:
        print(f"Running for {col}")
        features = data_in.drop(col, axis=1)
        # fit_transform returns a bare array: restore the column names and the
        # row index so the target column below is attached to the right rows
        # and the predictions are written back to the right labels.
        data_imp = pd.DataFrame(
            impute_object.fit_transform(features),
            columns=features.columns,
            index=features.index,
        )
        data_imp[col] = data_in[col]

        target_missing = data_imp[col].isnull()
        train = data_imp[~target_missing]
        test = data_imp[target_missing].drop(col, axis=1)

        X = train.drop(col, axis=1)
        y = train[col]

        model_in.fit(X, y)
        preds = model_in.predict(X)
        test_preds = model_in.predict(test)

        # In-sample error: measured on the same rows the model was fitted on.
        mse = mean_squared_error(y, preds)
        total_mse.append(mse)
        print("MSE: ", mse)
        data_in.loc[test.index, col] = test_preds

        # Visualize. Importances are read from the model that was just fitted
        # (checked after fit, since the attribute may not exist before it).
        importances = getattr(model_in, "feature_importances_", None)
        if importances is not None:
            feats_col = pd.DataFrame(
                data=importances, index=X.columns, columns=[col]
            ).sort_values(ascending=False, by=[col])
            # Index-aligned assignment keeps every feature row; the target's
            # own row is simply NaN for this column. (An inner merge here
            # would drop the target's row for good on every iteration.)
            feat_importance[col] = feats_col[col]

            # Plot results and feature importance.
            nonzero = feats_col[feats_col[col] != 0]
            fig, ax = plt.subplots(1, 2, figsize=(25, 5))
            ax[0].scatter(X.index, y)
            ax[0].scatter(X.index, preds)
            ax[0].set_title(f"{col} with MSE: {mse}")
            sns.barplot(ax=ax[1], y=nonzero[col], x=nonzero.index)
            ax[1].set_title("Feature importances")
            ax[1].tick_params(labelrotation=90)
            plt.show()
        else:
            plt.figure(figsize=(5, 5))
            plt.scatter(X.index, y)
            plt.scatter(X.index, preds)
            plt.title(f"{col} with MSE: {mse}")
            plt.show()

    print("Mean MSE", np.array(total_mse).mean())

    return data_in, total_mse, feat_importance

In [None]:
# Run the column-by-column model imputation on the full frame and persist the result.
df_out ,total_mse ,feat_importance= impute_cols(model ,data, impute )
df_out.to_csv("data_imputed_out.csv")

In [None]:
# Mean importance of each feature across all imputed target columns.
# Importances are only collected for models that expose them; with none
# collected the frame has no columns, so the plot is skipped instead of
# drawing a chart of NaNs.
if feat_importance.empty:
    print("No feature importances were collected; skipping the plot.")
else:
    plt.figure(figsize=(25, 8))
    feat_mean = feat_importance.mean(axis=1).sort_values(ascending=False)
    sns.barplot(y=feat_mean.values, x=feat_mean.index)
    plt.xticks(rotation=90)
    plt.title("Top feature importances")
    plt.show()

In [None]:
# One-column table (column label 0) of the per-column MSE, indexed by the
# imputed column names, written to disk.
MSE_vals = pd.DataFrame({0: total_mse}, index=null_vals.index)
MSE_vals.to_csv("MSE_values.csv")

In [None]:
# Bar chart of the MSE obtained for each imputed column.
imputed_columns = null_vals.index
plt.figure(figsize=(25, 7))
plt.bar(x=imputed_columns, height=total_mse)
plt.xticks(rotation=90)
plt.show()

# Submission

In [None]:
# Fill the submission from the model-imputed frame.
# Each submission id is split on "-" into a row label and a column name.
# Values are fetched with one vectorised lookup per column instead of one
# scalar .loc write per submission row, which is far slower on large files.
if len(sub):
    id_parts = sub.index.to_series().str.split("-", expand=True)
    if id_parts.shape[1] < 2 or id_parts[1].isnull().any():
        raise ValueError("every submission id must contain a '-' separator")
    row_labels = id_parts[0].astype(int)
    for col, rows_for_col in row_labels.groupby(id_parts[1]):
        # .loc raises KeyError for an unknown row or column, as the scalar lookup did.
        sub.loc[rows_for_col.index, "value"] = (
            df_out.loc[rows_for_col.to_numpy(), col].to_numpy()
        )
sub.to_csv("submission.csv")