In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import tqdm as tqdm
import multiprocessing

In [None]:
# Read the two competition inputs: the feature table and the sample submission
# (the latter is previewed below).
sample_sub = pd.read_csv("../input/tabular-playground-series-jun-2022/sample_submission.csv")
raw_data = pd.read_csv("../input/tabular-playground-series-jun-2022/data.csv")
sample_sub.head()

In [None]:
# (rows, columns) of the raw feature table
raw_data.shape

In [None]:
# (rows, columns) of the sample submission
sample_sub.shape

# EDA

In [None]:
# Column labels of the raw table with the 'row_id' identifier dropped
columns = list(raw_data.columns)
del columns[columns.index('row_id')]  # raises ValueError if 'row_id' is absent
columns

In [None]:
# Missing-value count per column (the first column is skipped), plus the average of those counts
missing_per_feature = raw_data.iloc[:, 1:].isna().sum()
print("average missing values per feature: ", missing_per_feature.mean())
missing_per_feature

In [None]:
# Summary statistics for every column whose label contains 'F_2'
f2_columns = [label for label in raw_data.columns if 'F_2' in label]
raw_data.loc[:, f2_columns].describe()

In [None]:
# F_1 features: one histogram per column whose label contains 'F_1'
f1_columns = [label for label in raw_data.columns if 'F_1' in label]
raw_data.loc[:, f1_columns].plot.hist(figsize=(10, 10), legend=False, subplots=True)
plt.show()
## Observation: appear ~normally distributed with mean 0, sd 1
## Values lie between -5 and 5

In [None]:
# F_2 features: one histogram per column whose label contains 'F_2'
f2_hist_columns = [label for label in raw_data.columns if 'F_2' in label]
raw_data.loc[:, f2_hist_columns].plot.hist(figsize=(10, 10), legend=False, subplots=True)
plt.show()
## Observation: not all normally distributed; some are heavily right-skewed (log-normal?), none left-skewed
## The unskewed ones appear to be centered on 2.5 with sd ~1.5
## Values lie between 0 and 15 (ALL > 0)

In [None]:
# F_3 features: one histogram per column whose label contains 'F_3'
f3_columns = [label for label in raw_data.columns if 'F_3' in label]
raw_data.loc[:, f3_columns].plot.hist(figsize=(10, 10), legend=False, subplots=True)
plt.show()
## Observation: ~normally distributed with mean 0, sd 1
## Values lie between -5 and 5

In [None]:
# F_4 features: one histogram per column whose label contains 'F_4'
f4_columns = [label for label in raw_data.columns if 'F_4' in label]
raw_data.loc[:, f4_columns].plot.hist(figsize=(10, 10), legend=False, subplots=True)
plt.show()
## Observation: ~normally distributed with mean ~0, sd 2+
## Values lie between -13 and 11 (much wider spread)

# Simple Rule-Based Baseline Test

In [None]:
def simple_impute(submission, seed=123):
    """Fill the value column with random draws from each feature group's assumed distribution.

    Every entry of ``submission['row-col']`` is expected to look like
    ``"<row>-<feature>"``. The feature part selects the distribution
    (first matching tag wins, in this order):

    * ``F_1``: normal, mean 0, sd 1
    * ``F_2``: log-normal whose own mean is 2.5 and sd is 1.5
    * ``F_3``: normal, mean 0, sd 1
    * ``F_4``: normal, mean 0, sd 2

    Args:
        submission: DataFrame with a ``row-col`` column. Its second column
            (position 1) is overwritten with the imputed values, in place.
        seed: Seed of the random generator used for the draws. Calls that share
            a seed replay the same random stream, so pass a distinct seed per
            chunk when a frame is imputed piecewise and independent draws are
            wanted.

    Returns:
        The same ``submission`` object that was passed in.

    Raises:
        ValueError: if a ``row-col`` entry matches none of the feature tags
            (nothing is written to ``submission`` in that case).
    """
    n_rows = len(submission)
    if n_rows == 0:
        return submission

    # Local generator: reproducible without touching numpy's global random state.
    rng = np.random.default_rng(seed)

    # numpy's lognormal takes the mean/sigma of the *underlying normal*, not of the
    # log-normal variable itself, so convert the target moments (mean 2.5, sd 1.5).
    f2_mean, f2_sd = 2.5, 1.5
    f2_sigma_sq = np.log1p((f2_sd / f2_mean) ** 2)
    f2_mu = np.log(f2_mean) - f2_sigma_sq / 2.0
    f2_sigma = np.sqrt(f2_sigma_sq)

    samplers = (
        ('F_1', lambda size: rng.normal(loc=0.0, scale=1.0, size=size)),
        ('F_2', lambda size: rng.lognormal(mean=f2_mu, sigma=f2_sigma, size=size)),
        ('F_3', lambda size: rng.normal(loc=0.0, scale=1.0, size=size)),
        ('F_4', lambda size: rng.normal(loc=0.0, scale=2.0, size=size)),
    )

    # Second '-'-separated piece of each label, i.e. the feature name.
    feature = submission['row-col'].astype(str).str.split('-').str[1]

    values = np.empty(n_rows, dtype=float)
    unassigned = np.ones(n_rows, dtype=bool)
    for tag, draw in samplers:
        # `& unassigned` keeps first-match-wins precedence across the tags.
        hits = feature.str.contains(tag, regex=False, na=False).to_numpy(dtype=bool) & unassigned
        values[hits] = draw(int(hits.sum()))
        unassigned &= ~hits

    if unassigned.any():
        examples = submission['row-col'].to_numpy()[unassigned][:5].tolist()
        raise ValueError(
            "row-col entries with an unrecognised feature group (showing up to 5): "
            + repr(examples)
        )

    # Replace the whole value column in one vectorised assignment.
    submission[submission.columns[1]] = values
    return submission

In [None]:
# Break the sample submission into ten consecutive chunks of 100,000 rows each
chunk_rows = 100000
df_list = [
    sample_sub[first:first + chunk_rows].copy()
    for first in range(0, 10 * chunk_rows, chunk_rows)
]
# Keep the individual chunk names bound to the very same frames held in df_list
sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8, sub9, sub10 = df_list

In [None]:
## Run simple_impute on every chunk; df_list is rebound to the list of returned frames
df_list = list(map(simple_impute, df_list))

In [None]:
# Stack the chunk frames back into a single frame and add a placeholder 'group' column
results = pd.concat(df_list).assign(group="na")
results

## Inspect the distribution of each feature class's imputations

In [None]:
# Label each row with the feature group named in its 'row-col' entry.
# The previous version had its loop header commented out while the indented body
# was left live, so the cell could not be parsed; a row-by-row .loc loop is also
# unnecessary here.
# Tags are applied in reverse so that, should several tags match one entry, the
# earliest (F_1 first) wins — the same precedence as an if/elif chain. Rows that
# match no tag keep whatever 'group' value they already have.
for tag in ('F_4', 'F_3', 'F_2', 'F_1'):
    results.loc[results['row-col'].str.contains(tag, regex=False, na=False), 'group'] = tag

In [None]:
# A bare groupby object only displays its repr; summarise the imputed values per group instead
results.groupby('group')['value'].describe()

## Final Submission

In [None]:
# Keep only the two submission columns and write them to disk
final_submission = results[['row-col', 'value']]
# A bare expression in the middle of a cell is not displayed, so print the shape explicitly
print(final_submission.shape)
final_submission.to_csv('simple_benchmark.csv', index=False)

---

In [None]:
import time
## Compare the time to impute two sub-dfs: plain sequential calls vs. a multiprocessing pool

# Take the first two chunks with a slice (a list cannot be indexed with the tuple 0, 1)
# and work on copies, so the timing run does not modify the frames held in df_list.
timing_chunks = [df.copy() for df in df_list[0:2]]

start = time.time()
# Bound to a new name so df_list itself stays intact for later cells
sequential_result = [df.pipe(simple_impute) for df in timing_chunks]
end = time.time()
print(end - start)

In [None]:
# Time the same two-chunk imputation using a pool of worker processes.
# NOTE(review): with the 'spawn' start method, worker processes may be unable to
# import a function defined in a notebook cell — confirm on the target platform.
start = time.time()
if __name__ == '__main__':
    # The context manager shuts the pool down even if map() raises
    with multiprocessing.Pool(processes=4) as p:
        # Map over a list of frames; iterating a single DataFrame would yield its column labels
        data = p.map(simple_impute, df_list[0:2])
    result = data
end = time.time()
print(end - start)

In [None]:
# Histogram of the imputed values for F_1 entries.
# `submission` is not defined at notebook scope; the imputed values live in `results`.
# Rows are selected by the content of 'row-col' — iterating a DataFrame yields its
# column labels, not its cell values.
f1_imputed = results.loc[results['row-col'].str.contains('F_1', regex=False, na=False), 'value']
f1_imputed.plot.hist(legend=False)
plt.show()