Setup MODIN

In [None]:
import os

# Environment variable that tells Modin which execution engine to use.
# NOTE(review): the import below is plain `pandas`, not `modin.pandas`, so this
# setting has no visible effect in this notebook — confirm which was intended.
os.environ["MODIN_ENGINE"] = "dask"  # Modin will use Dask

import pandas as pd

Import Libraries

In [None]:
import numpy as np
import warnings
import matplotlib.pyplot as plt
# Suppress every warning so the notebook output stays uncluttered.
warnings.filterwarnings('ignore')
# Show up to 80 columns and 80 rows when a frame is displayed.
pd.options.display.max_columns = 80
pd.options.display.max_rows = 80
import seaborn as sns
# IPython magic (not plain Python): render matplotlib figures inline.
%matplotlib inline
# Apply seaborn's default theme to subsequent plots.
sns.set()
def dfy(s):
    """Wrap ``s`` in a ``pandas.DataFrame`` and return it.

    Convenience shorthand for viewing dicts and other frame-compatible
    objects as a table.
    """
    frame = pd.DataFrame(s)
    return frame

In [None]:
# Load the dataset from the pickle file 'cooked.pkl'.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
data = pd.read_pickle('cooked.pkl')

In [None]:
# Partition rows by the `missing_cols` column:
# `dm_1` keeps rows where it equals 1, `dc` keeps rows where it equals 0.
dm_1 = data.loc[data['missing_cols'] == 1]
dc = data.loc[data['missing_cols'] == 0]

#### Make datasets for single column learning<br>
 - Possible methods other than imputation:
 - PCA (dimensionality reduction)
 - Feature elimination

Initial Steps<br>
 - `F_clean[f]` stores the row indexes where feature `f` is not missing
 - Separate the rows based on the number of missing columns in each row

In [None]:
# Count missing values per column, then collect the names of the columns
# that have at least one missing value.
missing_ = data.isna().sum(axis='index')
missing_features = np.array(missing_.loc[missing_ > 0].index)

    missing_features = array(['F_1_0', 'F_1_1', 'F_1_2', 'F_1_3', 'F_1_4', 'F_1_5', 'F_1_6',
       'F_1_7', 'F_1_8', 'F_1_9', 'F_1_10', 'F_1_11', 'F_1_12', 'F_1_13',
       'F_1_14', 'F_3_0', 'F_3_1', 'F_3_2', 'F_3_3', 'F_3_4', 'F_3_5',
       'F_3_6', 'F_3_7', 'F_3_8', 'F_3_9', 'F_3_10', 'F_3_11', 'F_3_12',
       'F_3_13', 'F_3_14', 'F_3_15', 'F_3_16', 'F_3_17', 'F_3_18',
       'F_3_19', 'F_3_20', 'F_3_21', 'F_3_22', 'F_3_23', 'F_3_24',
       'F_4_0', 'F_4_1', 'F_4_2', 'F_4_3', 'F_4_4', 'F_4_5', 'F_4_6',
       'F_4_7', 'F_4_8', 'F_4_9', 'F_4_10', 'F_4_11', 'F_4_12', 'F_4_13',
       'F_4_14'], dtype=object)

In [None]:
# Row indexes per feature: F_clean[f] where f is present, F_dirty[f] where f
# is missing. Indexing `data.index` with the boolean mask avoids copying the
# whole filtered frame just to read its index, and `notna()` replaces the
# `isna() == False` comparison.
F_clean = {f: data.index[data[f].notna().to_numpy()] for f in missing_features}
F_dirty = {f: data.index[data[f].isna().to_numpy()] for f in missing_features}

In [None]:
# Sort the per-column missing counts in place (ascending, the `sort_values` default).
missing_.sort_values(inplace=True)

#### Missing Data Distribution
![Image](missing_data_distribution.png)

#### Densities of the least missing features
##### F_1_10 and F_4_4
![Image](F_1_0density.png)![Image](F_4_4.png)

In [None]:
# Index labels of `missing_`, in its current order, used as the training order.
training_order = missing_.index

#### Next Steps 
 - Developed the training order (features sorted by ascending missing count)
 - Need to examine the predictability of the lower features

In [None]:
def feature_correlations(f_):
    """Plot the correlation of column ``f_`` with the other columns of ``dc``.

    Computes the correlation coefficient (``np.corrcoef``) between ``dc[f_]``
    and every other column of the module-level frame ``dc``, then draws the
    values as a horizontal seaborn bar plot on a new 18x18 figure. The
    ``missing_cols`` column is excluded.

    Args:
        f_: Name of the column in ``dc`` to correlate against the others.

    Returns:
        None. The plot is drawn on a newly created matplotlib figure.
    """
    target = dc[f_]
    # 'missing_cols' is filtered out here rather than popped afterwards, so
    # the function also works when f_ is 'missing_cols' or when that column
    # is absent (a post-hoc pop would raise KeyError in both cases).
    correlations = {
        col: np.corrcoef(target, dc[col])[0][1]
        for col in dc.columns
        if col != f_ and col != 'missing_cols'
    }

    plt.figure(figsize=(18, 18))
    sns.barplot(y=list(correlations.keys()), x=list(correlations.values()), orient='h')

#### Feature correlations('F_1_10')
![Image](feature_correlations('F_1_10').png)

#### Feature correlations('F_4_4')
![Image](feature_correlations('F_4_4').png)

In [None]:
# Correlation in different scenarios
corr_data = data.drop(['missing_cols'],axis=1).corr()
corr_clean = dc.drop(['missing_cols'],axis=1).corr()

In [None]:
# percent_diff_in_correlation
# Relative difference in absolute correlation between the two matrices:
# (|corr_data| - |corr_clean|) / |corr_clean|
pdic = corr_data.abs().sub(corr_clean.abs()).div(corr_clean.abs())

In [None]:
# finding the columns with maximum and minimum change
# For each column of `pdic`, find which row label shows the largest and the
# smallest relative change in correlation.
correlation_variance_cols = {}
for f in pdic.columns:
    pdic_np = pdic[f].to_numpy()
    # argmax/argmin return positions along the rows of the column, so they are
    # mapped through `pdic.index`. The lookup uses `pdic` because the name
    # `percent_diff_in_correlation` is never defined and raised NameError.
    correlation_variance_cols[f] = {'max_col': pdic.index[pdic_np.argmax()],
                                    'min_col': pdic.index[pdic_np.argmin()]}

In [None]:
# Display the max/min labels as a table.
dfy(correlation_variance_cols)

In [None]:
# finding the magitude
# Magnitude of the largest and the smallest relative change for each column of `pdic`.
variation_magnitude = {
    name: {'max_col': np.max(col.to_numpy()), 'min_col': np.min(col.to_numpy())}
    for name, col in pdic.items()
}

In [None]:
# Display the max/min magnitudes as a table.
dfy(variation_magnitude)

In [None]:
sns.set()
# Horizontal bar chart of the absolute 'min_col' value per feature, sorted ascending.
fig = (
    dfy(variation_magnitude)
    .loc['min_col', :]
    .abs()
    .sort_values()
    .plot(
        kind='barh',
        figsize=(10, 10),
        title='Minimum Variation in Correlation  Observed',
        xlim=(0.6, 1.1),
        colormap='inferno',
    )
)

#### Correlations EDA
![Image](max_corr_var.png)
![Image](sdfs.png)


>For certain features, the correlation computed on the full data differs heavily from the correlation computed on the non-missing slice
>>A deviation distance is needed for further exploration

In [None]:
# One scalar "distance" per column of `pdic`, derived from the norm of that column.
# NOTE(review): `np.linalg.norm` already takes a square root; the extra `** 0.5`
# yields the square root of the norm — confirm this is intended.
correlation_vector_distance = {
    name: np.linalg.norm(col.to_numpy()) ** 0.5
    for name, col in pdic.items()
}

In [None]:
# dfy(correlation_vector_distance)
# correlation_vector_distance
# Bar chart of the deviation distance per feature, with a red reference line at 7.5.
plt.figure(figsize=(50, 5))
feature_names = list(correlation_vector_distance.keys())
sns.barplot(x=feature_names, y=list(correlation_vector_distance.values()))
# The reference line is sized from the data rather than a fixed count, so its
# x and y lengths always match whatever number of features is present.
fig = sns.lineplot(x=feature_names, y=np.full(len(feature_names), 7.5), color='red', linewidth=1.5)

#### Deviation Distance
![Image](devd-Copy1.png)

In [None]:
# Number of features that have a deviation distance.
len(correlation_vector_distance.keys())

In [None]:
# Scratch check: an array of 55 values, all equal to 7.5.
np.ones(55)*7.5