In [43]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn import mixture
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from aitia_explorer.app import App

# stop the warning clutter
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_gmm_sample_data(incoming_df, column_list, sample_size):
    """
    Unsupervised Learning in the form of BayesianGaussianMixture to create sample data.
    """
    gmm = mixture.BayesianGaussianMixture(n_components=2,
                                          covariance_type="full",
                                          n_init=100,
                                          random_state=42).fit(incoming_df)
    clustered_data = gmm.sample(sample_size)
    clustered_df = pd.DataFrame(clustered_data[0], columns=column_list)
    return clustered_df

def get_synthetic_training_data(incoming_df):
    """
    Creates synthetic training data by sampling from a BayesianGaussianMixture supplied distribution.
    Synthetic data is then labelled differently from the original data.
    """
    # number of records in df
    number_records = len(incoming_df.index)

    # get sample data from the unsupervised BayesianGaussianMixture
    df_bgmm = get_gmm_sample_data(incoming_df, list(incoming_df), number_records)

    # set the class on the samples
    df_bgmm['original_data'] = 0

    # add the class to a copy of incoming df, stops weird errors due to changed dataframes
    working_df = incoming_df.copy(deep=True)
    working_df['original_data'] = 1

    # concatinate the two dataframes
    df_combined = working_df.append(df_bgmm, ignore_index=True)

    # shuffle the data
    df_combined = df_combined.sample(frac=1)

    # get the X and y
    x = df_combined.drop(['original_data'], axis=1).values
    y = df_combined['original_data'].values
    y = y.ravel()

    return x, y

In [3]:
aitia = App()

In [5]:
df = aitia.data.hepar2_10k_data()

In [6]:
# get ths synthetic data
X, y = get_synthetic_training_data(df)

In [16]:
estimator = GradientBoostingClassifier()
selector = RFE(estimator, n_features_to_select=5, step=3)
selector = selector.fit(X, y)

In [17]:
# sort the feature indexes
feature_indices = []

for i in range(df.shape[1]):
    # see if column has been marked true or false
    if selector.support_[i]:
        feature_indices.append(i)

requested_features = [list(df)[i] for i in feature_indices]

In [18]:
df_reduced = df[requested_features]
df_reduced

Unnamed: 0,RHepatitis,proteins,hbsag_anti,consciousness,hbeag
0,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108
1,-0.167845,7.365366,-0.142857,-0.164669,-0.060108
2,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108
3,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108
4,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108
...,...,...,...,...,...
9995,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108
9996,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108
9997,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108
9998,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108


In [35]:
lr = LinearRegression()
lr.fit(X, y) 
coefs = lr.coef_
features = np.argsort(coefs)[::-1]
features

array([37,  2, 47, 33, 19, 68, 43,  1, 31, 28,  4,  3, 26, 62, 69, 45, 38,
       32, 11,  6, 41, 48, 16,  0, 64,  9, 58, 56, 60, 18, 36, 39, 15, 63,
       66, 30, 42, 12, 22, 52, 44,  7,  5, 24, 53, 55, 54, 59, 34, 10, 61,
       20, 57, 67,  8, 35, 49, 25, 21, 40, 27, 51, 23, 29, 17, 13, 50, 14,
       46, 65])

In [38]:
requested_features = [list(df)[i] for i in features[:5]]
df_reduced = df[requested_features]
df_reduced

Unnamed: 0,platelet,hepatotoxic,density,joints,Hyperbilirubinemia
0,-0.077471,-0.296683,1.250115,-0.352817,-0.282896
1,-0.077471,-0.296683,-0.799926,2.834329,-0.282896
2,-0.077471,-0.296683,-0.799926,-0.352817,-0.282896
3,-0.077471,-0.296683,-0.799926,-0.352817,3.534866
4,2.781241,-0.296683,-0.799926,-0.352817,-0.282896
...,...,...,...,...,...
9995,-0.077471,-0.296683,-0.799926,-0.352817,-0.282896
9996,-0.077471,-0.296683,-0.799926,-0.352817,-0.282896
9997,-0.077471,-0.296683,-0.799926,-0.352817,-0.282896
9998,-0.077471,-0.296683,-0.799926,-0.352817,-0.282896


In [47]:
sgd = SGDClassifier()
sgd.fit(X, y) 
coefs = sgd.coef_
features = np.argsort(coefs)[::-1][0]

In [48]:
requested_features = [list(df)[i] for i in features[:5]]
df_reduced = df[requested_features]
df_reduced

Unnamed: 0,ascites,ChHepatitis,ast,flatulence,RHepatitis
0,-0.393024,-0.433793,-0.982531,-0.853236,-0.167845
1,2.544374,-0.433793,0.118714,1.172009,-0.167845
2,-0.393024,-0.433793,-0.982531,1.172009,-0.167845
3,-0.393024,-0.433793,1.219959,1.172009,-0.167845
4,-0.393024,1.450626,0.118714,-0.853236,-0.167845
...,...,...,...,...,...
9995,-0.393024,-0.433793,-0.982531,1.172009,-0.167845
9996,-0.393024,-0.433793,-0.982531,-0.853236,-0.167845
9997,-0.393024,-0.433793,1.219959,-0.853236,-0.167845
9998,-0.393024,-0.433793,-0.982531,-0.853236,-0.167845


In [45]:
rfc = RandomForestClassifier()
rfc.fit(X, y) 
features = np.argsort(rfc.feature_importances_)[::-1]

In [46]:
requested_features = [list(df)[i] for i in features[:5]]
df_reduced = df[requested_features]
df_reduced

Unnamed: 0,proteins,hcv_anti,encephalopathy,THepatitis,Cirrhosis
0,-0.135771,-0.04363,-0.174347,-0.21094,-0.276595
1,7.365366,-0.04363,-0.174347,-0.21094,-0.276595
2,-0.135771,-0.04363,-0.174347,-0.21094,-0.276595
3,-0.135771,-0.04363,-0.174347,-0.21094,-0.276595
4,-0.135771,-0.04363,-0.174347,-0.21094,-0.276595
...,...,...,...,...,...
9995,-0.135771,-0.04363,-0.174347,-0.21094,-0.276595
9996,-0.135771,-0.04363,-0.174347,-0.21094,-0.276595
9997,-0.135771,-0.04363,-0.174347,-0.21094,-0.276595
9998,-0.135771,-0.04363,-0.174347,-0.21094,-0.276595
