In [90]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn import mixture
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from aitia_explorer.app import App

# stop the warning clutter
import warnings
warnings.filterwarnings('ignore')

In [49]:
def get_gmm_sample_data(incoming_df, column_list, sample_size):
    """
    Unsupervised Learning in the form of BayesianGaussianMixture to create sample data.
    """
    gmm = mixture.BayesianGaussianMixture(n_components=2,
                                          covariance_type="full",
                                          n_init=100,
                                          random_state=42).fit(incoming_df)
    clustered_data = gmm.sample(sample_size)
    clustered_df = pd.DataFrame(clustered_data[0], columns=column_list)
    return clustered_df

def get_synthetic_training_data(incoming_df):
    """
    Creates synthetic training data by sampling from a BayesianGaussianMixture supplied distribution.
    Synthetic data is then labelled differently from the original data.
    """
    # number of records in df
    number_records = len(incoming_df.index)

    # get sample data from the unsupervised BayesianGaussianMixture
    df_bgmm = get_gmm_sample_data(incoming_df, list(incoming_df), number_records)

    # set the class on the samples
    df_bgmm['original_data'] = 0

    # add the class to a copy of incoming df, stops weird errors due to changed dataframes
    working_df = incoming_df.copy(deep=True)
    working_df['original_data'] = 1

    # concatinate the two dataframes
    df_combined = working_df.append(df_bgmm, ignore_index=True)

    # shuffle the data
    df_combined = df_combined.sample(frac=1)

    # get the X and y
    x = df_combined.drop(['original_data'], axis=1).values
    y = df_combined['original_data'].values
    y = y.ravel()

    return x, y

In [4]:
aitia = App()

In [5]:
df = aitia.data.hepar2_10k_data()

In [50]:
# get ths synthetic data
X, y = get_synthetic_training_data(df)

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [95]:
models = [LinearRegression, 
          SGDClassifier, 
          RandomForestClassifier, 
          GradientBoostingClassifier, 
          XGBClassifier]

In [94]:
model_results = dict()
for model in models:
    current_model = model()
    # fit the model
    current_model.fit(X_train, y_train) 
    # score
    model_results[type(current_model).__name__] = [current_model.score(X_test, y_test)]

model_df = pd.DataFrame(model_results)
model_df

Unnamed: 0,LinearRegression,SGDClassifier,RandomForestClassifier,GradientBoostingClassifier,XGBClassifier
0,-0.008803,0.5188,1.0,0.9994,1.0
