### Experiments using IMLY ###

This notebook contains experimental runs of IMLY on several datasets.  
The results of these experiments are recorded in this [sheet](https://docs.google.com/spreadsheets/d/1E5jcq2w42gN8bMIaeaRJpAdhgSVN-2XDJ_YTHe4qfwY/edit?usp=sharing).

---

### Dataset  #1

#### Diabetes dataset

In [1]:
import experiment_automation_script
from os import path
import pandas as pd

# Look up the registered metadata for the diabetes dataset.
dataset_info = experiment_automation_script.get_dataset_info("diabetes")

# Read the local CSV when it is present; otherwise read from the registered URL.
if path.exists("../data/diabetes.csv"):
    url = "../data/diabetes.csv"
else:
    url = dataset_info['url']
data = pd.read_csv(url, delimiter=",", header=None, index_col=False)

# All columns but the last are features; the last column is the target.
X, Y = data.iloc[:, :-1], data.iloc[:, -1]

# Run the linear-regression experiment on this dataset.
experiment_automation_script.dopify(dataset_info, 'linear_regression', X, Y, 0.60)

Using TensorFlow backend.


From talos.py ---  {'lr': [2, 10, 30], 'first_neuron': [1], 'batch_size': [10], 'epochs': [10], 'weight_regulizer': [None], 'emb_output_dims': [None], 'optimizer': ['nadam'], 'losses': ['mae', 'mse'], 'activation': ['linear'], 'model_name': ['SklearnKerasRegressor']}


100%|██████████| 3/3 [00:05<00:00,  2.29s/it]


Scan Finished!
Epoch 1/1


---

### Dataset  #2

#### UCI Abalone dataset

In [1]:
import experiment_automation_script
import pandas as pd
import numpy as np
from os import path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


dataset_info = experiment_automation_script.get_dataset_info("uci_abalone")

# The file is read with header=None, so the column names are supplied here.
names = ["sex", "length", "diameter", "height", "whole weight",
        "shucked weight", "viscera weight", "shell weight", "rings"]
# Read the local CSV when it is present; otherwise read from the registered URL.
url = "../data/abalone.data.csv" if path.exists("../data/abalone.data.csv") else dataset_info['url']
data = pd.read_csv(url, delimiter=",", header=None, names=names, index_col=False)

# Column labels and row count of the raw frame (not used further in this cell).
col_names = data.columns
num_data = data.shape[0]

# Replace each categorical column by integer codes: np.unique returns the sorted
# unique values and, with return_inverse=True, the index of each row's value.
categorical_col = ['sex']
for col in categorical_col:
    b, c = np.unique(data[col], return_inverse=True)
    data[col] = c

# Keep only rows with 'rings' equal to 9 or 10 and relabel them 0 / 1.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not write into a slice (SettingWithCopyWarning).
data = data[data['rings'].isin([9, 10])].copy()
data['rings'] = data['rings'].map({9: 0, 10: 1})

# NOTE(review): names[:7] stops before "shell weight", so that column is not
# used as a feature. Confirm this is intended; names[:-1] would include it.
feature_list = names[:7]
X = data.loc[:, feature_list]
Y = data[['rings']]

experiment_automation_script.dopify(dataset_info, 'logistic_regression', X, Y, 0.60)

Using TensorFlow backend.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
100%|██████████| 1/1 [00:06<00:00,  6.34s/it]


Scan Finished!
Epoch 1/1


---

### Dataset  #3

#### UCI Iris dataset

In [5]:
import experiment_automation_script
import pandas as pd
import numpy as np
from os import path

dataset_name = "uci_iris"
dataset_info = experiment_automation_script.get_dataset_info(dataset_name)

# Read the local CSV when it is present; otherwise read from the registered URL.
url = "../data/iris.csv" if path.exists("../data/iris.csv") else dataset_info['url']
data = pd.read_csv(url, delimiter=",", header=None, index_col=False)

# The class label is the last column. Replace it with integer codes
# (index into the sorted unique class names returned by np.unique).
label_col = data.columns[-1]
class_name, index = np.unique(data[label_col], return_inverse=True)
# Assign by column label rather than `data.iloc[:, -1] = index`: the iloc form
# writes in place on recent pandas and can leave the column with object dtype.
data[label_col] = index

# Drop the class encoded as 2 so that only two classes remain.
data = data.loc[data[label_col] != 2]
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]

experiment_automation_script.dopify(dataset_info, 'logistic_regression', X, Y, 0.60)

100%|██████████| 1/1 [00:14<00:00, 14.90s/it]


Scan Finished!
Epoch 1/1


---

### Dataset  #4

#### UCI Adult salary dataset

In [6]:
import experiment_automation_script
import pandas as pd
import numpy as np
from os import path

dataset_name = "uci_adult_salary"
dataset_info = experiment_automation_script.get_dataset_info(dataset_name)


names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
         'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
         'hours-per-week', 'native-country', 'target']
# Check for and load the SAME local file. Previously one path was tested and a
# different file (the iris CSV) would have been loaded when the test succeeded.
# NOTE(review): the local file name is assumed by analogy with the other
# datasets in this notebook - adjust it to the actual file in ../data.
local_path = "../data/adult.data.csv"
url = local_path if path.exists(local_path) else dataset_info['url']
# NOTE(review): the delimiter is a single space - confirm it matches the file.
data = pd.read_csv(url, delimiter=" ", header=None, names=names)


# Drop rows where any of these columns holds the "?" placeholder. .copy() makes
# the result an independent frame so the column assignments below are safe.
has_placeholder = (data[["workclass", "occupation", "native-country"]] == "?").any(axis=1)
data = data[~has_placeholder].copy()

# Convert categorical fields to integer codes #
categorical_col = ['workclass', 'education', 'marital-status', 'occupation',
                   'relationship', 'race', 'sex', 'native-country', 'target']

for col in categorical_col:
    b, c = np.unique(data[col], return_inverse=True)
    data[col] = c

# Every column except the final 'target' column is a feature.
feature_list = names[:14]
X = data.loc[:, feature_list]
Y = data[['target']]

experiment_automation_script.dopify(dataset_info, 'logistic_regression', X, Y, 0.60)

# Split the dataset into test and train datasets

# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.60, random_state=0)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
100%|██████████| 1/1 [00:33<00:00, 33.26s/it]


Scan Finished!
Epoch 1/1


---

### Dataset  #5

#### UCI Ad dataset

In [8]:
import experiment_automation_script
import pandas as pd
import numpy as np
from os import path
from sklearn.preprocessing import LabelEncoder

dataset_name = "uci_ad"
dataset_info = experiment_automation_script.get_dataset_info(dataset_name)

# Check for and load the SAME local file. Previously the existence test used an
# unrelated path, so the local copy was never picked up.
local_path = "../data/ad.data.csv"
url = local_path if path.exists(local_path) else dataset_info['url']
data = pd.read_csv(url, delimiter=",", header=None, index_col=False)

# Treat cells that are '?' (ignoring surrounding whitespace) as missing,
# then drop every row that has a missing value.

data = data.applymap(lambda val: np.nan if str(val).strip() == '?' else val)
data = data.dropna()


# Label encoding of the last column (the target) #

lb = LabelEncoder()
Y = lb.fit_transform(data.iloc[:, -1])

X = data.iloc[:,:-1]

# Normalize the X values #
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)
# Wrap the arrays back into DataFrames before handing them to dopify.
X = pd.DataFrame(X)
Y = pd.DataFrame(Y)

experiment_automation_script.dopify(dataset_info, 'logistic_regression', X, Y, 0.60)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
100%|██████████| 1/1 [00:02<00:00,  2.51s/it]


Scan Finished!
Epoch 1/1


---

### Dataset  #6

#### UCI Mushroom dataset

In [3]:
from sklearn.model_selection import train_test_split
import copy

# Which sklearn estimator to build, keyed by the short model name.
model_name = 'linear_regression'
model_mappings = {
    'linear_regression': 'LinearRegression',
    'logistic_regression': 'LogisticRegression'
}

# X and Y are not defined in this cell; they must already exist in the kernel
# from a previously executed cell.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.60, random_state=0)

# Class name of the estimator that corresponds to model_name.
name = model_mappings[model_name]

# Fetch the estimator class from sklearn.linear_model by its name.
module = __import__('sklearn.linear_model', fromlist=[name])
imported_module = getattr(module, name)
model = imported_module

# Primal: instantiate with default arguments and fit on the training split.
primal_model = model()
primal_model.fit(x_train, y_train)

# Show which estimator class was actually built.
primal_model.__class__.__name__

'LinearRegression'

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

diabetes = datasets.load_diabetes()

# Full feature matrix (no column selection is applied).
diabetes_X = diabetes.data
# Optional standardisation, currently disabled:
# sc = StandardScaler()
# diabetes.data = sc.fit_transform(diabetes.data)

# Features and target as provided by the loader.
X, Y = diabetes.data, diabetes.target

# 60% of the rows go to the test split; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.60, random_state=0
)

# # Split the data into training/testing sets
# x_train = diabetes_X[:-20]
# x_test = diabetes_X[-20:]

# # Split the targets into training/testing sets
# y_train = diabetes.target[:-20]
# y_test = diabetes.target[-20:]


In [23]:
# temp_data = np.column_stack([X,Y])
# np.savetxt("diabetes.csv", temp_data, delimiter=",")

In [3]:
# Show where the winmltools package is imported from.
import winmltools
winmltools.__file__

'C:\\Users\\shakk\\Anaconda2\\envs\\py36\\lib\\site-packages\\winmltools\\__init__.py'

In [9]:
import onnxmltools

def f1(**kwargs):
    """Build a one-layer Keras model and convert it to ONNX.

    The names json, np, Sequential and Dense are imported in the cell that
    follows this definition; that cell must run before f1 is called.

    Keyword Args:
        params: dict providing 'first_neuron', 'activation', 'optimizer' and
            'losses'. When omitted, the 'params' entry of
            ../imly/architectures/sklearn/params.json is used.
        x_train: array whose shape[1] sets the input dimension of the layer.
            Defaults to np.array([[1], [2]]).

    Returns:
        The result of onnxmltools.convert_keras(model, target_opset=8).
    """
    # Only touch the params file when the caller did not supply params, and
    # close the file handle deterministically.
    if 'params' not in kwargs:
        with open('../imly/architectures/sklearn/params.json') as params_file:
            kwargs['params'] = json.load(params_file)['params']
    kwargs.setdefault('x_train', np.array([[1], [2]]))

    params = kwargs['params']

    model = Sequential()
    model.add(Dense(params['first_neuron'],
                    input_dim=kwargs['x_train'].shape[1],
                    activation=params['activation']))

    model.compile(optimizer=params['optimizer'],
                  loss=params['losses'],
                  metrics=['acc'])
    onnx_model = onnxmltools.convert_keras(model, target_opset=8)
    print(type(model))
    return onnx_model

In [10]:
import json
import numpy as np
from keras import Sequential
from keras.layers import Dense

# f1() returns the converted ONNX model, so `model` is not the Keras model here.
model = f1()

The maximum opset needed by this model is only 7.


<class 'keras.engine.sequential.Sequential'>


In [19]:
type(model)
# TODO: cross-check import (f1p1 and f2p2 combination) - is it possible to edit after the export-import flow?

onnx.onnx_ml_pb2.ModelProto

In [18]:
import onnx
# Write the model held in `model` to a file in the current working directory.
onnx.save(model, './onnx_model.onnx')