In [11]:
import pandas as pd
import numpy as np
import mlexpt
import json
import os

In [3]:
# Load the experiment configuration. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it to the garbage collector.
with open('../configs/LR_fruit_expt_set0.json', 'r') as config_file:
    config = json.load(config_file)
# Bare last expression so the notebook displays the parsed configuration.
config

{'model': {'qualitative_features': ['color', 'shape'],
  'binary_features': ['ripe'],
  'quantitative_features': ['radius', 'weight'],
  'target': 'fruit',
  'embedding_schemes': {},
  'algorithm': 'LogisticRegression',
  'model_parameters': {'device': 'cpu', 'nb_epoch': 100}},
 'data': {'path': 'exampledataset/fruits.json',
  'missing_value_filling': {},
  'torchdevice': 'cpu'},
 'train': {'cross_validation': True,
  'cv_nfold': 5,
  'heldout_fraction': 0.05,
  'persist_model': False,
  'model_path': None},
 'statistics': {'topN': 5,
  'compute_class_performance': True,
  'class_performances_excel': None}}

In [16]:
# --- model section: the three feature groups and the embedding schemes are
# optional (empty defaults); target, algorithm and parameters are required ---
model_section = config['model']
qual_features = model_section.get('qualitative_features', [])
binary_features = model_section.get('binary_features', [])
quant_features = model_section.get('quantitative_features', [])
dr_config = model_section.get('embedding_schemes', {})
labelcol = model_section['target']
algorithm = model_section['algorithm']
model_param = model_section['model_parameters']

# --- train section: cross-validation setup and model persistence (all keys required) ---
train_section = config['train']
do_cv = train_section['cross_validation']
cv_nfold = train_section['cv_nfold']
heldout_fraction = train_section['heldout_fraction']
to_persist_model = train_section['persist_model']
final_model_path = train_section['model_path']

# --- data section: the configured path is resolved relative to the parent directory ---
data_section = config['data']
datapath = os.path.join('..', data_section['path'])
missing_val_default = data_section['missing_value_filling']
data_device = data_section['torchdevice']

# --- statistics section: topN is required, the per-class options are optional ---
statistics_section = config['statistics']
topN = statistics_section['topN']
to_compute_class_performances = statistics_section.get('compute_class_performance', False)
class_performance_excel_file = statistics_section.get('class_performances_excel', None)

In [8]:
from mlexpt.data.adding_features import adding_no_features
from mlexpt.experiment import add_multiple_features

# Build the feature-adding step from a list of adders. Only `adding_no_features`
# is supplied here (presumably a no-op adder -- confirm in mlexpt).
selected_adders = [adding_no_features]
feature_adder = add_multiple_features(selected_adders)

In [17]:
from mlexpt.data.dataload import process_data

# Positional inputs: data path, the three feature groups, then the label column.
feature_groups = (qual_features, binary_features, quant_features)
# Keyword options forwarded unchanged to process_data.
processing_options = dict(
    feature_adder=feature_adder,
    nb_lines_per_tempfile=500,
    missing_val_default=missing_val_default,
)
tempdir, nbdata = process_data(datapath, *feature_groups, labelcol, **processing_options)

# Report the temporary directory name and the second returned value
# (presumably the number of records processed -- confirm in mlexpt), one per line.
print(tempdir.name, nbdata, sep='\n')

/var/folders/pq/_srx84qj30j_byd7sxppfg940000gn/T/tmp2o7u8b58
10000


In [19]:
from mlexpt.data.dataload import iterate_json_files_directory
from mlexpt.utils.core import generate_columndict

# Build the label <-> index mappings by passing the label column as the only
# entry of the first feature list; the other two feature lists are left empty.
label_records = iterate_json_files_directory(tempdir.name)
label2idx, idx2label = generate_columndict(label_records, [labelcol], [], [])

# Strip the first len(labelcol) + 1 characters from every key (presumably the
# label column name plus a one-character separator -- confirm in generate_columndict).
prefix_length = len(labelcol) + 1
target_label_dict = {
    prefixed_label[prefix_length:]: label_index
    for prefixed_label, label_index in label2idx.items()
}

	Qualitative Feature: fruit, number of distinct of values: 


In [22]:
from mlexpt.utils.embeddings import embed_features

# Materialise the records, keeping only the columns named in the embedding
# configuration, then hand them to embed_features with that configuration.
embedding_columns = list(dr_config.keys())
embedding_records = list(
    iterate_json_files_directory(tempdir.name, columns_to_keep=embedding_columns)
)
dimred_dict = embed_features(dr_config, embedding_records)

In [23]:
from mlexpt.utils.datatransform import generate_columndict_withembeddings

# Build the feature <-> index mappings from the processed records, the three
# feature groups and the embedding dictionary.
feature_records = iterate_json_files_directory(tempdir.name)
grouped_features = (qual_features, binary_features, quant_features)
feature2idx, idx2feature = generate_columndict_withembeddings(
    feature_records, *grouped_features, dimred_dict
)

	Qualitative Feature: color, number of distinct of values: 
	Qualitative Feature: shape, number of distinct of values: 
	Binary / Quantitative feature: ripe
	Binary / Quantitative feature: radius
	Binary / Quantitative feature: weight


In [24]:
# Four independent, initially empty result lists (presumably filled once per
# cross-validation round -- nothing in the visible cells appends to them).
overall_performances, top_results_by_class = [], []
weighted_results_by_class, hit_results_by_class = [], []

In [25]:
from mlexpt.utils.datatransform import NumericallyPreparedDataset

In [27]:
from mlexpt.data.dataload import assign_partitions

# Assign partitions for `nbdata` records from the configured number of
# cross-validation folds and the held-out fraction.
# NOTE(review): no random seed is set in the visible cells; if the assignment
# is random, the split will differ between runs -- confirm in assign_partitions.
partitions = assign_partitions(nbdata, cv_nfold, heldout_fraction)

In [28]:
# Fold index for this round: the dataset cells below exclude this partition
# from the training set and use it alone as the test set.
cv_round = 0

In [29]:
# Training data: every fold index in range(cv_nfold) except the one selected
# by cv_round.
training_partitions = [fold for fold in range(cv_nfold) if fold != cv_round]
train_dataset = NumericallyPreparedDataset(
    iterate_json_files_directory(tempdir.name),
    feature2idx,
    qual_features,
    binary_features,
    quant_features,
    dimred_dict,
    labelcol,
    label2idx,
    assigned_partitions=partitions,
    interested_partitions=training_partitions,
    device=data_device,
)

In [30]:
from mlexpt.ml.linear import MulticlassLogisticRegression

# The model is fed dense arrays: anything that is not already a numpy array
# is converted through its toarray() method.
train_X = train_dataset.X
if not isinstance(train_X, np.ndarray):
    train_X = train_X.toarray()
train_Y = train_dataset.Y
if not isinstance(train_Y, np.ndarray):
    train_Y = train_Y.toarray()

# Instantiate the classifier with the configured parameters and fit it.
model = MulticlassLogisticRegression(**model_param)
model.fit(train_X, train_Y)

  0%|          | 0/100 [00:00<?, ?it/s]

Logistic regression trained on: cpu


100%|██████████| 100/100 [00:00<00:00, 153.77it/s]


In [33]:
# Test data: only the partition selected by cv_round.
testing_partitions = [cv_round]
test_dataset = NumericallyPreparedDataset(
    iterate_json_files_directory(tempdir.name),
    feature2idx,
    qual_features,
    binary_features,
    quant_features,
    dimred_dict,
    labelcol,
    label2idx,
    assigned_partitions=partitions,
    interested_partitions=testing_partitions,
    device=data_device,
)

In [34]:
# Convert the test features to a dense array if they are not already a numpy
# array, then ask the fitted model for its predict_proba output.
test_X = test_dataset.X
if not isinstance(test_X, np.ndarray):
    test_X = test_X.toarray()
predicted_Y = model.predict_proba(test_X)

In [55]:
from mlexpt.metrics.statistics import extracting_stats_run

# Dense version of the test labels (converted only if not already a numpy array).
test_Y = test_dataset.Y
if not isinstance(test_Y, np.ndarray):
    test_Y = test_Y.toarray()

# NOTE(review): the recorded run reports weighted_accuracy and hit_accuracy
# above 1 with topN = 5 while the label matrix has only 3 columns; possibly
# topN exceeding the number of classes inflates the counts -- confirm against
# extracting_stats_run before trusting these two metrics.
run_statistics = extracting_stats_run(predicted_Y, test_Y, target_label_dict, topN)
(overall_performance,
 top_result_by_class,
 weighted_result_by_class,
 hit_result_by_class) = run_statistics

		Calculating metrics per class...


In [56]:
# Display the overall performance object returned by extracting_stats_run.
overall_performance

{'nb_testdata': 107,
 'nb_equals': 87,
 'nb_weighted_equals': 141.0,
 'nb_hits': 212,
 'top1_accuracy': 0.8130841121495327,
 'weighted_accuracy': 1.3177570093457944,
 'hit_accuracy': 1.9813084112149533,
 'topN': 5}

In [57]:
# Dense version of the test labels (converted only if not already a numpy array).
dense_test_Y = test_dataset.Y
if not isinstance(dense_test_Y, np.ndarray):
    dense_test_Y = dense_test_Y.toarray()

# Per-row column index of the maximum value in the labels and in the model output.
testY_labelindices = np.argmax(dense_test_Y, axis=1)
modeloutputY_labelindices = np.argmax(predicted_Y, axis=1)
# Per-row column indices ordered by ascending model output.
sorted_index_matrix = np.argsort(predicted_Y, axis=1)

In [41]:
# Display the label index of every test row (argmax over test_dataset.Y).
testY_labelindices

array([0, 2, 0, 0, 1, 2, 1, 0, 1, 0, 1, 0, 0, 1, 2, 2, 1, 1, 0, 1, 2, 0,
       0, 2, 1, 1, 0, 2, 0, 0, 0, 0, 2, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       2, 1, 1, 1, 0, 2, 0, 0, 2, 1, 1, 0, 0, 2, 1, 0, 2, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 2, 1, 1, 2, 0, 1, 1, 0, 1, 0, 0, 1, 0, 2, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 2, 2, 0, 0, 2, 1, 1, 2, 1])

In [42]:
# Display the predicted label index of every test row (argmax over predicted_Y).
modeloutputY_labelindices

array([0, 2, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1])

In [52]:
# Count the test rows whose predicted label index equals the actual one.
(testY_labelindices == modeloutputY_labelindices).sum()

87

In [54]:
# Display the shape of the test label matrix.
test_dataset.Y.shape

(107, 3)