In [1]:
import pandas as pd
import numpy as np
import mlexpt
import json
import os

In [2]:
# Load the experiment configuration. The context manager closes the file
# handle as soon as the JSON has been parsed instead of leaving it open
# for the lifetime of the kernel.
with open('/Users/stephenhky/PyProjects/ml-experiment/configs/LR_twitter_expt_set0_pca400.json', 'r') as config_file:
    config = json.load(config_file)
# Bare last expression: display the parsed configuration.
config

{'model': {'qualitative_features': ['text'],
  'binary_features': [],
  'quantitative_features': [],
  'target': 'polarity',
  'embedding_schemes': {'text': {'algorithm': 'PCA', 'target_dim': 400}},
  'algorithm': 'LogisticRegression',
  'model_parameters': {'device': 'cpu', 'nb_epoch': 100}},
 'data': {'path': '/Users/stephenhky/Downloads/trainingandtestdata/testdata.json',
  'missing_value_filling': {},
  'torchdevice': 'cpu'},
 'train': {'cross_validation': True,
  'cv_nfold': 5,
  'heldout_fraction': 0.05,
  'persist_model': True,
  'model_path': '/Users/stephenhky/Downloads/trainingandtestdata/model'},
 'statistics': {'topN': 2,
  'compute_class_performance': True,
  'class_performances_excel': None}}

In [3]:
# Model section: feature columns by kind, embedding schemes, target column,
# algorithm name and its parameters. The feature lists and the embedding
# schemes are optional and fall back to empty containers.
model_section = config['model']
qual_features = model_section.get('qualitative_features', [])
binary_features = model_section.get('binary_features', [])
quant_features = model_section.get('quantitative_features', [])
dr_config = model_section.get('embedding_schemes', {})
labelcol = model_section['target']
algorithm = model_section['algorithm']
model_param = model_section['model_parameters']

# Train section: cross-validation setup and model persistence.
train_section = config['train']
do_cv = train_section['cross_validation']
cv_nfold = train_section['cv_nfold']
heldout_fraction = train_section['heldout_fraction']
to_persist_model = train_section['persist_model']
final_model_path = train_section['model_path']

# Data section: input path, missing-value defaults, and torch device.
data_section = config['data']
datapath = data_section['path']
missing_val_default = data_section['missing_value_filling']
data_device = data_section['torchdevice']

# Statistics section: topN is required, the class-performance options are optional.
stats_section = config['statistics']
topN = stats_section['topN']
to_compute_class_performances = stats_section.get('compute_class_performance', False)
class_performance_excel_file = stats_section.get('class_performances_excel', None)

In [4]:
from functools import partial
from mlexpt.data.adding_features import adding_no_features, convert_label_to_str
from mlexpt.experiment import add_multiple_features
from nltk import word_tokenize

def tokenize(datum):
    """Replace the ``'text'`` entry of a record with its word tokens.

    The record is modified in place: ``datum['text']`` is passed to
    ``word_tokenize`` and overwritten with the result.

    :param datum: mapping that has a ``'text'`` entry
    :return: the same ``datum`` object, after the replacement
    """
    raw_text = datum['text']
    datum['text'] = word_tokenize(raw_text)
    return datum

# Combine the per-record functions through add_multiple_features, in this order:
# adding_no_features, convert_label_to_str bound to the target column, and
# the tokenize function defined earlier in this cell.
label_to_str = partial(convert_label_to_str, label=labelcol)
feature_adder = add_multiple_features([adding_no_features, label_to_str, tokenize])

In [5]:
from mlexpt.data.dataload import process_data

# Value handed to process_data as nb_lines_per_tempfile.
lines_per_tempfile = 500

# Run the input file through process_data with the configured feature columns
# and the feature adder. Two values come back: an object whose .name is the
# temporary directory path shown below, and a count (presumably the number
# of records processed -- confirm against process_data).
tempdir, nbdata = process_data(
    datapath,
    qual_features, binary_features, quant_features,
    labelcol,
    feature_adder=feature_adder,
    nb_lines_per_tempfile=lines_per_tempfile,
    missing_val_default=missing_val_default,
)
print(tempdir.name, nbdata, sep='\n')

/var/folders/pq/_srx84qj30j_byd7sxppfg940000gn/T/tmp608n54mi
498


In [6]:
from mlexpt.data.dataload import iterate_json_files_directory
from mlexpt.utils.core import generate_columndict

# Build the label lookups by passing the target column as the only
# qualitative feature (no binary or quantitative features).
label_records = iterate_json_files_directory(tempdir.name)
label2idx, idx2label = generate_columndict(label_records, [labelcol], [], [])

# Strip the first len(labelcol)+1 characters from every key; presumably the
# keys are "<labelcol><separator><value>" so this leaves the bare label
# value -- confirm against generate_columndict.
prefix_length = len(labelcol) + 1
target_label_dict = {
    prefixed_label[prefix_length:]: index
    for prefixed_label, index in label2idx.items()
}

	Qualitative Feature: polarity, number of distinct of values: 3


In [7]:
from mlexpt.utils.embeddings import embed_features

# Read the records into a list, asking the reader (via columns_to_keep) for
# only the columns that have an embedding scheme configured, then fit the
# embeddings on that list.
embedding_columns = list(dr_config.keys())
embedding_records = list(
    iterate_json_files_directory(tempdir.name, columns_to_keep=embedding_columns)
)
dimred_dict = embed_features(dr_config, embedding_records)

	 Embedding Feature: text (PCA)
	Qualitative Feature: text, number of distinct of values: 2683


In [8]:
from mlexpt.utils.datatransform import generate_columndict_withembeddings

# Build the feature <-> index lookups from all records, the three feature
# lists, and the embedding results computed earlier.
all_records = iterate_json_files_directory(tempdir.name)
feature2idx, idx2feature = generate_columndict_withembeddings(
    all_records, qual_features, binary_features, quant_features, dimred_dict
)

In [9]:
from mlexpt.utils.datatransform import NumericallyPreparedDataset

In [10]:
from mlexpt.data.dataload import assign_partitions

# Compute partition assignments from the record count, the number of CV folds
# and the held-out fraction; the result is passed as assigned_partitions to
# both dataset constructors below.
# NOTE(review): no random seed is set anywhere in this notebook. If
# assign_partitions is randomized, the train/test split and every metric
# below will differ between runs -- confirm and seed if so.
partitions = assign_partitions(nbdata, cv_nfold, heldout_fraction)

In [11]:
# Training uses partitions 1 .. cv_nfold-1; partition 0 is excluded.
training_partitions = list(range(1, cv_nfold))

train_dataset = NumericallyPreparedDataset(
    iterate_json_files_directory(tempdir.name),
    feature2idx,
    qual_features,
    binary_features,
    quant_features,
    dimred_dict,
    labelcol,
    label2idx,
    assigned_partitions=partitions,
    interested_partitions=training_partitions,
    device=data_device,
)

In [12]:
from mlexpt.ml.classifiers.linear import MulticlassLogisticRegression

# fit is given numpy arrays: anything that is not already an ndarray is
# converted with .toarray() first.
train_X = train_dataset.X
if not isinstance(train_X, np.ndarray):
    train_X = train_X.toarray()

train_Y = train_dataset.Y
if not isinstance(train_Y, np.ndarray):
    train_Y = train_Y.toarray()

# Build the classifier from the configured model parameters and train it.
model = MulticlassLogisticRegression(**model_param)
model.fit(train_X, train_Y)

 11%|█         | 11/100 [00:00<00:00, 106.94it/s]

Logistic regression trained on: cpu


100%|██████████| 100/100 [00:00<00:00, 100.46it/s]


In [13]:
# Same arguments as the training dataset, except that only partition -1
# is selected.
evaluation_partitions = [-1]

test_dataset = NumericallyPreparedDataset(
    iterate_json_files_directory(tempdir.name),
    feature2idx,
    qual_features,
    binary_features,
    quant_features,
    dimred_dict,
    labelcol,
    label2idx,
    assigned_partitions=partitions,
    interested_partitions=evaluation_partitions,
    device=data_device,
)

In [14]:
# predict_proba is given a numpy array: convert with .toarray() when the
# test features are not already an ndarray.
test_X = test_dataset.X
if not isinstance(test_X, np.ndarray):
    test_X = test_X.toarray()

predicted_Y = model.predict_proba(test_X)

In [15]:
from mlexpt.metrics.statistics import extracting_stats_run

# The reference labels are passed as a numpy array: convert with .toarray()
# when they are not already an ndarray.
test_Y = test_dataset.Y
if not isinstance(test_Y, np.ndarray):
    test_Y = test_Y.toarray()

# extracting_stats_run returns four values: the overall metrics and three
# per-class result dictionaries (top, weighted, hit).
run_statistics = extracting_stats_run(predicted_Y, test_Y, target_label_dict, topN)
overall_performance, top_result_by_class, weighted_result_by_class, hit_result_by_class = run_statistics

  top_precision = top_confusion_matrix[idx, idx] / np.sum(top_confusion_matrix[idx, :])
  weighted_precision = weighted_confusion_matrix[idx, idx] / np.sum(weighted_confusion_matrix[idx, :])
  hit_precision = hit_confusion_matrix[idx, idx] / np.sum(weighted_confusion_matrix[idx, :])


In [16]:
# Display the overall metrics (first value returned by extracting_stats_run).
overall_performance

{'nb_testdata': 17,
 'nb_equals': 5,
 'nb_weighted_equals': 7.5,
 'nb_hits': 10,
 'top1_accuracy': 0.29411764705882354,
 'weighted_accuracy': 0.4411764705882353,
 'hit_accuracy': 0.5882352941176471,
 'topN': 2}

In [17]:
# Display the per-class recall/precision/f_score dictionary for the "top"
# results (second value returned by extracting_stats_run).
# NOTE(review): the nan entries in the recorded output appear alongside the
# division warnings from the statistics cell; presumably those classes were
# never predicted -- confirm.
top_result_by_class

{'4': {'recall': 1.0,
  'precision': 0.29411764705882354,
  'f_score': 0.45454545454545453},
 '2': {'recall': 0.0, 'precision': nan, 'f_score': nan},
 '0': {'recall': 0.0, 'precision': nan, 'f_score': nan}}

In [18]:
# Display the per-class recall/precision/f_score dictionary for the
# "weighted" results (third value returned by extracting_stats_run).
weighted_result_by_class

{'4': {'recall': 1.0,
  'precision': 0.29411764705882354,
  'f_score': 0.45454545454545453},
 '2': {'recall': 0.5,
  'precision': 0.29411764705882354,
  'f_score': 0.37037037037037035},
 '0': {'recall': 0.0, 'precision': nan, 'f_score': nan}}

In [19]:
from mlexpt.experiment import persist_model_files

# Respect the train.persist_model flag from the configuration (read into
# to_persist_model in the configuration cell): only write the model, the
# embedding results, the feature/label index lookups and the config to
# final_model_path when persistence was requested.
if to_persist_model:
    persist_model_files(final_model_path, model, dimred_dict, feature2idx, label2idx, config)