In [2]:
import os
import pickle
from glob import iglob
import json
from joblib import dump
from time import perf_counter
from multiprocessing import Pool

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.errors import ParserError
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, plot_confusion_matrix,
                             plot_roc_curve, precision_score, recall_score,
                             roc_auc_score)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from multimodel import MultiModel

In [2]:
# IPython automagic form of `%run`: executes ../graph_tool.py and loads the
# names it defines into this notebook's namespace.
# NOTE(review): nothing visible in this notebook references names from that
# script — confirm it is still needed.
run "../graph_tool.py"

In [26]:
# Development convenience: (re)load the autoreload extension and, in mode 1,
# re-import only the modules registered through %aimport before each cell runs.
# NOTE(review): the imports cell pulls MultiModel from `multimodel`, while the
# module registered here is `modeling` — confirm which module is meant to be
# auto-reloaded.
%reload_ext autoreload
%autoreload 1 
%aimport modeling 

In [4]:
# Build the list of target column names from the keys of the pickled
# condition-encoding mapping.
# NOTE: pickle.load executes arbitrary code on malicious input; this file is
# assumed to be a trusted project artefact.
with open('../features/cond_encoding.pkl', 'rb') as pkl_file:
    raw_labels = list(pickle.load(pkl_file).keys())

# Normalise each label: trim surrounding '/', '%' and blanks, split on
# whitespace, drop stand-alone "/" tokens and glue the rest with underscores.
target = []
for label in raw_labels:
    tokens = label.strip('/% ').rstrip().split()
    target.append("_".join(tok for tok in tokens if tok != "/"))

In [5]:
# Read the tab-separated, header-less profile file, naming its columns after
# the cleaned target labels.
y_data = pd.read_csv('../data/profile.txt', sep="\t", header=None, names=target)

# to_pickle does not create missing parent directories, so make sure the
# output folder exists before writing (needed on a fresh checkout).
os.makedirs("./target_variables", exist_ok=True)

# Persist every target column as its own pickle, one file per column name.
for col in y_data.columns:
    y_data[col].to_pickle(f"./target_variables/{col}.pkl")

In [6]:
# Discover feature-set pickles as (name, path) pairs, where name is the file
# name minus its 4-character ".pkl" suffix. The two helper pickles that live
# in the same folder are not feature sets and are left out.
features = [
    (stem, pkl_path)
    for stem, pkl_path in (
        (os.path.basename(found)[:-4], found)
        for found in iglob('../features/*.pkl')
    )
    if stem not in ["cond_encoding", "sensor_info"]
]

# Same (name, path) pairing for every pickled target variable.
target_vars = [
    (os.path.basename(found)[:-4], found)
    for found in iglob("./target_variables/*.pkl")
]

In [7]:
# One job per (target variable, feature set) combination, targets varying
# slowest. Each job is (feature pickle path, target pickle path,
# feature-set name, target name).
args_list = [
    (pkl, path, feature, name)
    for name, path in target_vars
    for feature, pkl in features
]

In [8]:
def pool_func(args):
    """Fit, persist and time one MultiModel job.

    Parameters
    ----------
    args : sequence of 4 items
        Passed positionally to ``MultiModel``. The last two items are the
        feature-set name and the target name, which are also used to build
        the output file names.

    Returns
    -------
    tuple
        ``(message, seconds)`` where ``message`` names the target/feature
        combination and ``seconds`` is the elapsed ``perf_counter`` time,
        covering construction, fitting and both file writes.

    Side effects
    ------------
    Writes ``../models/{target}_{feature}.pkl`` (joblib dump of the fitted
    instance) and ``../models/{target}_{feature}.txt`` (whatever
    ``print_m()`` returns, written with ``writelines``).
    """
    t_begin = perf_counter()
    feature_pkl, target_pkl, feature, target = args

    fitted = MultiModel(feature_pkl, target_pkl, feature, target)
    fitted.fit_model()

    dump(fitted, f"../models/{target}_{feature}.pkl")
    with open(f"../models/{target}_{feature}.txt", "w") as report:
        report.writelines(fitted.print_m())

    elapsed = perf_counter() - t_begin
    label = f"Target variable {target} with feature set {feature}"
    return (label, elapsed)

In [9]:
# Run the jobs sequentially; map() is lazy, so each job is reported as soon
# as it finishes.
for msg, duration in map(pool_func, args_list):
    print(f"{msg} took {duration} to compute.")

Target variable Cooler_Condition with feature set avg_3rds took 91.6036569 to compute.
Target variable Cooler_Condition with feature set avg_change took 26.36430710000002 to compute.
Target variable Cooler_Condition with feature set cycle_mean took 22.932991799999968 to compute.
Target variable Cooler_Condition with feature set dx_3rds took 53.131338400000004 to compute.
Target variable Cooler_Condition with feature set std_3rds took 49.67512549999998 to compute.
Target variable Cooler_Condition with feature set std_dev took 26.76164060000002 to compute.
Target variable Hydraulic_accumulator_bar with feature set avg_3rds took 104.21628130000005 to compute.
Target variable Hydraulic_accumulator_bar with feature set avg_change took 40.48629089999997 to compute.




Target variable Hydraulic_accumulator_bar with feature set cycle_mean took 50.6219433 to compute.
Target variable Hydraulic_accumulator_bar with feature set dx_3rds took 84.14763970000001 to compute.
Target variable Hydraulic_accumulator_bar with feature set std_3rds took 96.81083339999998 to compute.
Target variable Hydraulic_accumulator_bar with feature set std_dev took 53.431290500000046 to compute.
Target variable Internal_pump_leakage with feature set avg_3rds took 52.994415799999956 to compute.
Target variable Internal_pump_leakage with feature set avg_change took 39.4562919 to compute.




Target variable Internal_pump_leakage with feature set cycle_mean took 31.33738429999994 to compute.
Target variable Internal_pump_leakage with feature set dx_3rds took 78.0019873 to compute.
Target variable Internal_pump_leakage with feature set std_3rds took 82.62968189999992 to compute.
Target variable Internal_pump_leakage with feature set std_dev took 36.898608600000216 to compute.
Target variable stable_flag with feature set avg_3rds took 38.03125849999992 to compute.
Target variable stable_flag with feature set avg_change took 20.691381200000023 to compute.
Target variable stable_flag with feature set cycle_mean took 22.029792399999906 to compute.
Target variable stable_flag with feature set dx_3rds took 35.419987900000024 to compute.
Target variable stable_flag with feature set std_3rds took 39.72152260000007 to compute.
Target variable stable_flag with feature set std_dev took 22.223691299999928 to compute.
Target variable Valve_Condition with feature set avg_3rds took 97.1785



Target variable Valve_Condition with feature set cycle_mean took 52.222709800000075 to compute.
Target variable Valve_Condition with feature set dx_3rds took 102.35675130000004 to compute.
Target variable Valve_Condition with feature set std_3rds took 118.73700860000008 to compute.
Target variable Valve_Condition with feature set std_dev took 49.62918489999993 to compute.


In [24]:
def summarize_log(lines):
    """Condense the lines of one model log into summary lines.

    The result always starts with a 60-dash separator. A line counter is
    then kept that is NOT advanced for lines containing "{":

    * counter == 0: keep the text after the last "-" of the line;
    * counter == 1: keep the text before the first "L" of the line;
    * lines containing "{" are otherwise skipped;
    * lines starting with "K" or "X" are kept in full.

    Every kept fragment gets an extra newline appended.

    Parameters
    ----------
    lines : iterable of str
        Log lines, typically from ``file.readlines()``.

    Returns
    -------
    list of str
    """
    summary = ["-" * 60 + "\n"]
    index = 0
    for line in lines:
        if index == 0:
            summary.append(line.split("-")[-1] + "\n")
        if index == 1:
            summary.append(line.split("L")[0] + "\n")
        if "{" in line:
            # Skipped lines deliberately do not advance the counter.
            continue
        # startswith (rather than line[0]) cannot fail on an empty string.
        if line.startswith(("K", "X")):
            test_line = f"{line}\n"
            summary.append(test_line)
        index += 1
    return summary


test_summary = []
for text in iglob("../models/*.txt"):
    # The combined summary is written into this same folder by a later cell;
    # skip it so a re-run does not parse the previous summary as a model log.
    if os.path.basename(text) == "test_summary.txt":
        continue
    with open(text, 'r') as log:
        test_summary.extend(summarize_log(log.readlines()))

In [25]:
# Write the combined summary to disk. The handle is deliberately not called
# `sum`: in a notebook that name would stay bound after the cell and shadow
# the builtin sum() for the rest of the kernel session.
with open('../models/test_summary.txt', 'w') as summary_file:
    summary_file.writelines(test_summary)