In [None]:
import warnings
from pathlib import Path
import pickle

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    LabelEncoder,
    MinMaxScaler,
    OrdinalEncoder,
)

from config.config import logger
from config.config import ARTIFACTS_DIR

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Load the raw dataset into a module-level frame as soon as this cell/module runs.
# NOTE(review): the functions visible in this cell all take `df` as a parameter
# rather than reading this global — confirm whether this eager read is still needed.
df = pd.read_csv("data/raw/data.csv")


def create_target(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the five failure-flag columns into one encoded target column.

    A new ``type_of_failure`` column is derived from the ``TWF``, ``HDF``,
    ``PWF``, ``OSF`` and ``RNF`` flag columns. When several flags are set on
    the same row the first one in that order wins; rows with no flag set are
    labelled ``"no failure"``. The five flag columns are then dropped and the
    string labels are converted to integers with ``LabelEncoder``.

    Args:
        df: Frame containing the five flag columns. It is modified in place.

    Returns:
        The same ``df`` object, with ``type_of_failure`` added and the flag
        columns removed.

    Raises:
        KeyError: If any of the five flag columns is missing.
    """
    failure_flags = ["TWF", "HDF", "PWF", "OSF", "RNF"]

    # np.select takes the first matching condition per row, which keeps the
    # TWF > HDF > PWF > OSF > RNF precedence of the original if/elif chain.
    # Being vectorised, it also works when no row has a failure at all and
    # when the index contains duplicate labels.
    conditions = [(df[flag] == 1).to_numpy() for flag in failure_flags]
    df["type_of_failure"] = np.select(conditions, failure_flags, default="no failure")

    df.drop(failure_flags, axis=1, inplace=True)

    encoder = LabelEncoder()
    df["type_of_failure"] = encoder.fit_transform(df["type_of_failure"])
    logger.info("Target variable created")
    return df


def convert_to_celsius(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the identifier columns and replace the Kelvin columns with Celsius ones.

    ``UDI`` and ``Product ID`` are removed. Each ``[K]`` temperature column is
    shifted by 273.15 into a new ``[c]`` column appended to the frame, after
    which the ``[K]`` columns are removed.

    Args:
        df: Frame holding the identifier and Kelvin columns; modified in place.

    Returns:
        The same ``df`` object after modification.
    """
    kelvin_offset = 273.15
    kelvin_to_celsius = {
        "Air temperature [K]": "Air temperature [c]",
        "Process temperature [K]": "Process temperature [c]",
    }

    df.drop(columns=["UDI", "Product ID"], inplace=True)

    # Add both Celsius columns first, then remove the Kelvin ones in one call.
    for kelvin_col, celsius_col in kelvin_to_celsius.items():
        df[celsius_col] = df[kelvin_col] - kelvin_offset
    df.drop(columns=list(kelvin_to_celsius), inplace=True)

    logger.info("Temperature converted to celsius")
    return df


def ordinal_encoding(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``Type`` column with its ordinal code.

    The encoder is configured with the explicit category order
    ``L`` < ``M`` < ``H`` and is fitted on the frame it transforms.

    Args:
        df: Frame with a ``Type`` column; modified in place.

    Returns:
        The same ``df`` object with ``Type`` overwritten by the encoded values.
    """
    level_order = ["L", "M", "H"]
    type_encoder = OrdinalEncoder(categories=[level_order])

    # The encoder expects 2-D input, hence the double-bracket column selection.
    encoded_type = type_encoder.fit_transform(df[["Type"]])
    df["Type"] = encoded_type

    logger.info("Type encoded")
    return df


def feature_scaling(df: pd.DataFrame) -> pd.DataFrame:
    """Min-max scale the numeric feature columns and persist the fitted scaler.

    The five columns listed in ``scale_cols`` are scaled with a freshly fitted
    ``MinMaxScaler``. The fitted scaler is pickled to
    ``ARTIFACTS_DIR/scaler.pkl``.

    Args:
        df: Frame containing the five columns to scale. It is not modified.

    Returns:
        A new frame: the unscaled columns of ``df`` followed by the scaled
        columns, with the same index as ``df``.

    Raises:
        KeyError: If any column in ``scale_cols`` is missing from ``df``.
    """
    scale_cols = [
        "Rotational speed [rpm]",
        "Torque [Nm]",
        "Tool wear [min]",
        "Air temperature [c]",
        "Process temperature [c]",
    ]

    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(df[scale_cols])

    # Persist the fitted scaler alongside the other artifacts.
    with open(Path(ARTIFACTS_DIR, "scaler.pkl"), "wb") as f:
        pickle.dump(scaler, f)

    # Reuse the input index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and inject NaNs) whenever ``df`` has
    # been filtered, shuffled or otherwise does not carry a 0..n-1 index.
    df_scaled = pd.DataFrame(scaled_values, columns=scale_cols, index=df.index)

    # Drop on a copy so the caller's frame is not left half-stripped.
    df_scaled = pd.concat([df.drop(columns=scale_cols), df_scaled], axis=1)
    logger.info("Features scaled")
    return df_scaled


def sampling(df: pd.DataFrame, random_state=42) -> pd.DataFrame:
    """Oversample the frame with SMOTE using ``type_of_failure`` as the label.

    Args:
        df: Frame with a ``type_of_failure`` column; every other column is
            passed to SMOTE as a feature.
        random_state: Seed forwarded to ``SMOTE`` so repeated runs yield the
            same resampled data. Pass ``None`` for unseeded behaviour.

    Returns:
        A new frame with the resampled feature columns followed by the
        resampled ``type_of_failure`` column.
    """
    features = df.drop(columns=["type_of_failure"])
    target = df["type_of_failure"]

    # Seeded so the synthetic rows are reproducible between runs.
    oversample = SMOTE(random_state=random_state)
    features_resampled, target_resampled = oversample.fit_resample(features, target)

    sampled_df = pd.concat([features_resampled, target_resampled], axis=1)
    logger.info("Data sampled")
    return sampled_df


# Column groups handed to the individual transformers below.
target_cols = ["TWF", "HDF", "PWF", "OSF", "RNF"]
celsius_cols = ["UDI", "Product ID", "Air temperature [K]", "Process temperature [K]"]
categorical_cols = ["Type"]

# Each helper function is wrapped in a FunctionTransformer and applied to its
# own column group; columns not named in any group are passed through.
feature_transformer = ColumnTransformer(
    transformers=[
        ("create_target", FunctionTransformer(func=create_target), target_cols),
        ("convert_to_celsius", FunctionTransformer(func=convert_to_celsius), celsius_cols),
        ("ordinal_encoding", FunctionTransformer(func=ordinal_encoding), categorical_cols),
    ],
    remainder="passthrough",
)

# Min-max scaling selected by position.
# NOTE(review): the indices refer to column positions in whatever the previous
# step outputs — confirm they still point at the intended numeric features.
scaling_transformer = ColumnTransformer(
    transformers=[
        ("feature_scaling", MinMaxScaler(), [1, 2, 4, 5, 6]),
    ],
    remainder="passthrough",
)


def preprocess(df):
    """Run the transformer pipeline, oversample with SMOTE and write the result to CSV.

    The frame is pushed through ``feature_transformer`` and then
    ``scaling_transformer``. Column 5 of the transformed output is used as the
    label and every other column as a feature.
    NOTE(review): the label is chosen by position — confirm column 5 is the target.

    The resampled features and label are concatenated and written to
    ``data/processed/proc_data.csv``. Nothing is returned.

    Args:
        df: Raw input frame accepted by ``feature_transformer``.
    """
    steps = [
        ("transformer", feature_transformer),
        ("scaling_transformer", scaling_transformer),
    ]
    transformed = pd.DataFrame(Pipeline(steps=steps).fit_transform(df))

    label_position = 5
    features = transformed.drop(transformed.columns[label_position], axis=1)
    labels = transformed[label_position]

    resampler = SMOTE(sampling_strategy="auto")
    features_resampled, labels_resampled = resampler.fit_resample(features, labels)

    resampled = pd.concat([features_resampled, labels_resampled], axis=1)
    resampled.to_csv("data/processed/proc_data.csv")


In [1]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path
from config.config import ARTIFACTS_DIR, logger

def _load_artifact(filename):
    """Unpickle and return the object stored at ``ARTIFACTS_DIR/filename``."""
    # NOTE: pickle.load executes arbitrary code and is only reliable when the
    # file was written by the same library versions; load only trusted,
    # locally produced artifacts.
    with open(Path(ARTIFACTS_DIR, filename), "rb") as f:
        return pickle.load(f)


def prediction(type, rpm, torque, tool_wear, air_temp, process_temp):
    """Predict whether a machine fails and which failure type applies.

    Loads ``model1.pkl``, ``model2.pkl`` and ``scaler.pkl`` from
    ``ARTIFACTS_DIR`` on every call, scales the five numeric inputs with the
    stored scaler and feeds ``[type, rpm, torque, tool_wear, air_temp,
    process_temp]`` to both models. The two results are printed and returned.

    Args:
        type: ``'Low'``, ``'Medium'`` or ``'High'`` (mapped to 0, 1, 2), or
            any value ``float()`` accepts, which is used as-is.
        rpm, torque, tool_wear, air_temp, process_temp: Unscaled numeric
            inputs, passed to the scaler in this order.

    Returns:
        Tuple ``(result1, result2)``: the label derived from ``model1`` and
        the failure-type label derived from ``model2``.

    Raises:
        ValueError: If ``type`` cannot be converted to a number, or if
            ``model1`` returns a label other than 0 or 1.
        KeyError: If ``model2`` returns a label outside 0-5.
    """
    model1 = _load_artifact("model1.pkl")
    model2 = _load_artifact("model2.pkl")

    # type preprocessing: known names map to their codes, anything else is
    # handed to float() unchanged.
    type_codes = {"Low": 0, "Medium": 1, "High": 2}
    try:
        type_value = float(type_codes.get(type, type))
    except ValueError as exc:
        raise ValueError(
            f"type must be 'Low', 'Medium', 'High' or numeric, got {type!r}"
        ) from exc

    # min max scaler
    scaler = _load_artifact("scaler.pkl")
    scaled_input = scaler.transform([[rpm, torque, tool_wear, air_temp, process_temp]])
    rpm, torque, tool_wear, air_temp, process_temp = scaled_input[0]

    features = [[type_value, rpm, torque, tool_wear, air_temp, process_temp]]

    prediction1 = model1.predict(features)
    binary_labels = {0: "No Failure", 1: "Machine Failure"}
    result1 = binary_labels.get(prediction1[0])
    if result1 is None:
        # Previously this path surfaced later as an UnboundLocalError.
        raise ValueError(f"model1 returned an unexpected label: {prediction1[0]!r}")

    # Index the single prediction explicitly: int() on a whole 1-element array
    # relies on deprecated array-to-scalar conversion.
    prediction2 = int(model2.predict(features)[0])

    # NOTE(review): presumably mirrors the integer codes assigned to the
    # failure labels during training — confirm against the training pipeline.
    encoding = {
        0: "Heat Dissipation Failure",
        1: "Overstrain Failure",
        2: "Power Failure",
        3: "Random Failure",
        4: "Tool Wear Failure",
        5: "No Failure",
    }
    result2 = encoding[prediction2]

    print(result1, result2)

    return result1, result2

# Smoke-test call with one hand-picked set of unscaled inputs.
prediction(
    "Low",
    rpm=1412,
    torque=52.3,
    tool_wear=218,
    air_temp=25.15,
    process_temp=34.95,
)

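The error below occurs because the pickled model was saved with a different scikit-learn version than the one loading it; pickles are not guaranteed to load across versions. See https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations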


ValueError: node array from the pickle has an incompatible dtype:
- expected: {'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity', 'n_node_samples', 'weighted_n_node_samples', 'missing_go_to_left'], 'formats': ['<i8', '<i8', '<i8', '<f8', '<f8', '<i8', '<f8', 'u1'], 'offsets': [0, 8, 16, 24, 32, 40, 48, 56], 'itemsize': 64}
- got     : [('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]

In [2]:
import warnings
from pathlib import Path
import typer
import pickle
import json

import pandas as pd
from sklearn.model_selection import train_test_split

from config import config
from config.config import ARTIFACTS_DIR
from data import (
    convert_to_celsius,
    create_target,
    feature_scaling,
    ordinal_encoding,
    sampling,
)
from train import model1, model2
from src.eda import (
    setup,
    question_one,
    question_two,
    question_three,
    question_four,
    question_five,
    question_six
)

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")
# Typer application object. The @app.command() decorators and the app() entry
# point further down are currently commented out, so no command is registered.
app = typer.Typer()

def get_data():
    """Read ``data/raw/data.csv`` (relative to the working directory) into a DataFrame."""
    return pd.read_csv("data/raw/data.csv")

def eda(df):
    """Run the five EDA question functions and dump their results to JSON.

    ``setup`` is applied to the frame first; each ``question_*`` function is
    then called on the result in order, and the outputs are stored under the
    keys ``q1`` to ``q5`` in ``ARTIFACTS_DIR/eda.json``.

    Args:
        df: Frame passed to ``setup``.
    """
    prepared = setup(df)

    questions = (
        ("q1", question_one),
        ("q2", question_two),
        ("q3", question_three),
        ("q4", question_four),
        ("q5", question_five),
    )
    # Evaluated in declaration order, one call per question.
    answers = {key: ask(prepared) for key, ask in questions}

    with open(Path(ARTIFACTS_DIR, "eda.json"), "w+") as f:
        json.dump(answers, f)
    
# @app.command()
def preprocess():
    """Run every preprocessing stage on the raw CSV and save the result.

    Reads ``raw/data.csv`` under ``config.DATA_DIR``, applies the stages in
    order (target creation, Celsius conversion, ordinal encoding, feature
    scaling, oversampling) and writes ``processed/preprocessed.csv`` without
    the index.

    Returns:
        The fully preprocessed frame.
    """
    stages = (
        create_target,
        convert_to_celsius,
        ordinal_encoding,
        feature_scaling,
        sampling,
    )

    df = pd.read_csv(Path(config.DATA_DIR, "raw/data.csv"))
    # Each stage consumes the previous stage's output.
    for stage in stages:
        df = stage(df)

    df.to_csv(Path(config.DATA_DIR, "processed/preprocessed.csv"), index=False)
    return df

# @app.command()
def split_data():
    """Split the preprocessed CSV into stratified train and test CSV files.

    Reads ``processed/preprocessed.csv`` under ``config.DATA_DIR``, performs
    an 80/20 split stratified on ``type_of_failure`` with a fixed seed, and
    writes ``processed/train.csv`` and ``processed/test.csv`` without the index.
    """
    target = 'type_of_failure'
    df = pd.read_csv(Path(config.DATA_DIR, "processed/preprocessed.csv"))

    partitions = train_test_split(df, test_size=0.2, random_state=42, stratify=df[target])
    destinations = ("processed/train.csv", "processed/test.csv")

    # train_test_split returns (train, test), matching the destination order.
    for partition, destination in zip(partitions, destinations):
        partition.to_csv(Path(config.DATA_DIR, destination), index=False)


# @app.command()
def train():
    """Train both models on the training split and persist models and metrics.

    Reads ``processed/train.csv`` under ``config.DATA_DIR``, then runs
    ``model1`` followed by ``model2`` on it. For each, the scores, best model
    and classification report are printed, the best model is pickled and the
    metrics are written as JSON, all under ``ARTIFACTS_DIR``.
    """
    df = pd.read_csv(Path(config.DATA_DIR, "processed/train.csv"))
    _train_and_persist(model1, df, "model1.pkl", "model1_metrics.json")
    _train_and_persist(model2, df, "model2.pkl", "model2_metrics.json")


def _train_and_persist(trainer, df, model_file, metrics_file):
    """Run one trainer, print its results and save the model and metrics files."""
    scores_df, best_model, best_model_name, report = trainer(df)

    for heading, value in (
        ("Scores", scores_df),
        ("Best model", best_model),
        ("Classification report", report),
    ):
        print(heading)
        print(value)

    with open(Path(ARTIFACTS_DIR, model_file), "wb") as f:
        pickle.dump(best_model, f)

    # Metrics are stored as a list: scores JSON, report JSON, best model name.
    model_metrics = [scores_df.to_json(), report.to_json(), best_model_name]
    with open(Path(ARTIFACTS_DIR, metrics_file), "w+") as f:
        json.dump(model_metrics, f)

# @app.command()
# def generate_reports():
#     data_report()
#     model1_report()
#     model2_report()
    

# if __name__ == "__main__":
#     app()


# Stages are toggled by (un)commenting the calls below; only the EDA step is active.
# get_data()
eda(df=get_data())
# df = preprocess()
# split_data()
# print(df)
# train()


ImportError: cannot import name 'convert_to_celsius' from 'data' (unknown location)