# PHM Data Challenge 2014

In [None]:
%matplotlib inline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from collections import Counter

## Data Loading

In [None]:
# Load the part-consumption training table. Later cells read its Asset, Part,
# Time, Quantity and Reason columns.
consumption = pd.read_csv("../data/Train - Part Consumption.csv")

# Preview the first rows (rendered as the cell output in a notebook).
consumption.head()

In [None]:
# Load the usage training table. Later cells read its Asset and Use columns.
usage = pd.read_csv("../data/Train - Usage.csv")
# Preview the first rows.
usage.head()

In [None]:
# Load the failures training table. A later cell reads its Asset column.
failures = pd.read_csv("../data/Train - Failures.csv")
# Preview the first rows.
failures.head()

## Data Analysis

In [None]:
# Summary statistics of the consumption table (pandas `describe` defaults).
consumption.describe()

In [None]:
# Summary statistics of the usage table (pandas `describe` defaults).
usage.describe()

In [None]:
# Summary statistics of the failures table (pandas `describe` defaults).
failures.describe()

In [None]:
# Count how many consumption rows carry each Reason value; `most_common()`
# yields (value, count) pairs ordered from most to least frequent.
c = Counter(consumption["Reason"])
reason_count = c.most_common()
reason_count

In [None]:
# Count the failure rows per asset; `most_common()` yields (asset, count)
# pairs ordered from most to least frequent.
c = Counter(failures["Asset"])
failures_count = c.most_common()
failures_count

## Data Preprocessing

In [None]:
# Remove consumption rows whose Quantity is zero or negative, then renumber
# the rows. `reset_index` is called without drop=True on purpose: the old row
# labels end up in an "index" column that a later cell drops again.
indexes = consumption[consumption["Quantity"] <= 0].index
consumption.drop(index=indexes, inplace=True)
consumption.reset_index(inplace=True)

In [None]:
# Summary statistics of the Quantity column of the consumption table.
consumption.Quantity.describe()

## Article plots reproduction

In [None]:
# Distinct assets present in the consumption table, followed by their count.
assets_un = consumption["Asset"].unique()
print(assets_un, len(assets_un), sep="\n")

In [None]:
# Mean and standard deviation of the recorded Use values of every asset.
# Assets that have no row in `usage` are skipped, so both lists may be shorter
# than `assets_un`.
mean_usages, std_usages = [], []
for asset in tqdm(assets_un):
    asset_usages = usage.loc[usage["Asset"] == asset, "Use"].to_list()
    if not asset_usages:
        continue
    mean_usages.append(np.mean(asset_usages))
    std_usages.append(np.std(asset_usages))

# Express the means in units of 10^4, as announced by the x-axis label.
mean_usages = [value / 1e4 for value in mean_usages]

# Histogram of the rescaled per-asset means.
plt.figure(figsize=(14, 6))
plt.hist(mean_usages, bins=500)
plt.title("Histogram of assets usage mean", fontsize=15)
plt.xlabel("Mean of usage $(10^4)$", fontsize=15)
plt.ylabel("Frequency count", fontsize=15)
plt.xlim(2.5, 3.6)
plt.ylim(0, 100)
plt.grid(axis="y", alpha=0.75)
plt.show()

In [None]:
# Scatter plot of each asset's usage standard deviation against its usage mean.
# The x values are the means that were already divided by 1e4 (hence the
# 2.5-3.6 x-range), so the x label carries the same $(10^4)$ scale note as the
# mean histogram; `std_usages` is not rescaled.
plt.figure(figsize=(14, 6))
plt.plot(mean_usages, std_usages, "ro")
plt.xlim(2.5, 3.6)
plt.ylim(0, 1000)
plt.xlabel("Mean of usage $(10^4)$", fontsize=15)
plt.ylabel("Std of usage", fontsize=15)
plt.title("Assets usage clustering", fontsize=15)
plt.show()

In [None]:
# Recompute the per-asset statistics, this time storing the mean Use minus the
# asset's first listed Use value. Assets without rows in `usage` are skipped,
# so both lists may be shorter than `assets_un`.
mean_usages, std_usages = [], []
for asset in tqdm(assets_un):
    asset_usages = usage.loc[usage["Asset"] == asset, "Use"].to_list()
    if not asset_usages:
        continue
    mean_usages.append(np.mean(asset_usages) - asset_usages[0])
    std_usages.append(np.std(asset_usages))

# Histogram of the offset means.
plt.figure(figsize=(14, 6))
plt.hist(mean_usages, bins=500)
plt.title("Histogram of assets usage mean after first measure", fontsize=15)
plt.xlabel("Mean of usage", fontsize=15)
plt.ylabel("Frequency count", fontsize=15)
plt.xlim(0, 3000)
plt.grid(axis="y", alpha=0.75)
plt.show()

In [None]:
# Distinct parts present in the consumption table, followed by their count.
parts = consumption["Part"].unique()
print(parts, len(parts), sep="\n")

In [None]:
# Build the sorted set of distinct "<asset>_<part>" keys seen in the
# consumption table.
assets_cons, parts_cons = consumption.Asset, consumption.Part

assets_parts = np.unique(
    [asset + "_" + part for asset, part in zip(assets_cons, parts_cons)]
)

print(len(assets_parts))

In [None]:
# Summarise, for every (asset, part) pair, the estimated usage at the moments
# the part was consumed (mean, std and number of consumption events).
#
# `mean_usages` was filled while looping over `assets_un`, but only for assets
# that have at least one row in `usage`. Position i of `mean_usages` is
# therefore not necessarily asset i of `assets_un`: a positional lookup such as
# `mean_usages[list(assets_un).index(asset)]` hands every asset located after a
# skipped one the value of a different asset, and only the overflowing tail
# shows up as an IndexError. An explicit asset -> value mapping gives each
# asset its own value and skips assets without usage data deliberately.
assets_with_usage = set(usage.Asset.unique())
assets_in_usage_order = [a for a in assets_un if a in assets_with_usage]
if len(assets_in_usage_order) != len(mean_usages):
    raise ValueError(
        "mean_usages is out of sync with assets_un/usage; "
        "re-run the cell that computes mean_usages first"
    )
usage_by_asset = dict(zip(assets_in_usage_order, mean_usages))

parts_usages_mean = []
parts_usages_std = []
parts_usages_count = []
assets = []
parts = []
for asset_part in tqdm(assets_parts):
    asset, part = asset_part.split("_")
    if asset not in usage_by_asset:
        # No usage readings for this asset, so nothing can be estimated.
        continue

    consumptions_times = consumption.query("Asset == @asset and Part == @part")
    if len(consumptions_times) == 0:
        continue

    # 730 is kept from the original computation; presumably the number of time
    # units covered by the data (two years in days?) -- TODO confirm.
    usage_asset_mean_each_t = usage_by_asset[asset] / 730

    # Estimated usage at each consumption time: time * per-time-unit usage.
    usages_snapshots = [
        cons_time * usage_asset_mean_each_t for cons_time in consumptions_times.Time
    ]

    assets.append(asset)
    parts.append(part)
    parts_usages_mean.append(np.mean(usages_snapshots))
    parts_usages_std.append(np.std(usages_snapshots))
    parts_usages_count.append(len(usages_snapshots))

In [None]:
# Sanity check: the five parallel lists must have the same length.
print(
    *(
        len(column)
        for column in (
            assets,
            parts,
            parts_usages_mean,
            parts_usages_std,
            parts_usages_count,
        )
    )
)

# Assemble the per (asset, part) summary table, save it and preview it.
data = dict(
    asset=assets,
    part=parts,
    consumption_usage_mean=parts_usages_mean,
    consumption_usage_std=parts_usages_std,
    consumption_counts=parts_usages_count,
)

parts_usage_df = pd.DataFrame(data)
parts_usage_df.to_csv("../data/parts_usages.csv", index=False)
parts_usage_df.head(10)

In [None]:
# Print the summary rows of each part, one sub-table per distinct part.
parts_un = parts_usage_df["part"].unique()

for part in parts_un:
    rows_of_part = parts_usage_df[parts_usage_df["part"] == part]
    print(rows_of_part)

## Training Data construction

In [None]:
# Start the training table from the consumption table, without the "index"
# column that the earlier `reset_index` call added. `drop` returns a new frame,
# so `consumption` itself is left untouched.
train_df = consumption.drop(columns="index")

In [None]:
# (rows, columns) of the training table.
train_df.shape

In [None]:
# Add the failure-related columns with neutral placeholder values
# (False for the flag, 0 for the numeric fields) on every row.
for column, fill in (
    ("Failure", False),
    ("Time_failure", 0),
    ("Time_diff", 0),
    ("Usage_on_failure", 0),
):
    train_df[column] = [fill] * len(train_df)

In [None]:
# Preview the training table with the new placeholder columns.
train_df.head()

In [None]:
# Disabled feature-construction loop, kept for reference. For every
# (asset, failure time) row of `failures` it:
#   1. selects the consumption rows of that asset with Time <= failure time;
#   2. takes the usage rows just before and just after the failure time and
#      linearly interpolates a usage value at the failure time (falling back to
#      the "next" value when both rows share the same time, and to 0 when
#      either side is missing) -- the iloc positions 1 and 2 are presumably the
#      Time and Use columns of `usage`; confirm;
#   3. walks the selected consumption rows in reverse order and, for the first
#      row met of each part, sets Failure/Time_failure/Time_diff/
#      Usage_on_failure in `train_df`.
# The active statement at the end replaces `train_df` with the contents of
# ../data/train_features.csv (presumably the saved output of this loop --
# confirm).

# for fail_asset, fail_time in tqdm(list(failures.itertuples(index=False, name=None))):
#     possible_cons = consumption.query("Time <= @fail_time and Asset == @fail_asset")

#     usage_next = usage.query("Time >= @fail_time and Asset == @fail_asset").head(1)
#     usage_prev = usage.query("Time <= @fail_time and Asset == @fail_asset").tail(1)

#     usage_failure_value = 0
#     if len(usage_next) > 0 and len(usage_prev) > 0:
#         usage_prev_time = usage_prev.iloc[0, 1]
#         usage_prev_value = usage_prev.iloc[0, 2]
#         usage_next_time = usage_next.iloc[0, 1]
#         usage_next_value = usage_next.iloc[0, 2]

#         if usage_next_time - usage_prev_time > 0:
#             usage_failure_value = fail_time - usage_prev_time
#             usage_failure_value /= usage_next_time - usage_prev_time
#             usage_failure_value *= usage_next_value - usage_prev_value
#             usage_failure_value += usage_prev_value
#         else:
#             usage_failure_value = usage_next_value

#     inserted_parts = []
#     for index, cons_part in possible_cons[::-1].iterrows():
#         if cons_part.Part not in inserted_parts:
#             train_df.loc[index, "Failure"] = True
#             train_df.loc[index, "Time_failure"] = fail_time
#             train_df.loc[index, "Time_diff"] = (
#                 fail_time - consumption.loc[index, "Time"]
#             )
#             train_df.loc[index, "Usage_on_failure"] = usage_failure_value
#             inserted_parts.append(cons_part.Part)

train_df = pd.read_csv("../data/train_features.csv")

In [None]:
# Show only the rows flagged as failures.
train_df.query("Failure==True")

In [None]:
# Write the training table to the same CSV path it was loaded from above,
# without the row index.
train_df.to_csv("../data/train_features.csv", index=False)

The training set above does not really cover the other examples, i.e. the negative (non-failure) instances. So, instead of relying on the `Usage_on_failure` column, I'll simply measure the usage at the time given in each row's `Time` column.

In [None]:
# Disabled feature-construction loop, kept for reference. It drops the
# Time_failure, Time_diff and Usage_on_failure columns and, for every row of
# `train_df`, linearly interpolates the asset's usage at the row's Time between
# the usage rows just before and just after it (falling back to the "next"
# value when both rows share the same time, and to 0 when either side is
# missing), storing the result in a Usage_on_time column. The iloc positions 1
# and 2 are presumably the Time and Use columns of `usage` -- confirm.
# The active statement at the end replaces `train_df` with the contents of
# ../data/train_features_usage.csv (presumably the saved output of this loop --
# confirm).

# train_df.drop(["Time_failure", "Time_diff", "Usage_on_failure"], axis=1, inplace=True)

# for index, row in tqdm(train_df[["Asset", "Time"]].iterrows()):
#     time = row.Time
#     asset = row.Asset
#     usage_next = usage.query("Time >= @time and Asset == @asset").head(1)
#     usage_prev = usage.query("Time <= @time and Asset == @asset").tail(1)

#     usage_value = 0
#     if len(usage_next) > 0 and len(usage_prev) > 0:
#         usage_prev_time = usage_prev.iloc[0, 1]
#         usage_prev_value = usage_prev.iloc[0, 2]
#         usage_next_time = usage_next.iloc[0, 1]
#         usage_next_value = usage_next.iloc[0, 2]

#         if usage_next_time - usage_prev_time > 0:
#             usage_value = row.Time - usage_prev_time
#             usage_value /= usage_next_time - usage_prev_time
#             usage_value *= usage_next_value - usage_prev_value
#             usage_value += usage_prev_value
#         else:
#             usage_value = usage_next_value

#     train_df.loc[index, "Usage_on_time"] = usage_value

train_df = pd.read_csv("../data/train_features_usage.csv")

In [None]:
# Write the training table to the same CSV path it was loaded from above,
# without the row index.
train_df.to_csv("../data/train_features_usage.csv", index=False)