4 changes: 3 additions & 1 deletion .gitignore
@@ -5,4 +5,6 @@ __pycache__/
*.py[cod]
*$py.class
.pytest_cache
**/.DS_Store
**/.DS_Store
check_data/
*.ipynb_checkpoints
8 changes: 4 additions & 4 deletions requirements.txt
@@ -36,14 +36,14 @@ jedi==0.18.1
joblib==1.2.0
kiwisolver==1.4.4
lightgbm==3.3.2
llvmlite==0.38.1
llvmlite==0.39.0
matplotlib==3.6.0
matplotlib-inline==0.1.6
munkres==1.1.4
mypy-extensions==0.4.3
nannyml==0.6.3
numba==0.55.2
numpy==1.22.4
numba==0.56.3
numpy==1.23.4
offset==0.0.2
packaging==21.3
pandas==1.5.1
@@ -53,7 +53,7 @@ patsy==0.5.3
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.2.0
pip==22.2.2
pip==22.3
platformdirs==2.5.2
plotly==5.10.0
pluggy==1.0.0
Empty file added src/analytics/__init__.py
Empty file.
11 changes: 11 additions & 0 deletions src/analytics/data/testing/classification_test_data.csv
@@ -0,0 +1,11 @@
y_testing_binary,y_prediction_binary,y_testing_multi,y_prediction_multi
0,0,0,1
0,0,2,2
0,1,0,0
1,1,1,1
0,0,2,0
1,0,1,1
1,0,1,0
1,1,2,2
0,1,0,1
1,1,1,1
11 changes: 11 additions & 0 deletions src/analytics/data/testing/metrics_test_data.csv
@@ -0,0 +1,11 @@
num1,num2,num3,cat1,cat2
15.0,,0,Cat,True
40.0,0.25,2,Dog,False
200.0,1.456,0,Cat,True
,45.896,1,Dog,False
60.0,2.67,2,,
48.0,9.748,1,Dog,False
1000.0,,1,Cat,True
43.0,1.67,2,Dog,False
1.0,0.00054,0,Cat,True
0.0,12.1,1,Dog,
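For context, the two CSVs above are small fixtures for the new metrics code: one holds binary and multiclass label/prediction pairs, the other numeric columns with missing values plus two categoricals. A minimal sketch of loading them with pandas (the paths assume the repository root as the working directory; this loader is illustrative, not part of the PR):

    import pandas as pd

    # Classification fixture: binary and multiclass label/prediction columns.
    classification_df = pd.read_csv(
        "src/analytics/data/testing/classification_test_data.csv"
    )

    # Metrics fixture: numeric columns with gaps plus two categoricals.
    metrics_df = pd.read_csv("src/analytics/data/testing/metrics_test_data.csv")

    print(classification_df.columns.tolist())
    print(metrics_df.isna().sum())  # num1, num2, cat1 and cat2 each contain NaNs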
106 changes: 106 additions & 0 deletions src/analytics/metrics/functions.py
@@ -0,0 +1,106 @@
from sklearn.metrics import multilabel_confusion_matrix
import pandas as pd
from typing import Dict, Union


def format_feature_metrics(
    missing_count: Dict[str, int],
    non_missing_count: Dict[str, int],
    mean: Dict[str, float],
    minimum: Dict[str, float],
    maximum: Dict[str, float],
    total: Dict[str, float],
    standard_deviation: Dict[str, float],
    variance: Dict[str, float],
) -> Dict[str, Dict[str, Union[int, float]]]:
    """Assemble the per-feature metrics into a single dictionary."""
    # "total" avoids shadowing the built-in sum(); the output key stays "sum".
    formatted_metrics = {
        "missing_count": missing_count,
        "non_missing_count": non_missing_count,
        "mean": mean,
        "minimum": minimum,
        "maximum": maximum,
        "sum": total,
        "standard_deviation": standard_deviation,
        "variance": variance,
    }

    return formatted_metrics


def format_evaluation_metrics_binary(
    accuracy: float,
    precision: float,
    recall: float,
    f1: float,
    tn: int,
    fp: int,
    fn: int,
    tp: int,
) -> Dict[str, Union[int, float]]:
    """Assemble the binary classification metrics into a single dictionary."""
    formatted_metrics_for_binary = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "true_negative": tn,
        "false_positive": fp,
        "false_negative": fn,
        "true_positive": tp,
    }

    return formatted_metrics_for_binary


def format_evaluation_metrics_multiple(
    accuracy: float,
    precision_statistics: Dict[str, float],
    recall_statistics: Dict[str, float],
    f1_statistics: Dict[str, float],
    conf_matrix: Dict[str, Dict[str, int]],
) -> Dict[str, Union[float, Dict[str, Union[int, float]]]]:
    """Assemble the multiclass classification metrics into a single dictionary."""
    formatted_metrics_for_multiple = {
        "accuracy": accuracy,
        "precision_statistics": precision_statistics,
        "recall_statistics": recall_statistics,
        "f1_statistics": f1_statistics,
        "multiple_confusion_matrix": conf_matrix,
    }

    return formatted_metrics_for_multiple


def confusion_for_multiclass(
    test_set: pd.DataFrame, prediction_set: pd.DataFrame
) -> Dict[str, Dict[str, int]]:
    """
    Takes two multiclass classification label sets and calculates the
    per-class confusion matrix outputs tn, fp, fn, tp.

    Parameters
    ----------
    test_set : pd.DataFrame
        Multiclass ground truth labels.

    prediction_set : pd.DataFrame
        Multiclass predicted labels.

    Returns
    -------
    mult_dict : Dict
        One entry per class ("class0", "class1", ...), each holding the
        four confusion-matrix counts for that class.

    """
    cm = multilabel_confusion_matrix(test_set, prediction_set)
    mult_dict = {}
    for class_key, class_matrix in enumerate(cm):
        tn, fp, fn, tp = class_matrix.ravel()
        mult_dict[f"class{class_key}"] = {
            "true_negative": tn,
            "false_positive": fp,
            "false_negative": fn,
            "true_positive": tp,
        }
    return mult_dict
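As a quick illustration of confusion_for_multiclass, the sketch below runs it against the multiclass columns of the classification fixture added above (an assumed usage, not code from this PR; it presumes the src package is importable from the repository root):

    import pandas as pd
    from src.analytics.metrics.functions import confusion_for_multiclass

    df = pd.read_csv("src/analytics/data/testing/classification_test_data.csv")
    per_class = confusion_for_multiclass(
        df["y_testing_multi"], df["y_prediction_multi"]
    )
    # One 2x2 breakdown per class label, keyed "class0", "class1", "class2".
    print(per_class["class0"])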

179 changes: 179 additions & 0 deletions src/analytics/metrics/pipelines.py
@@ -0,0 +1,179 @@
import pandas as pd
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from src.analytics.metrics.functions import (
    confusion_for_multiclass,
    format_evaluation_metrics_binary,
    format_evaluation_metrics_multiple,
    format_feature_metrics,
)
from typing import Dict, Union


def create_feature_metrics_pipeline(
    dataset: pd.DataFrame,
) -> Dict[str, Dict[str, Union[int, float]]]:
    """
    Basic feature metrics calculation.

    Calculates the basic metrics of a given dataset.

    Parameters
    ----------
    dataset : pd.DataFrame
        Given dataset for the calculation of metrics

    Returns
    -------
    feature_metrics : Dict

        The returned metrics are:
            missing_count,
            non_missing_count,
            mean,
            minimum,
            maximum,
            sum,
            standard_deviation,
            variance

    """
    missing_count = dataset.isna().sum().to_dict()
    non_missing_count = dataset.notna().sum().to_dict()
    mean = dataset.mean(numeric_only=True).to_dict()
    minimum = dataset.min(numeric_only=True).to_dict()
    maximum = dataset.max(numeric_only=True).to_dict()
    # "total" avoids shadowing the built-in sum().
    total = dataset.sum(numeric_only=True).to_dict()
    standard_deviation = dataset.std(numeric_only=True).to_dict()
    variance = dataset.var(numeric_only=True).to_dict()

    return format_feature_metrics(
        missing_count,
        non_missing_count,
        mean,
        minimum,
        maximum,
        total,
        standard_deviation,
        variance,
    )


def create_binary_classification_evaluation_metrics_pipeline(
    test_set: pd.DataFrame, prediction_set: pd.DataFrame
) -> Dict[str, Union[int, float]]:
    """
    Binary classification evaluation metrics.

    Calculates the evaluation metrics for binary classification
    given two datasets.

    Parameters
    ----------
    test_set : pd.DataFrame
        Given ground truth dataset

    prediction_set : pd.DataFrame
        Given predictions dataset

    Returns
    -------
    evaluation_metrics : Dict

        The returned metrics are:
            accuracy,
            precision,
            recall,
            f1,
            tn,
            fp,
            fn,
            tp

    """
    accuracy = metrics.accuracy_score(test_set, prediction_set)
    precision = metrics.precision_score(test_set, prediction_set)
    recall = metrics.recall_score(test_set, prediction_set)
    f1 = metrics.f1_score(test_set, prediction_set)
    tn, fp, fn, tp = confusion_matrix(test_set, prediction_set).ravel()

    return format_evaluation_metrics_binary(
        accuracy, precision, recall, f1, tn, fp, fn, tp
    )


def create_multiple_classification_evaluation_metrics_pipeline(
    test_set: pd.DataFrame, prediction_set: pd.DataFrame
) -> Dict[str, Union[float, Dict[str, Union[int, float]]]]:
    """
    Multiclass classification evaluation metrics.

    Calculates the evaluation metrics for multiclass classification
    given two datasets.

    Parameters
    ----------
    test_set : pd.DataFrame
        Given ground truth dataset

    prediction_set : pd.DataFrame
        Given predictions dataset

    Returns
    -------
    evaluation_metrics : Dict

        The returned metrics are:
            accuracy,
            precision_statistics (micro, macro, weighted),
            recall_statistics (micro, macro, weighted),
            f1_statistics (micro, macro, weighted),
            multiple_confusion_matrix (tn, fp, fn, tp per class)

    """
    accuracy = metrics.accuracy_score(test_set, prediction_set)

    micro_precision = metrics.precision_score(test_set, prediction_set, average="micro")
    macro_precision = metrics.precision_score(test_set, prediction_set, average="macro")
    weighted_precision = metrics.precision_score(
        test_set, prediction_set, average="weighted"
    )
    precision_statistics = {
        "micro": micro_precision,
        "macro": macro_precision,
        "weighted": weighted_precision,
    }

    micro_recall = metrics.recall_score(test_set, prediction_set, average="micro")
    macro_recall = metrics.recall_score(test_set, prediction_set, average="macro")
    weighted_recall = metrics.recall_score(test_set, prediction_set, average="weighted")
    recall_statistics = {
        "micro": micro_recall,
        "macro": macro_recall,
        "weighted": weighted_recall,
    }

    micro_f1 = metrics.f1_score(test_set, prediction_set, average="micro")
    macro_f1 = metrics.f1_score(test_set, prediction_set, average="macro")
    weighted_f1 = metrics.f1_score(test_set, prediction_set, average="weighted")
    f1_statistics = {"micro": micro_f1, "macro": macro_f1, "weighted": weighted_f1}

    conf_matrix = confusion_for_multiclass(test_set, prediction_set)

    return format_evaluation_metrics_multiple(
        accuracy, precision_statistics, recall_statistics, f1_statistics, conf_matrix
    )
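To show how the three pipelines fit together, a minimal end-to-end sketch using the fixtures from this PR (an illustrative usage, assuming the src package is importable from the repository root):

    import pandas as pd
    from src.analytics.metrics.pipelines import (
        create_feature_metrics_pipeline,
        create_binary_classification_evaluation_metrics_pipeline,
        create_multiple_classification_evaluation_metrics_pipeline,
    )

    features = pd.read_csv("src/analytics/data/testing/metrics_test_data.csv")
    labels = pd.read_csv("src/analytics/data/testing/classification_test_data.csv")

    # Per-feature summary statistics for the numeric columns.
    print(create_feature_metrics_pipeline(features)["mean"])

    # Binary metrics from the binary label/prediction pair.
    binary = create_binary_classification_evaluation_metrics_pipeline(
        labels["y_testing_binary"], labels["y_prediction_binary"]
    )
    print(binary["accuracy"], binary["f1"])

    # Multiclass metrics, including the per-class confusion breakdown.
    multi = create_multiple_classification_evaluation_metrics_pipeline(
        labels["y_testing_multi"], labels["y_prediction_multi"]
    )
    print(multi["precision_statistics"]["macro"])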
Empty file added src/analytics/tests/__init__.py
Empty file.