In [1]:
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 21 14:42:02 2024

@author: Jeyak
"""
import numpy as np
import pandas as pd
import datetime as dt
# from math import abs
from adtk.detector import ThresholdAD
from fastapi.responses import JSONResponse
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, train_test_split

# import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv1D, MaxPooling1D, TimeDistributed, Flatten, GRU, Dense

from app import db
from app.config import ModelConfig
from app.helper import load_model, save_model, get_var_name


models = {}
now = dt.datetime.now()

disposition_filepath = './data/ML_Training_Data.csv'
disposition_dataset = pd.read_csv(disposition_filepath)
# Rename all "Column" name to lowercase.
disposition_dataset.columns = map(str.lower, disposition_dataset.columns)

# ---
# anomaly_filepath = './data/cloudutilization_13082024.csv'
# anomaly_dataset = pd.read_csv(anomaly_filepath)
# anomaly_dataset.set_index('usagedate', drop=True, inplace=True)
# anomaly_dataset['servicename_id'] = anomaly_dataset['servicename'].apply(lambda x: get_var_name(x))

# ---
seasonal_data = pd.read_csv(r'./data/seasonal_data.csv')
months = ('jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec')

def update_current_time():
    global now
    now = dt.datetime.now()
    return now

def fn(x):
    x = x.lower().split('-')
    return now.year, months.index(x[0])+1, int(x[1])

seasonal_data['StartDate'] = seasonal_data['StartDate'].apply(lambda x: dt.datetime(*fn(x)))
seasonal_data['EndDate'] = seasonal_data['EndDate'].apply(lambda x: dt.datetime(*fn(x)))
seasonal_data['MinThresholdValue'] = seasonal_data['MinThresholdValue'].astype(np.float64)
seasonal_data['MaxThresholdValue'] = seasonal_data['MaxThresholdValue'].astype(np.float64)


# Define window size
window_size = ModelConfig.WINDOW_SIZE


def predict_disposition(model, label_encoders, df):
    # Label Encoding
    for column in df.columns:
        df[column] = LabelEncoder().fit_transform(df[column])

    # Predict
    y_pred = model.predict(df)
    df['dispositionname'] = label_encoders['disposition'].inverse_transform([y_pred])[0]
    return df


def get_model_id(model_name, service_name=None):
    model_name = get_var_name(model_name)
    if service_name is None:
        return model_name
    return f'{model_name}_{get_var_name(service_name)}' 

def make_Xy(data, n_input, n_out) :
    data = data.values
    X, y = list(), list()
    in_start = 0
    for _ in range(len(data)):
        in_end = in_start + n_input
        out_end = in_end + n_out
        if out_end <= len(data):
            x_input = data[in_start:in_end, : ]
            x_input = x_input.reshape((len(x_input), -1))
            X.append(x_input)
            y.append(data[in_end:out_end, : ])
        in_start += 1
    X = np.array(X)
    y = np.array(y)[ : , : , -1:]
    return X, y

def min_max_scale(data, scalers={}, inv=False):
    if(len(scalers)==0) :
        for i in range(data.shape[2]):
            scalers[i] = MinMaxScaler(feature_range=(0, 1))
            scalers[i].fit_transform(data[:, :, i])
            data[:, :, i] = scalers[i].transform(data[:, :, i])
        return data, scalers
    else :
        for i in range(data.shape[2]):
            if inv :
                data[:, :, i] = scalers[i].inverse_transform(data[:, :, i])
            else :
                data[:, :, i] = scalers[i].transform(data[:, :, i])
        return data
    
def get_service_list():
    query = 'SELECT DISTINCT servicename FROM cloudutilization';
    return {get_var_name(row[0]): row[0].strip() for row in db.sql(query)}

def get_service_row(service):
    query = 'SELECT servicename FROM cloudutilization';
    return {get_var_name(row[0]): row[0].strip() for row in db.sql(query)}
    
    
def get_saved_model(model_id):
    if models.get(model_id) is None:
        # Load the model
        models[model_id] = load_model(model_id=model_id)
    
    # Pickle
    return models[model_id]


def get_anomalies(new_pred_plot, prediction_df):
    low = new_pred_plot['last_original_days_value'].min()
    high = new_pred_plot['last_original_days_value'].max()
    threshold_detector = ThresholdAD(low=low, high=high)
    anomalies = threshold_detector.detect(prediction_df['next_predicted_days_value'])  
    return anomalies


def get_latest_season_data(seasonal_data, within_days=1):
    # today = dt.date.today()
    today = dt.datetime.now()
    tomorrow = (today + dt.timedelta(days=within_days))
    latest_seasonal_data = seasonal_data.loc[
        (seasonal_data['StartDate'] >= now) & (seasonal_data['EndDate'] <= tomorrow)
    ]
    print(f'Date Range: {today} - {tomorrow}')
    return latest_seasonal_data



Connecting to the database...
Connection String:  postgresql://postgres:postgres%40123@localhost:5432/ApplicationInsights
Connected. <cursor object at 0x0000026F86E98660; closed: 0>


In [2]:
# Cost Anamoly Detection 
model_name = 'anomalies'
service_name = None


service_list = {}
if service_name is None:
    # Predict for all the services one by one.
    service_list = get_service_list()
else:
    # Predict for particular service.
    service_list[get_var_name(service_name)] = service_name
    
for service_id, service_name in service_list.items():
    if '-' in service_name and len(service_name) == 36:
        # SKip "UUIDs"
        print(f"Warn: Skipping UUID <{service_name}>...")
        continue
    
    # Train the model for particular service.
    model_id = get_model_id(model_name, service_name)
    try:
        pkl = get_saved_model(model_id) 
    except Exception as e:
        print(f'Model file not found. It might not be trained due to an insufficient data. Msg: {str(e)}')
        continue
    
    for anomaly_dataset in db.df_fetch_data(
        tbl_name='cloudutilization', 
        where={'servicename': service_name}
    ):
        # print(anomaly_dataset)
        anomaly_dataset.set_index('usagedate', drop=True, inplace=True)
        anomaly_dataset['servicename_id'] = anomaly_dataset['servicename'].apply(lambda x: get_var_name(x))

        
        filter_columns = ['cloudprovidername', 'cloudtype']
        filter_df = anomaly_dataset[filter_columns]
        filter_df['pretaxcost'] = anomaly_dataset['pretaxcost']
        # Remove all None rows
        for clm in filter_columns:
            filter_df[clm] = filter_df[clm].fillna('(none)')
        
        for grp_name, df in filter_df.groupby(filter_columns):
            print(grp_name, df)

            data = df[['pretaxcost']]#.values.reshape(-1, 1)
            
            try:
                data_test = data[15:]
                X_test , y_test = make_Xy(data_test,15,1) 
            except IndexError:
                print(f'Warn: Unable to predict <{service_name} - {grp_name}> due to insufficient data.')
                continue
            (X_test_sc, scale_X_test) = min_max_scale(X_test.copy(), {})
            (y_test_sc, scale_y_test) = min_max_scale(y_test.copy(), {})
            
            lst_output = pkl['model'].predict(X_test_sc)
            # cb_output = scale_y_test[0].inverse_transform(cb_model.predict(X_test_sc)).reshape(-1)
    
            # Normalize the data
            scaler = MinMaxScaler(feature_range=(0, 1))
            data_normalized = scaler.fit_transform(data)
            closedf =  data_normalized.copy()
            
            training_size=int(len(closedf)*0.60)
            test_size=len(closedf)-training_size
            train_data,test_data= closedf[0:training_size,:], closedf[training_size:len(closedf),:1]
            # print("train_data: ", train_data.shape)
            # print("test_data: ", test_data.shape)
            
            
            time_step = 30
            x_input=test_data[len(test_data)-(time_step):].reshape(1,-1)
            temp_input=list(x_input)
            temp_input=temp_input[0].tolist()
            
            lst_output=[]
            n_steps=time_step
            i=0
            pred_days = 14
            
            try:
                while(i<pred_days):
                    if(len(temp_input)>time_step):
                        x_input=np.array(temp_input[1:])
                        #print("{} day input {}".format(i,x_input))
                        x_input = x_input.reshape(1,-1)
                        x_input = x_input.reshape((1, n_steps, 1))
                        
                        yhat = pkl['model'].predict(x_input, verbose=0)
                        #print("{} day output {}".format(i,yhat))
                        temp_input.extend(yhat[0].tolist())
                        temp_input=temp_input[1:]
                        #print(temp_input)
                       
                        lst_output.extend(yhat.tolist())
                        i=i+1
                        
                    else:
                        
                        x_input = x_input.reshape((1, n_steps,1))
                        yhat = pkl['model'].predict(x_input, verbose=0)
                        temp_input.extend(yhat[0].tolist())
                        
                        lst_output.extend(yhat.tolist())
                        i=i+1
            except ValueError:
                print(f'Warn: Unable to predict <{service_name} - {grp_name}> due to insufficient data.')
                continue
                           
            print("Output of predicted next days: ", len(lst_output))
            last_days=np.arange(1,time_step+1)
            day_pred=np.arange(time_step+1,time_step+pred_days+1)
            # print(last_days)
            # print(day_pred)
            
            temp_mat = np.empty((len(last_days)+pred_days+1,1))
            temp_mat[:] = np.nan
            temp_mat = temp_mat.reshape(1,-1).tolist()[0]
            
            last_original_days_value = temp_mat
            next_predicted_days_value = temp_mat
            
            last_original_days_value[0:time_step+1] = scaler.inverse_transform(closedf[len(closedf)-time_step:]).reshape(1,-1).tolist()[0]
            next_predicted_days_value[time_step:] = scaler.inverse_transform(np.array(lst_output).reshape(-1,1)).reshape(1,-1).tolist()[0]
            
            new_pred_plot = pd.DataFrame({
                'last_original_days_value':last_original_days_value,
                'next_predicted_days_value':next_predicted_days_value
            })
            pred_start_datetime = dt.datetime.now()
            pred_end_datetime = pred_start_datetime+dt.timedelta(pred_days)
            date_df = pd.date_range(start=pred_start_datetime-dt.timedelta(time_step-1), end=pred_end_datetime, freq='D')
            # date_df = pd.date_range(start=, end=pred_end_datetime, freq='D')
            
            # print(new_pred_plot.index)
            # print(len(date_df))
            
            new_pred_plot.index = date_df
            history = new_pred_plot.iloc[:time_step+1]
            prediction_df = new_pred_plot.iloc[time_step:]

            # Find the anomalies.
            anomaly_df = get_anomalies(new_pred_plot, prediction_df)
            detected_anamolies = anomaly_df.loc[anomaly_df == True]
            # print(prediction_df)
            # print(anomaly_df)

            # Set the 'usage_date'
            prediction_df.index.name = 'usage_date'
            prediction_df.reset_index(inplace=True)
            prediction_df['usage_date'] = prediction_df['usage_date'].dt.date

            # set other relevant columns
            prediction_df['service_name'] = service_name
            prediction_df['created_on'] = now
            prediction_df['created_by'] = 'CRON/API'
            for i, clm in enumerate(filter_columns):
                prediction_df[clm] = None if grp_name[i] == '(none)' else grp_name[i].strip()

            prediction_df.drop('last_original_days_value', axis=1, inplace=True)
            prediction_df.rename(columns={
                'cloudprovidername': 'cloud_provider_name',
                'cloudtype': 'cloud_type',
                'next_predicted_days_value': 'predicted_cost'
            }, inplace=True)
    
            # Save the prediction data into the database.
            db.insert_or_update(tbl_name='alert_anomalies', df=prediction_df, on_columns=['service_name', 'usage_date'])
            
            # Save the anomalies to the database.
            #! db.insert_or_update(tbl_name='cloudalertmessagesdetails', df=anomaly_df, on_columns=['usage_date'])
        
        if len(service_list) > 1:
            results = service_list
        else:
            results = prediction_df.to_json(orient='records')
print(results)

Executing "SELECT DISTINCT servicename FROM cloudutilization"...
Warn: Skipping UUID <f1c63817-e85f-b2f5-cbc8-f8d0f945c9d5>...
Warn: Skipping UUID <e30cc83a-fcb9-a82a-a37b-9af0ed4d68cf>...
Warn: Skipping UUID <a37735ab-7617-b5b3-a059-ca9120e18aee>...
Warn: Skipping UUID <44014c47-2a09-0409-a599-c8cbe974b101>...
Warn: Skipping UUID <e5ad3226-7932-6dc5-350b-22b64783debe>...
Warn: Skipping UUID <0b6a1ef1-be0a-de7d-7931-0acf8a18e357>...
Warn: Skipping UUID <b7bf570d-5315-1710-00b8-f865a5a1c884>...
Warn: Skipping UUID <ad9fa701-87bc-a99d-7b79-be1ba511b20a>...
Warn: Skipping UUID <c65b9ac8-36b4-d31d-89a7-f0a3b37179d0>...
Warn: Skipping UUID <c77272b9-ac5e-9747-9837-baad37472dd0>...
Warn: Skipping UUID <77462005-91da-72b1-c738-6bc5a3c8110d>...
Warn: Skipping UUID <4d1328d2-deeb-2036-3813-bcbeee76a898>...
Loading <./pkl/anomalies_amazon_relational_database_service.pkl>...
Executing SELECT * FROM cloudutilization WHERE servicename = 'Amazon Relational Database Service'...
('(none)', 'AWS       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df['pretaxcost'] = anomaly_dataset['pretaxcost']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df[clm] = filter_df[clm].fillna('(none)')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df['pretaxcost'] = anomaly_dataset['pretaxcost']
A value is trying to be set on a copy of a s

Executing SELECT * FROM cloudutilization WHERE servicename = 'AWS CloudTrail'...
('(none)', 'AWS                                               ')                     cloudprovidername  \
usagedate                               
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (no

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df['pretaxcost'] = anomaly_dataset['pretaxcost']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df[clm] = filter_df[clm].fillna('(none)')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 442ms/step
Warn: Unable to predict <AWS CloudTrail - ('(none)', 'AWS                                               ')> due to insufficient data.
Warn: Skipping UUID <3910ebe0-63eb-189e-396d-0fa8f547d5b4>...
Warn: Skipping UUID <a1fbb87b-ed04-d57b-180f-eca44d5fe0c5>...
Warn: Skipping UUID <39316b3c-2967-b961-5876-e1842ae7ce89>...
Warn: Skipping UUID <7e4907d5-bf1f-9d36-b945-5500256a536f>...
Warn: Skipping UUID <184c123d-7f70-a1c5-44b1-81a784a1f4b8>...
Warn: Skipping UUID <53988246-e65c-27de-9db8-f507b6e5691f>...
Warn: Skipping UUID <f3433e39-684c-109a-9dd7-e6641203871c>...
Warn: Skipping UUID <22555296-eb4f-2be5-7940-608ad1646990>...
Warn: Skipping UUID <d9481c1c-1db8-e177-6409-30ddaa288069>...
Warn: Skipping UUID <babfb270-2a74-8d01-4ce4-e3f7a5fe35b3>...
Warn: Skipping UUID <813bddee-0cc5-ba18-2c49-9ade055b5f68>...
Warn: Skipping UUID <275be5cd-3652-9741-3b6f-58e10da82cff>...
Warn: Skipping UUID <267e72a0-7c8f-489a-9915-bff8

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df['pretaxcost'] = anomaly_dataset['pretaxcost']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df[clm] = filter_df[clm].fillna('(none)')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df['pretaxcost'] = anomaly_dataset['pretaxcost']
A value is trying to be set on a copy of a s

Executing SELECT * FROM cloudutilization WHERE servicename = 'Storage'...
('(none)', 'Azure                                             ')            cloudprovidername  \
usagedate                      
2024-03-01            (none)   
2024-03-01            (none)   
2024-03-01            (none)   
2024-03-01            (none)   
2024-03-01            (none)   
...                      ...   
2024-07-01            (none)   
2024-07-01            (none)   
2024-08-01            (none)   
2024-07-01            (none)   
2024-08-01            (none)   

                                                    cloudtype  pretaxcost  
usagedate                                                                  
2024-03-01  Azure                                         ...        0.01  
2024-03-01  Azure                                         ...       21.15  
2024-03-01  Azure                                         ...        0.00  
2024-03-01  Azure                                         ...   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df['usage_date'] = prediction_df['usage_date'].dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df['service_name'] = service_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df['created_on'] = now
A value is trying to be set on a copy of a slice from a Data

TypeError: sequence item 0: expected str instance, datetime.date found

In [5]:
prediction_df['usage_date']

datetime.date(2024, 9, 4)

In [3]:
if not isinstance(prediction_df['usage_date'][0], str):
    prediction_df['usage_date'] = prediction_df['usage_date'].astype(str)
    
values = prediction_df['usage_date'].values
values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df['usage_date'] = prediction_df['usage_date'].astype(str)


array(['2024-08-31', '2024-09-01', '2024-09-02', '2024-09-03',
       '2024-09-04', '2024-09-05', '2024-09-06', '2024-09-07',
       '2024-09-08', '2024-09-09', '2024-09-10', '2024-09-11',
       '2024-09-12', '2024-09-13'], dtype=object)

In [4]:
# anomaly_df[1] = True
anomaly_df

2024-08-31 22:22:47.886710    False
2024-09-01 22:22:47.886710    False
2024-09-02 22:22:47.886710    False
2024-09-03 22:22:47.886710    False
2024-09-04 22:22:47.886710    False
2024-09-05 22:22:47.886710    False
2024-09-06 22:22:47.886710    False
2024-09-07 22:22:47.886710    False
2024-09-08 22:22:47.886710    False
2024-09-09 22:22:47.886710    False
2024-09-10 22:22:47.886710    False
2024-09-11 22:22:47.886710    False
2024-09-12 22:22:47.886710    False
2024-09-13 22:22:47.886710    False
Freq: D, Name: next_predicted_days_value, dtype: bool

In [5]:
# anomaly_df.
# detected_anamolies = anomaly_df.loc[anomaly_df == True]
detected_anamolies

Series([], Freq: D, Name: next_predicted_days_value, dtype: bool)

In [6]:
anomaly_dataset[['cloudprovidername', 'cloudtype']]

Unnamed: 0_level_0,cloudprovidername,cloudtype
usagedate,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-07-29 06:21:00,,AWS ...
2024-07-29 06:21:00,,AWS ...
2024-07-29 06:21:00,,AWS ...
2024-07-29 06:21:00,,AWS ...
2024-07-29 06:21:00,,AWS ...
...,...,...
2024-07-29 06:21:00,,AWS ...
2024-07-29 06:21:00,,AWS ...
2024-07-29 06:21:00,,AWS ...
2024-07-29 06:21:00,,AWS ...


In [7]:
grp_name[1]

'AWS                                               '

In [8]:

filter_columns = ['cloudprovidername', 'cloudtype']
filter_df = anomaly_dataset[filter_columns]
# Remove all None rows
for clm in filter_columns:
    filter_df[clm] = filter_df[clm].fillna('(none)')

for grp_name, df in filter_df.groupby(filter_columns):
    print(grp_name, df)

df



# prediction_df['service_name'] = service_name
# prediction_df['service_name'] = service_name
# prediction_df['service_name'] = service_name
# prediction_df

('(none)', 'AWS                                               ')                     cloudprovidername  \
usagedate                               
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
...                               ...   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   
2024-07-29 06:21:00            (none)   

                                                             cloudtype  
usagedate                                                               
2024-07-29 06:21:00  AWS                                           ...  
2024-07-29 06:21:00  AWS                                           ...  
2024-07-29 06:21:00  AWS                                           ...  
2024-07-29 06:21:00  AWS            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df[clm] = filter_df[clm].fillna('(none)')


Unnamed: 0_level_0,cloudprovidername,cloudtype
usagedate,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-07-29 06:21:00,(none),AWS ...
2024-07-29 06:21:00,(none),AWS ...
2024-07-29 06:21:00,(none),AWS ...
2024-07-29 06:21:00,(none),AWS ...
2024-07-29 06:21:00,(none),AWS ...
...,...,...
2024-07-29 06:21:00,(none),AWS ...
2024-07-29 06:21:00,(none),AWS ...
2024-07-29 06:21:00,(none),AWS ...
2024-07-29 06:21:00,(none),AWS ...


In [9]:
import datetime as dt
import numpy as np

seasonal_data = pd.read_csv(r'./data/seasonal_data.csv')
months = ('jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec')


def fn(x):
    x = x.lower().split('-')
    return now.year, months.index(x[0])+1, int(x[1])
    
seasonal_data['StartDate'] = seasonal_data['StartDate'].apply(lambda x: dt.datetime(*fn(x)))
seasonal_data['EndDate'] = seasonal_data['EndDate'].apply(lambda x: dt.datetime(*fn(x)))
seasonal_data['MinThresholdValue'] = seasonal_data['MinThresholdValue'].astype(np.float64)
seasonal_data['MaxThresholdValue'] = seasonal_data['MaxThresholdValue'].astype(np.float64)

def get_latest_season_data(seasonal_data, within_days=1):
    # today = dt.date.today()
    today = dt.datetime.now()
    tomorrow = (today + dt.timedelta(days=within_days))
    latest_seasonal_data = seasonal_data.loc[
        (seasonal_data['StartDate'] >= now) & (seasonal_data['EndDate'] <= tomorrow)
    ]
    print(f'Date Range: {today} - {tomorrow}')
    return latest_seasonal_data
    
latest_seasonal_data = get_latest_season_data(seasonal_data, within_days=120)
if not latest_seasonal_data.empty:
    # Season time now.
    min_threshold_value = latest_seasonal_data['MinThresholdValue'].min()
    max_threshold_value = latest_seasonal_data['MaxThresholdValue'].max()
    print(f"Note: Ignore if usage cost is in-between '{min_threshold_value}' & '{max_threshold_value}'")

    # Find abnormal usage cost.
    prediction_df = prediction_df.loc[
        (prediction_df['predicted_cost'] < min_threshold_value) | (prediction_df['predicted_cost'] > max_threshold_value)
    ]

prediction_df
# anomaly_dataset


Date Range: 2024-08-30 22:22:48.035614 - 2024-12-28 22:22:48.035614
Note: Ignore if usage cost is in-between '340.0' & '8000.0'


Unnamed: 0,usage_date,predicted_cost,service_name,cloud_provider_name,cloud_type
0,2024-08-31,1.01979,AWS Data Transfer,,AWS
1,2024-09-01,1.615681,AWS Data Transfer,,AWS
2,2024-09-02,2.301117,AWS Data Transfer,,AWS
3,2024-09-03,3.170714,AWS Data Transfer,,AWS
4,2024-09-04,4.358268,AWS Data Transfer,,AWS
5,2024-09-05,6.007575,AWS Data Transfer,,AWS
6,2024-09-06,8.440774,AWS Data Transfer,,AWS
7,2024-09-07,12.189543,AWS Data Transfer,,AWS
8,2024-09-08,15.66335,AWS Data Transfer,,AWS
9,2024-09-09,16.929394,AWS Data Transfer,,AWS


In [10]:
now.date()

datetime.date(2024, 8, 30)

In [2]:
# Disposition Prediction
model_name = 'disposition'
service_name = None


model_id = get_model_id(model_name, service_name)

# Load the saved model
pkl = get_saved_model(model_id) 
model = pkl['model']
label_encoders = pkl['encoders']

final_df = pd.DataFrame()
fetch_columns = ['appid', 'appname', 'appversion', 'age', 'businesscriticality', 'complexity', 'technologystack', 'focusarea']
required_columns=['appversion', 'age', 'businesscriticality', 'complexity', 'technologystack']
pred_table_columns = [
    'dispositionid', 'dispositionname', 'appname', 
    'focusarea', 'businesscriticality', 'technologystack', 
    # 'ballparkdevefforts', 'movegroup', 'rating'
]

for i, row in enumerate(db.fetch_data(tbl_name='applicationinventory', columns=fetch_columns)):
    # Convert into Dataframe
    df = pd.DataFrame([row], columns=fetch_columns)
    df.set_index('appid', inplace=True, drop=True)
    try:
        pred_df = predict_disposition(
            model, 
            label_encoders,
            df[required_columns]
        )['dispositionname']
        final_df = pd.concat([final_df, df.join(pred_df)])
    except Exception as e:
        print("Error:", row, e)

# Rename all "Column" name to lowercase.
final_df.columns = map(str.lower, final_df.columns)
print(final_df)
if not final_df.empty:
    # Remove unwanted columns from final prediction table.
    final_df.drop(['appversion', 'age', 'complexity'], axis=1, inplace=True)

# Insert into the database.
# db.insert_or_update_df(tbl_name='disposition', df=final_df)
on_columns = required_columns.copy()
on_columns.remove('complexity')
on_columns.remove('appversion')
on_columns.remove('age')
on_columns.append('appname')
db.insert_or_update(tbl_name='disposition', df=final_df, on_columns=on_columns)

Loading <./pkl/disposition.pkl>...
Executing SELECT appid, appname, appversion, age, businesscriticality, complexity, technologystack, focusarea FROM applicationinventory ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = LabelEncoder().fit_transform(df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = LabelEncoder().fit_transform(df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = LabelEncoder().fit_transform(df[column])
A value is trying to be set on a copy of a 

                              appname appversion  age businesscriticality  \
appid                                                                       
61                             Apollo          1    2              Simple   
62                             Apollo          1    1              Medium   
63                           Artefact          1    3            Critical   
64                           Artefact          1    4              Medium   
65                       VisualEditor          1    7            Critical   
66                       VisualEditor          1    5              Simple   
67                       VisualEditor          1    2              Medium   
68                       VisualEditor          1    1              Simple   
69                       VisualEditor          1    3              Medium   
70                       VisualEditor          1    7            Critical   
71                       VisualEditor          1    5              Medium   

UnboundLocalError: cannot access local variable 'values' where it is not associated with a value

In [16]:
final_df['businesscriticality'].iloc[0]

'Simple'