In [None]:
# SCIPY_ARRAY_API has to be in the environment before scipy/sklearn are first
# imported, so it is set ahead of every other import in this cell.
import os

os.environ['SCIPY_ARRAY_API'] = '1'

import numpy as np
import pandas as pd
from google.cloud import storage
from google.oauth2 import service_account
from google.cloud import bigquery
from datetime import datetime
from sdv.single_table import CTGANSynthesizer,TVAESynthesizer
from sdv.metadata import SingleTableMetadata
from sklearn import preprocessing,metrics
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report,accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as Func
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import seaborn as sns
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import pandas_gbq
import json
from mlxtend.evaluate import bias_variance_decomp
from imblearn.over_sampling import SMOTE,SMOTENC
from sklearn.svm import OneClassSVM
import statistics
from catboost import CatBoostClassifier

In [None]:
# Load the service-account key and build an authenticated BigQuery client.
# NOTE(review): the key file name is hard-coded; make sure the key itself is
# kept out of version control.
with open("finops-55-0c8185e843cd.json", "r") as f:
    json_data = json.load(f)  # the `with` block closes the file on exit
cred = service_account.Credentials.from_service_account_info(json_data)
client = bigquery.Client(project=json_data['project_id'], credentials=cred)


In [None]:
# Pull the whole pseudo-labelled table into a DataFrame.
QUERY = "SELECT * from `finops-55.billing_1.synthesized_data_pseudo_label3_thres_51`"

query_job = client.query(QUERY)  # API request
df_model = query_job.to_dataframe()


In [None]:
# Sanity check: (rows, columns) of the frame returned by the query.
df_model.shape

In [None]:
import os

# SCIPY_ARRAY_API needs to be in the environment before sklearn or scipy is imported.
# NOTE(review): both are already imported by the first cell of this notebook, so
# re-setting it here only matters if this cell is run before that one.
os.environ['SCIPY_ARRAY_API'] = '1'

In [None]:
# Class balance of the target column.
df_model['is_anomaly'].value_counts()

In [None]:
def _mode_ignoring_nulls(values):
    """Return the most common non-null entry of ``values`` (NaN when there is none)."""
    present = values.dropna()
    if present.empty:
        return np.nan
    return statistics.mode(present)


def preprocessing_all(df):
    """Lower-case text columns and impute missing values, modifying ``df`` in place.

    Object columns are lower-cased, then missing entries are filled as follows:
      * ``regionname``      -> the most common non-null region in ``df``
      * ``servicecategory`` -> the literal ``'Other'``
      * any other column    -> the most common non-null value within the same
                               (``cloud``, ``servicename``) group
    Numeric columns are filled with the mean of their (``cloud``, ``servicename``)
    group. Values whose group offers no usable statistic stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``cloud`` and ``servicename`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (callers rely on the in-place update).
    """
    group_keys = ['cloud', 'servicename']

    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].str.lower()
        n_missing = df[col].isnull().sum()
        if n_missing == 0:
            continue

        # Results are assigned back to the column instead of using
        # `df[col].fillna(..., inplace=True)`: that chained form updates a
        # temporary object under pandas copy-on-write and leaves `df` unchanged.
        if col == 'regionname':
            fill_value = _mode_ignoring_nulls(df[col])
            if pd.notna(fill_value):
                df[col] = df[col].fillna(fill_value)
        elif col == 'servicecategory':
            # NOTE(review): 'Other' is capitalised while every other value in the
            # column has just been lower-cased - confirm this is intended.
            df[col] = df[col].fillna('Other')
        else:
            print(col, n_missing)
            # The mode is taken over non-null values only; otherwise a group that
            # is mostly missing would "impute" NaN with NaN.
            group_mode = df.groupby(group_keys)[col].transform(_mode_ignoring_nulls)
            df[col] = df[col].fillna(group_mode)

    for col in df.select_dtypes(include=np.number).columns:
        if df[col].isnull().any():
            group_mean = df.groupby(group_keys)[col].transform('mean')
            df[col] = df[col].fillna(group_mean)
    return df

            

In [None]:
# Skewness of the list-cost distribution.
df_model['listcost'].skew()

In [None]:
# NOTE(review): same expression as the previous cell; df_model is not modified in between.
df_model['listcost'].skew()

In [None]:
# Separate the target from the features.
y = df_model['is_anomaly']
x = df_model.drop('is_anomaly', axis=1)

# The calendar parts are cast to strings so they are handled as categorical columns.
date_part_cols = ['chargeperiodstart_month', 'chargeperiodstart_day', 'chargeperiodstart_year']
x = x.astype({col: 'str' for col in date_part_cols})

# Hold out 20% for test, then 12.5% of the remainder for validation
# (both splits stratified on the target).
x_temp, x_test1, y_temp, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)
x_train1, x_val1, y_train1, y_val = train_test_split(
    x_temp, y_temp, test_size=0.125, random_state=42, stratify=y_temp
)

# preprocessing_all cleans each split on its own and returns the frame it was given.
x_train_scaled, x_val_scaled, x_test_scaled = (
    preprocessing_all(split) for split in (x_train1, x_val1, x_test1)
)

# Independent working copies; the *1 frames are reused by later cells.
x_train, x_val, x_test = x_train1.copy(), x_val1.copy(), x_test1.copy()
y_train = y_train1.copy()






In [None]:
# Train split: one-hot encode, standardise, and run SMOTE.
print(x_train_scaled.shape, x_test_scaled.shape, x_val_scaled.shape)

train_cat_raw = x_train_scaled.select_dtypes(include="object")
train_num_raw = x_train_scaled.select_dtypes(include=np.number)

# Categorical columns -> one-hot indicator columns (unseen categories encode as all zeros).
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
x_train_cat = pd.DataFrame(
    encoder.fit_transform(train_cat_raw),
    columns=encoder.get_feature_names_out(train_cat_raw.columns),
    index=x_train_scaled.index,
)
print(x_train_cat.shape)

# Numeric columns -> zero mean / unit variance.
scaler = preprocessing.StandardScaler()
train_scaled_num = scaler.fit_transform(train_num_raw)
print(train_scaled_num.shape)
train_scaled_num = pd.DataFrame(
    train_scaled_num,
    columns=train_num_raw.columns,
    index=x_train_scaled.index,
)
print(train_scaled_num.shape)

x_train_balanced = pd.concat([train_scaled_num, x_train_cat], axis=1).fillna(0)
print(x_train_balanced.shape)

# NOTE(review): the SMOTE output below is only printed; x_train / y_train are
# rebuilt from the un-resampled x_train_balanced, so the models train on the
# original class balance. Confirm whether that is intended.
smote = SMOTE(sampling_strategy='minority', random_state=0, k_neighbors=100)
x_train_smote, y_train_smote = smote.fit_resample(x_train_balanced, y_train)
print(x_train_smote.shape)

df_model_balanced_train = pd.concat([x_train_balanced, y_train], axis=1)
x_train = df_model_balanced_train.drop(columns='is_anomaly')
y_train = df_model_balanced_train['is_anomaly']
print(x_train_scaled.shape, x_test_scaled.shape, x_val_scaled.shape)




In [None]:
# Columns present in the train split but missing from the test split (expected: empty set).
set(x_train_scaled.columns)-set(x_test_scaled.columns)

In [None]:
# Apply the encoder and scaler fitted on the training split to the other splits.

# Val
x_val_cat = encoder.transform(x_val_scaled.select_dtypes(include="object"))
x_val_cat = pd.DataFrame(x_val_cat, columns=encoder.get_feature_names_out(x_val_scaled.select_dtypes(include="object").columns),index=x_val_scaled.index)
val_scaled = scaler.transform(x_val_scaled.select_dtypes(include=np.number))
x_val_num = pd.DataFrame(val_scaled, columns = x_val_scaled.select_dtypes(include=np.number).columns,index=x_val_scaled.index)
x_val = pd.concat([x_val_cat,x_val_num],axis=1)

#Test
x_test_cat = encoder.transform(x_test_scaled.select_dtypes(include="object"))
# Wrap the *encoded array*. Wrapping the raw x_test_scaled frame with the
# one-hot column names would reindex it to columns it does not have, turning
# every categorical test feature into NaN.
x_test_cat = pd.DataFrame(x_test_cat, columns=encoder.get_feature_names_out(x_test_scaled.select_dtypes(include="object").columns),index=x_test_scaled.index)
test_scaled = scaler.transform(x_test_scaled.select_dtypes(include=np.number))
x_test_num = pd.DataFrame(test_scaled, columns = x_test_scaled.select_dtypes(include=np.number).columns,index=x_test_scaled.index)
x_test = pd.concat([x_test_cat,x_test_num],axis=1)
print(x_train.shape,x_val.shape,x_test.shape)



In [None]:
# Give the validation and test matrices the same column order as the training matrix.
order = x_train.columns
x_test, x_val = (frame.reindex(columns=order) for frame in (x_test, x_val))


In [None]:
x_train, x_test, x_val = x_train.fillna(0), x_test.fillna(0), x_val.fillna(0)

# PCA rejects n_components above min(n_samples, n_features), so the target of
# 287 components is clamped to what the training matrix can support.
n_pca_components = min(287, *x_train.shape)
# random_state keeps the projection reproducible if a randomized solver is chosen.
pca = PCA(n_components=n_pca_components, random_state=42)

# Fit on the training split only; validation/test are projected with the same basis.
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)
x_val_pca = pca.transform(x_val)

In [None]:
# (rows, components) of the PCA-projected training matrix.
x_train_pca.shape

### Logistic Regression

In [None]:
logreg = LogisticRegression(random_state=0)
logreg.fit(x_train_pca, y_train)

# Predictions
y_pred_train = logreg.predict(x_train_pca)
y_pred = logreg.predict(x_test_pca)
# Positive-class probability in [0, 1]. predict_proba is used (as for the other
# classifiers in this notebook) rather than predict_log_proba, so the stored
# scores really are probabilities; the ROC ranking is the same.
y_pred_prob = logreg.predict_proba(x_test_pca)[:, 1]
# Evaluation
print("Classification Report")
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
cm_logreg = confusion_matrix(y_test, y_pred)
plotcon = ConfusionMatrixDisplay(
    confusion_matrix=cm_logreg,
    display_labels=logreg.classes_,
)
plotcon.plot()
plt.show()

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

# Stratified folds keep both classes in every fold; with a rare positive class a
# plain KFold can yield a single-class fold, for which ROC AUC is undefined.
kf = StratifiedKFold(shuffle=True, n_splits=5, random_state=0)
# NOTE(review): this re-fits clones of the model on folds of the *test* split
# rather than the training split - confirm that is the intended evaluation.
score = cross_val_score(logreg, x_test_pca, y_test, cv=kf, scoring='roc_auc')
np.mean(score)



In [None]:
# Bias/variance decomposition of the 0-1 loss for logistic regression,
# estimated over 100 bootstrap rounds.
bvd_logreg = bias_variance_decomp(
    logreg,
    X_train=np.array(x_train_pca),
    y_train=np.array(y_train),
    X_test=np.array(x_test_pca),
    y_test=np.array(y_test),
    loss='0-1_loss',  # Classification error
    num_rounds=100,
    random_seed=42,
)
mse, bias, variance = bvd_logreg

In [None]:
# (average expected loss, average bias, average variance) from the decomposition above.
mse, bias, variance

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# ROC points and area under the curve for logistic regression on the test split.
fpr_lg, tpr_lg, thresholds_lg = roc_curve(y_test, y_pred_prob)
roc_auc_lg = roc_auc_score(y_test, y_pred_prob)

# Draw the curve against the chance diagonal.
fig_lg, ax_lg = plt.subplots(figsize=(8, 6))
ax_lg.plot(fpr_lg, tpr_lg, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc_lg:.2f})')
ax_lg.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random Guessing')
ax_lg.set_xlabel('False Positive Rate (FPR)')
ax_lg.set_ylabel('True Positive Rate (TPR)')
ax_lg.set_title('Receiver Operating Characteristic (ROC) Curve')
ax_lg.legend(loc='lower right')
ax_lg.grid(True)
plt.show()

### Random Forest

In [None]:
# Random forest (100 trees, fixed seed) trained on the PCA-projected training split.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(x_train_pca, y_train)

In [None]:
# Random-forest predictions on the test and train splits.
y_pred_rf = model_rf.predict(x_test_pca)
y_pred_train_rf = model_rf.predict(x_train_pca)
y_pred_prob_rf = model_rf.predict_proba(x_test_pca)[:, 1]

# Print a classification report
print("Classification Report: Train")
# Use the random forest's own train predictions; `y_pred_train` belongs to
# whichever other model assigned it last.
print(classification_report(y_train, y_pred_train_rf))
print("Classification Report Test")
print(classification_report(y_test, y_pred_rf))


In [None]:
# Confusion matrix of the random-forest predictions on the test split.
cm_rf = confusion_matrix(y_test, y_pred_rf)
plotcon = ConfusionMatrixDisplay(
    confusion_matrix=cm_rf,
    display_labels=model_rf.classes_,
)
plotcon.plot()
plt.show()

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

# Stratified folds keep both classes in every fold; with a rare positive class a
# plain KFold can yield a single-class fold, for which ROC AUC is undefined.
kf = StratifiedKFold(shuffle=True, n_splits=5, random_state=0)
# NOTE(review): this re-fits clones of the model on folds of the *test* split
# rather than the training split - confirm that is the intended evaluation.
score = cross_val_score(model_rf, x_test_pca, y_test, cv=kf, scoring='roc_auc')
np.mean(score)


In [None]:
# Positive-class scores, ROC points and AUC for the random forest on the test split.
y_pred_prob_rf = model_rf.predict_proba(x_test_pca)[:, 1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_pred_prob_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_prob_rf)

# Draw the curve against the chance diagonal and save it to disk.
fig_rf, ax_rf = plt.subplots(figsize=(8, 6))
ax_rf.plot(fpr_rf, tpr_rf, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc_rf:.2f})')
ax_rf.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random Guessing')
ax_rf.set_xlabel('False Positive Rate (FPR)')
ax_rf.set_ylabel('True Positive Rate (TPR)')
ax_rf.set_title('Receiver Operating Characteristic (ROC) Curve for Random Forest')
ax_rf.legend(loc='lower right')
ax_rf.grid(True)
fig_rf.savefig('roc_curve_rf.png')

print(f"The AUC score is: {roc_auc_rf:.4f}")

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

# NOTE(review): this cell repeats the random-forest cross-validation run a few cells earlier.
# Stratified folds keep both classes in every fold; with a rare positive class a
# plain KFold can yield a single-class fold, for which ROC AUC is undefined.
kf = StratifiedKFold(shuffle=True, n_splits=5, random_state=0)
# NOTE(review): this re-fits clones of the model on folds of the *test* split
# rather than the training split - confirm that is the intended evaluation.
score = cross_val_score(model_rf, x_test_pca, y_test, cv=kf, scoring='roc_auc')
np.mean(score)


In [None]:
# mse, bias, variance = bias_variance_decomp(
#     model_rf,X_train=np.array(x_train_pca), y_train=np.array(y_train), X_test=np.array(x_test_pca), y_test=np.array(y_test),
#     loss='0-1_loss',  # Classification error
#     num_rounds=10, 
#     random_seed=42
# )
# mse, bias, variance

In [None]:
# Hyper-parameter grid for the random-forest search: 6 x 2 x 2 x 2 = 48 combinations.
param_grid = {
    'n_estimators': [50,75,100,125,150,200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

In [None]:
# Exhaustive search over param_grid for the random forest, selecting on recall.
# NOTE(review): the search is fitted on the validation split only (x_val_pca, y_val),
# not on the training data - confirm this is intended.
grid_search = GridSearchCV(estimator=model_rf, param_grid=param_grid, scoring='recall')
grid_search.fit(x_val_pca, y_val)

In [None]:
# Test-split report for the estimator selected by the grid search.
y_pred_grid = grid_search.predict(x_test_pca)
print(classification_report(y_test,y_pred_grid))

### KNN

In [None]:
# Distance-weighted k-nearest-neighbours (k=200) on the PCA-projected training split.
knn = KNeighborsClassifier(n_neighbors=200, weights='distance').fit(x_train_pca, y_train)

# Hard predictions, text report and raw confusion matrix on the test split.
y_pred_knn = knn.predict(x_test_pca)
knn_report = classification_report(y_test, y_pred_knn)
knn_cm = metrics.confusion_matrix(y_test, y_pred_knn)
print(knn_report)
print(knn_cm)

# Positive-class scores for the ROC analysis.
y_pred_prob_knn = knn.predict_proba(x_test_pca)[:, 1]



In [None]:
# Confusion matrix of the KNN predictions on the test split.
cm_knn = confusion_matrix(y_test, y_pred_knn)
plotcon = ConfusionMatrixDisplay(
    confusion_matrix=cm_knn,
    display_labels=knn.classes_,
)
plotcon.plot()
plt.show()

In [None]:
# Bias/variance decomposition of the 0-1 loss for KNN, estimated over 10 bootstrap rounds.
bvd_knn = bias_variance_decomp(
    knn,
    X_train=np.array(x_train_pca),
    y_train=np.array(y_train),
    X_test=np.array(x_test_pca),
    y_test=np.array(y_test),
    loss='0-1_loss',  # Classification error
    num_rounds=10,
    random_seed=42,
)
mse, bias, variance = bvd_knn
mse, bias, variance

In [None]:
# Positive-class scores for KNN on the test split.
y_pred_prob_knn = knn.predict_proba(x_test_pca)[:, 1]

# Calculate FPR, TPR, and thresholds
fpr_knn, tpr_knn, thresholds = roc_curve(y_test, y_pred_prob_knn)

# Calculate the AUC score
roc_auc_knn = roc_auc_score(y_test, y_pred_prob_knn)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr_knn, tpr_knn, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc_knn:.2f})')
plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random Guessing')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('Receiver Operating Characteristic (ROC) Curve for KNN')
plt.legend(loc='lower right')
plt.grid(True)
# Saved under its own name so it does not overwrite the random-forest figure
# ('roc_curve_rf.png').
plt.savefig('roc_curve_knn.png')

print(f"The AUC score is: {roc_auc_knn:.4f}")

### CatBoost

In [None]:
# Columns CatBoost should treat as categorical, cast to plain strings.
categorical_features = x_train1.select_dtypes(include=['object', 'category']).columns.tolist()
for i in categorical_features:
    raw_col = x_train1[i]
    # Missing entries become '' rather than the literal string 'nan', matching
    # the `fillna('')` applied to the train/test frames when the model is
    # fitted and used for prediction.
    x_train1[i] = raw_col.astype('str').where(raw_col.notna(), '')
print(categorical_features)

In [None]:
# CatBoost consumes the raw (un-encoded) frames; missing values are blanked out first.
train_features_cb = x_train1.fillna('')
test_features_cb = x_test1.fillna('')

catboost_params = dict(
    iterations=500,
    learning_rate=0.0001,
    depth=6,
    loss_function='Logloss',
    eval_metric='Recall',
    random_seed=42,
    verbose=0,
    cat_features=categorical_features,
)
model_catboost = CatBoostClassifier(**catboost_params)

model_catboost.fit(train_features_cb, y_train)

# Hard predictions for both splits and positive-class scores for the test split.
y_pred_cb = model_catboost.predict(test_features_cb)
y_pred_train = model_catboost.predict(train_features_cb)
y_pred_prob_cat = model_catboost.predict_proba(test_features_cb)[:, 1]

# Top 20 rows of CatBoost's feature-importance table.
importances = model_catboost.get_feature_importance(prettified=True)
print(importances.head(20))

In [None]:
# Train-split report; y_pred_train holds the CatBoost predictions assigned in the fitting cell.
print(classification_report(y_train, y_pred_train))

In [None]:
# Test-split report for CatBoost.
print(classification_report(y_test, y_pred_cb))

In [None]:
# Confusion matrix of the CatBoost predictions on the test split.
cm_cb = confusion_matrix(y_test, y_pred_cb)
plotcon = ConfusionMatrixDisplay(
    confusion_matrix=cm_cb,
    display_labels=model_catboost.classes_,
)
plotcon.plot()
plt.show()

In [None]:
# mse, bias, variance = bias_variance_decomp(
#     model_catboost, np.array(x_train1), np.array(y_train), np.array(x_test1), np.array(y_test),
#     loss='0-1_loss',  # Classification error
#     num_rounds=10, 
#     random_seed=42
# )
# mse, bias, variance

In [None]:
# Positive-class scores for CatBoost on the test split.
y_pred_prob_cb = model_catboost.predict_proba(x_test1.fillna(''))[:, 1]

# Calculate FPR, TPR, and thresholds
fpr_cb, tpr_cb, thresholds_cb = roc_curve(y_test, y_pred_prob_cb)

# Calculate the AUC score
roc_auc_cb = roc_auc_score(y_test, y_pred_prob_cb)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr_cb, tpr_cb, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc_cb})')
plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random Guessing')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('Receiver Operating Characteristic (ROC) Curve for CatBoost')
plt.legend(loc='lower right')
plt.grid(True)
# Saved under its own name so it does not overwrite the random-forest figure
# ('roc_curve_rf.png').
plt.savefig('roc_curve_cb.png')

print(f"The AUC score is: {roc_auc_cb:.4f}")

In [None]:
# Overlay the ROC curves of all four classifiers on a single figure.
plt.figure(figsize=(8, 6))
plt.plot(fpr_cb, tpr_cb, color='blue', lw=2, label=f'ROC curve CB(AUC = {round(roc_auc_cb,3)})')
plt.plot(fpr_lg, tpr_lg, color='green', lw=2, label=f'ROC curve LR(AUC = {roc_auc_lg:.2f})')
plt.plot(fpr_knn, tpr_knn, color='cyan', lw=2, label=f'ROC curve KNN(AUC = {roc_auc_knn:.2f})')
plt.plot(fpr_rf, tpr_rf, color='purple', lw=2, label=f'ROC curve RF(AUC = {roc_auc_rf:.2f})')

plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random Guessing')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('Receiver Operating Characteristic (ROC) Curve for Classifiers')
plt.legend(loc='lower right')
plt.grid(True)
# Saved under its own name so the comparison plot does not overwrite the
# random-forest figure ('roc_curve_rf.png').
plt.savefig('roc_curve_all_models.png')



In [None]:
# Count feature rows that appear, value for value, in both the train and test matrices.
train_tuples = {tuple(row) for row in x_train.values}
test_tuples = {tuple(row) for row in x_test.values}
overlap = train_tuples.intersection(test_tuples)
print(f"Number of overlapping rows: {len(overlap)}")

In [None]:
# Attach predicted labels to the raw test rows.
# NOTE(review): y_pred is the logistic-regression prediction - confirm that is the model intended here.
result = x_test1.assign(is_anomaly=y_pred)

In [None]:
# The single most expensive GCP Compute Engine row that was flagged as an anomaly.
is_gcp_compute_anomaly = (
    (result['cloud'] == 'gcp')
    & (result['servicename'] == 'compute engine')
    & (result['is_anomaly'] == 1)
)
input_for_recom = (
    result[is_gcp_compute_anomaly]
    .sort_values(by='listcost', ascending=False)
    .head(1)
)

In [None]:
# Convert the selected row to a {column: [values]} dict so it can be written as JSON.
data_for_input=input_for_recom.to_dict(orient='list')

In [None]:
# Inspect the payload before it is written to disk.
data_for_input

In [None]:
# Write the selected anomaly row to disk as JSON.
with open("input_data_for_recomm.json", "w") as f:
    json.dump(data_for_input, f)  # the `with` block closes the file on exit