In [None]:
from azure.cosmos import CosmosClient, exceptions

import csv 
import json
import sys

%load_ext autoreload
%autoreload
from src.AzureCosmos_connector import COSMOS_CONNECTOR
from src.config import azure_schemas

In [None]:
# Instantiate the project's connector class (imported from src.AzureCosmos_connector).
# NOTE(review): presumably this opens the Cosmos DB connection using project config — confirm in the connector.
cosmosDB = COSMOS_CONNECTOR()

In [None]:
# Walk through the connector's database helpers, printing a rule between calls.
# NOTE(review): list_/find_ presumably print what they discover — confirm in the connector.
cosmosDB.list_databases()
print("="*10)
cosmosDB.find_database("image-embeddings-db")
print("="*10)
# Keep the database handle: the container helpers below take it as their first argument.
emd_db = cosmosDB.get_database("image-embeddings-db")

In [None]:
# Same discovery sequence one level down: containers inside the embeddings database.
cosmosDB.list_containers(emd_db)
print("="*10)
cosmosDB.find_container(emd_db, "bert-encodings")
print("="*10)
# Handle on the "bert-encodings" container, which the query cells read from.
bertContainer= cosmosDB.get_container(emd_db, "bert-encodings")

In [None]:
# Look up one document by id with a bound parameter; the query has to fan out
# across partitions because no partition key is supplied.
items = [
    doc
    for doc in bertContainer.query_items(
        query="SELECT * FROM r WHERE r.id=@id",
        parameters=[dict(name="@id", value="images_000000000001")],
        enable_cross_partition_query=True,
    )
]
items

In [None]:
import pandas as pd

# Pull every document from the container and turn each one into a plain dict
# so the whole result set can be loaded into a DataFrame in one go.
all_documents = bertContainer.query_items(
    query='SELECT * FROM c',
    enable_cross_partition_query=True,
)
dflist = [dict(document) for document in all_documents]
df = pd.DataFrame(dflist)
df.head()

In [None]:
import os

# Snapshot the downloaded documents locally.
# DataFrame.to_csv does not create missing folders, so make sure ./data exists first.
os.makedirs("./data", exist_ok=True)
df.to_csv("./data/azure_data.csv")

In [None]:
from matplotlib.pyplot import axis
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Length of the first row's 'encd' list, i.e. the embedding dimensionality.
len(df['encd'].iloc[0])

In [None]:
# Expand the list-valued 'encd' column into one column per embedding dimension.
# index=df.index keeps the rows aligned with df even if its index is not 0..n-1.
encd_matrix = pd.DataFrame(df['encd'].to_list(), index=df.index)

# Take the width from the data instead of hard-coding 768, so a different
# encoder size does not break the cell.
column_names = [f"encd_{i}" for i in range(encd_matrix.shape[1])]
encd_matrix.columns = column_names

# A single concat builds a new frame (df itself is left untouched) and avoids
# inserting hundreds of columns one at a time, which fragments the DataFrame.
feature_dataset = pd.concat([df, encd_matrix], axis=1)
feature_dataset.head()

In [None]:
""" PCA => K-Means: curse of Dimensionality 
"""
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score
from sklearn.preprocessing import StandardScaler

# Plain 2-D array of the expanded embedding columns: one row per document.
features = feature_dataset.loc[:, column_names].to_numpy()
features[:5]

In [None]:
# Standardise the features; the fitted scaler is kept because it is
# persisted alongside the PCA and KMeans models further down.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

In [None]:
# Full PCA with no component limit, fitted only to inspect the explained-variance curve.
pca = PCA()
pca.fit(scaled_features)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (12,6)

fig, ax = plt.subplots()
xi = np.arange(1, 701, step=20)
y = np.cumsum(pca.explained_variance_ratio_)

plt.ylim(0.0,1.1)
plt.plot(xi, y[xi], marker='o', linestyle='--', color='b')

plt.xlabel('Number of Components')
plt.xticks(np.arange(0, 701, step=20)) #change from 0-based array index to 1-based human-readable label
plt.ylabel('Cumulative variance (%)')
plt.title('The number of components needed to explain variance')

plt.axhline(y=0.95, color='r', linestyle='-')
plt.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize=16)

ax.grid(axis='x')
plt.show()

pca = PCA(0.95).fit(scaled_features)
reduced_features = pca.transform(scaled_features)
reduced_features[:5]

In [None]:
""" K-Means (Partitional Clustering)
"""
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

In [None]:
# %%
# KMeans settings reused for every fit; the fixed random_state keeps runs repeatable.
kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)
sse = []
k_samples = range(1, 103, 3)

# Fit one model per candidate cluster count and record its inertia as the SSE.
for k in k_samples:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(reduced_features)
    sse.append(kmeans.inertia_)

# SSE against k: the bend in this curve is what the elbow search looks for.
plt.style.use("fivethirtyeight")
plt.plot(k_samples, sse)
plt.xticks(k_samples)
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

In [None]:
# choosing the elbow point of the curve
kl = KneeLocator(k_samples, sse, curve="convex", direction="decreasing")
print(kl.elbow)

# KneeLocator reports None when it cannot find an elbow; stop with a clear
# message instead of letting KMeans fail on an invalid n_clusters.
if kl.elbow is None:
    raise ValueError(
        "KneeLocator found no elbow in the SSE curve; choose n_clusters manually."
    )

kmeans = KMeans(n_clusters=int(kl.elbow), **kmeans_kwargs)
kmeans.fit(reduced_features)

#%%
# kmeans.cluster_centers_.shape
def map_centroid(row):
    """Return the centroid of the row's cluster as a plain list.

    `row` must expose "kmeans_cluster_id". The centroid lives in the
    PCA-reduced space, because that is what `kmeans` was fitted on.
    """
    return kmeans.cluster_centers_[row["kmeans_cluster_id"]].tolist()
# %%
# Attach the cluster assignment and its centroid to the document frame
# (df gains two columns that the upload cell reads).
df["kmeans_cluster_id"] = kmeans.labels_
display(df.head(2))
# Map over the id column only: same result as applying map_centroid row by row,
# without building a Series for every row.
df["kmeans_cluster_centroid"] = df["kmeans_cluster_id"].map(
    lambda cluster_id: kmeans.cluster_centers_[cluster_id].tolist()
)
display(df.head(2))
df['kmeans_cluster_id'].value_counts().plot(kind='bar')

In [None]:
""" store PCA & clustering model
"""
import os
from joblib import dump, load

# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs('./models/full', exist_ok=True)

# Persist the fitted preprocessing and clustering models together:
# scaler -> PCA -> KMeans is the order they are applied in this notebook.
dump(scaler, './models/full/scaler.joblib')
dump(pca, './models/full/pca.joblib')
dump(kmeans, './models/full/kmeans.joblib')

In [None]:
# Locate the destination container for the clustered documents.
cosmosDB.list_containers(emd_db)
print("="*10)
cosmosDB.find_container(emd_db, "clustered-meta-data")
print("="*10)
# NOTE(review): this rebinds `bertContainer` to the "clustered-meta-data" container.
# Re-running the earlier query cells after this one would silently read from the
# wrong container — consider a dedicated name once all users of it are known.
bertContainer= cosmosDB.get_container(emd_db, "clustered-meta-data")

In [None]:
# Peek at one row to check the columns that are about to be uploaded.
df.head(1)

In [None]:
# `container_client` is otherwise first assigned in the upload cell below, so
# build the handle here to keep Restart & Run All from raising a NameError.
container_client = emd_db.get_container_client("clustered-meta-data")
container_client.id

In [None]:
container_client = emd_db.get_container_client("clustered-meta-data")

# Upload every document together with its cluster assignment. A failed row is
# reported and the loop carries on with the next one.
for num, row in enumerate(df.to_dict('records'), start=1):
    try:
        container_client.upsert_item(
            {
                'id': row["id"],
                # Force a plain int: numpy integer scalars are not JSON serializable.
                'cluster_id': int(row["kmeans_cluster_id"]),
                'cluster_centroid': row["kmeans_cluster_centroid"],
                'dataset': row["dataset"],
                'url': row['url'],
                'encd': row['encd'],
                'img_name': row['img_name']
            }
        )
    except exceptions.CosmosHttpResponseError as e:
        print("Failed to insert {}, row number {}".format(row['id'], num))
        print(e)
    else:
        print("Inserted {}, row number {}".format(row['id'], num))

In [None]:
# Scratch check: how a tuple of ids renders as text.
str(("images_000000000001", "images_000000000016"))
# '\',\''.join(Ids)

In [None]:
# Fetch several documents by id in one cross-partition query.
# The ids are sent as a bound array parameter instead of being formatted into
# the query text, so no quoting or escaping is done by hand.
# For a single document, a point read is the alternative (reads require the
# partition key to be specified):
#   container_client.read_item(item=doc_id, partition_key=doc_id)
Ids = ["images_000000000001", "images_000000000016"]
items = list(container_client.query_items(
            query="SELECT * FROM r WHERE ARRAY_CONTAINS(@ids, r.id)",
            parameters=[
                { "name":"@ids", "value": Ids }
            ],
            enable_cross_partition_query=True
        ))
pd.DataFrame(items)

In [None]:
# Parameterised multi-id lookup.
# `IN (@ids)` with a comma-joined string binds ONE string value, so r.id is
# compared against that whole string and nothing matches. Binding the ids as an
# array and testing membership with ARRAY_CONTAINS matches each id.
wanted_ids = ["images_000000000001", "images_000000000016"]
items = list(container_client.query_items(
            query="SELECT * FROM r WHERE ARRAY_CONTAINS(@ids, r.id)",
            parameters=[
                { "name":"@ids", "value": wanted_ids }
            ],
            enable_cross_partition_query=True
        ))
items

In [None]:
def bulk_insert(client: CosmosClient, csvFilePath,
                database_name="image-embeddings-db",
                container_name="clustered-meta-data"):
    """Upsert every row of a CSV file into a Cosmos DB container.

    Each row must provide the columns ``img_name``, ``url`` and ``encd``
    (``encd`` holding a JSON-encoded list). The document id is built as
    ``"images_<img_name>"``.

    Args:
        client: Cosmos client used to reach the database.
        csvFilePath: Path of the UTF-8 CSV file to load.
        database_name: Target database; defaults to the one used in this notebook.
        container_name: Target container; defaults to the one used in this notebook.

    Returns:
        None. Progress is printed per row. On the first
        ``CosmosHttpResponseError`` the failure is printed and the function
        returns without processing the remaining rows.
    """
    database_client = client.get_database_client(database_name)
    container_client = database_client.get_container_client(container_name)

    # The csv module expects files opened with newline='' so that line breaks
    # inside quoted fields are read back unchanged.
    with open(csvFilePath, encoding='utf-8', newline='') as csvf:
        # Each row comes back as a dict keyed by the header line.
        csvReader = csv.DictReader(csvf)

        for num, row in enumerate(csvReader, start=1):
            row["dataset"] = "images"
            # row["url"] = "https://cs5425images.blob.core.windows.net/test-images/{}".format(row["img_name"])
            row["encd"] = json.loads(row["encd"])
            row["id"] = "{}_{}".format(row["dataset"], row["img_name"])

            try:
                container_client.upsert_item(
                    {
                        'id': row["id"],
                        'dataset': row["dataset"],
                        'url': row['url'],
                        'encd': row['encd'],
                        'img_name': row['img_name']
                    }
                )
            except exceptions.CosmosHttpResponseError as e:
                print("Failed to insert {}, row number {}".format(row['id'], num))
                print(e)
                # Stop at the first rejected row and leave the remaining rows unprocessed.
                return
            else:
                print("Inserted {}, row number {}".format(row['id'], num))