In [None]:
# !pip install snowflake-connector-python neo4j spacy textblob gensim transformers umap-learn scipy networkx

In [10]:
import snowflake.connector
import json
import os
from dotenv import load_dotenv
import logging
import numpy as np
import pandas as pd
import umap
from neo4j import GraphDatabase
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import (calinski_harabasz_score, davies_bouldin_score,
                             silhouette_score)
from sklearn.preprocessing import StandardScaler
from scipy.stats import entropy
from tqdm import tqdm

# Send INFO-and-above log records to the root handler, then create the
# module-level logger used by the cells below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Pull settings from a .env file into the process environment; the
# credential cell further down reads them back with os.getenv.
load_dotenv()

True

In [13]:
class AdvancedMultiDimensionalClustering:
    def __init__(self, snowflake_creds, neo4j_creds):
        """Initialize the clustering system"""
        self.snowflake_conn = snowflake.connector.connect(**snowflake_creds)
        self.neo4j_driver = GraphDatabase.driver(
            neo4j_creds['uri'],
            auth=(neo4j_creds['user'], neo4j_creds['password'])
        )
        self.vector_dim = 4096
        self.initialize_configuration()

    def initialize_configuration(self):
        """Initialize clustering configuration"""
        self.clustering_config = {
            'kmeans': {
                'n_clusters_range': range(5, 51, 5),
                'random_state': 42
            },
            'hierarchical': {
                'n_clusters_range': range(5, 51, 5),
                'linkage': 'ward'
            },
            'dbscan': {
                'eps_range': [0.1, 0.3, 0.5, 0.7, 0.9],
                'min_samples_range': [5, 10, 15, 20]
            }
        }

    def fetch_data(self):
        """Fetch data from Snowflake, including ATTRACTION_NAME"""
        query = """
        WITH ReviewData AS (
            SELECT
                r.REVIEW_ID,
                r.ATTRACTION_ID,
                r.REVIEW_VECTOR,
                r.RATING,
                r.CITY,
                r.STATE,
                r.COUNTRY,
                a.ATTRACTION_NAME,
                a.PRIMARY_CATEGORY,
                a.CATEGORY_ARRAY,
                a.POPULARITY_TIER,
                a.RATING_TIER,
                a.WEIGHTED_RATING,
                CASE
                    WHEN r.RATING >= 4 THEN 'POSITIVE'
                    WHEN r.RATING <= 2 THEN 'NEGATIVE'
                    ELSE 'NEUTRAL'
                END as SENTIMENT
            FROM TRAVEL_GENIE.TRANSFORMED_DATA_TRANSFORMED.ATTRACTION_REVIEWS_VECTORIZED r
            JOIN TRAVEL_GENIE.TRANSFORMED_DATA_TRANSFORMED.ATTRACTIONS a
            ON r.ATTRACTION_ID = a.ATTRACTION_ID
            WHERE r.REVIEW_VECTOR IS NOT NULL
        )
        SELECT * FROM ReviewData
        """

        logger.info("Fetching data from Snowflake...")
        cursor = self.snowflake_conn.cursor()
        cursor.execute(query)

        columns = [desc[0].lower() for desc in cursor.description]
        data = cursor.fetchall()
        df = pd.DataFrame(data, columns=columns)

        logger.info(f"Fetched {len(df)} rows of data")
        return df

    def process_vectors(self, data):
        """Process and validate vectors"""
        logger.info("Processing review vectors...")
        vectors = []
        valid_indices = []
        errors = []

        for idx, vec in enumerate(tqdm(data['review_vector'])):
            try:
                if isinstance(vec, str):
                    cleaned_vec = vec.strip().replace('\n', '').replace(' ', '')
                    vector = np.array(json.loads(cleaned_vec))

                    if len(vector) == self.vector_dim:
                        vectors.append(vector)
                        valid_indices.append(idx)
                    else:
                        errors.append(f"Wrong dimension at index {idx}: {len(vector)}")
                else:
                    errors.append(f"Invalid type at index {idx}: {type(vec)}")
            except Exception as e:
                errors.append(f"Error at index {idx}: {str(e)}")

        logger.info(f"Processed {len(vectors)} valid vectors out of {len(data)}")
        if errors:
            logger.warning(f"Encountered {len(errors)} errors. First few: {errors[:5]}")

        return np.array(vectors), valid_indices

    def create_feature_matrices(self, data, valid_indices):
        """Create comprehensive feature matrices"""
        logger.info("Creating feature matrices...")
        features = {}
        valid_data = data.iloc[valid_indices].copy()

        # Basic features
        features['category'] = pd.get_dummies(valid_data['primary_category']).values
        features['location'] = pd.get_dummies(
            valid_data[['city', 'state', 'country']].fillna('UNKNOWN')
        ).values

        # Advanced rating features
        rating_features = []
        rating_features.append(valid_data['rating'].fillna(0))
        rating_features.append(valid_data['weighted_rating'].fillna(0))
        rating_features.append(pd.get_dummies(valid_data['rating_tier']).values.T)
        rating_features.append(pd.get_dummies(valid_data['popularity_tier']).values.T)
        features['rating'] = np.vstack(rating_features).T

        # Sentiment features
        features['sentiment'] = pd.get_dummies(valid_data['sentiment']).values

        # Location popularity features
        location_stats = valid_data.groupby(['city', 'state']).agg({
            'rating': ['mean', 'count']
        }).reset_index()
        location_stats.columns = ['city', 'state', 'loc_avg_rating', 'loc_review_count']
        valid_data = valid_data.merge(location_stats, on=['city', 'state'], how='left')

        features['location_stats'] = np.vstack([
            valid_data['loc_avg_rating'].fillna(0),
            np.log1p(valid_data['loc_review_count'].fillna(0))
        ]).T

        return features

    def reduce_dimensions(self, vectors):
        """Standardise the review vectors and project them with UMAP.

        Args:
            vectors: 2-D array with one review vector per row.

        Returns:
            The array produced by ``umap.UMAP(...).fit_transform`` on the
            standardised input (``n_components`` is set to 100).
        """
        logger.info("Reducing dimensions...")

        # Scale every input dimension before the projection.
        standardized = StandardScaler().fit_transform(vectors)

        # Fixed random_state so repeated runs use the same seed.
        umap_settings = {
            'n_components': 100,
            'n_neighbors': 30,
            'min_dist': 0.1,
            'metric': 'cosine',
            'random_state': 42,
        }
        embedding = umap.UMAP(**umap_settings).fit_transform(standardized)

        logger.info(f"Reduced dimensions from {vectors.shape[1]} to {embedding.shape[1]}")
        return embedding

    def combine_features(self, reduced_vectors, feature_matrices):
        """Combine all features with weights"""
        combined = np.hstack([
            reduced_vectors * 0.5,  # Base vectors
            StandardScaler().fit_transform(feature_matrices['category']) * 0.2,  # Categories
            StandardScaler().fit_transform(feature_matrices['rating']) * 0.15,  # Ratings
            StandardScaler().fit_transform(feature_matrices['location']) * 0.1,  # Location
            StandardScaler().fit_transform(feature_matrices['sentiment']) * 0.05  # Sentiment
        ])
        return combined

    def perform_clustering(self, vectors):
        """Enhanced multi-method clustering with debugging"""
        logger.info(f"Starting clustering with vector shape: {vectors.shape}")
        results = {}
        scores = {}

        # Adjust clustering parameters based on data size
        n_samples = vectors.shape[0]
        logger.info(f"Number of samples: {n_samples}")

        # Dynamically set cluster ranges
        min_clusters = max(2, int(n_samples/100))  # At least 2 clusters
        max_clusters = min(int(n_samples/10), 50)  # No more than 50 or n_samples/10
        step_size = max(1, int((max_clusters - min_clusters)/5))

        self.clustering_config = {
            'kmeans': {
                'n_clusters_range': range(min_clusters, max_clusters + 1, step_size),
                'random_state': 42
            },
            'hierarchical': {
                'n_clusters_range': range(min_clusters, max_clusters + 1, step_size),
                'linkage': 'ward'
            }
        }

        logger.info(f"Cluster range: {min_clusters} to {max_clusters}, step size: {step_size}")

        # K-means clustering
        try:
            logger.info("Attempting K-means clustering...")
            best_kmeans = None
            best_kmeans_metrics = {}
            best_kmeans_score = float('-inf')

            for n_clusters in self.clustering_config['kmeans']['n_clusters_range']:
                logger.info(f"Trying K-means with {n_clusters} clusters")

                kmeans = KMeans(
                    n_clusters=n_clusters,
                    random_state=42,
                    n_init=10,
                    max_iter=300
                )

                labels = kmeans.fit_predict(vectors)

                # Ensure we have at least 2 clusters
                unique_labels = np.unique(labels)
                if len(unique_labels) < 2:
                    logger.warning(f"K-means produced only {len(unique_labels)} clusters")
                    continue

                try:
                    # Compute multiple metrics
                    silhouette = silhouette_score(vectors, labels)
                    calinski_harabasz = calinski_harabasz_score(vectors, labels)
                    davies_bouldin = davies_bouldin_score(vectors, labels)
                    logger.info(f"K-means with {n_clusters} clusters: silhouette = {silhouette}, calinski_harabasz = {calinski_harabasz}, davies_bouldin = {davies_bouldin}")

                    # Use silhouette score as the main criterion for best clustering
                    if silhouette > best_kmeans_score:
                        best_kmeans = labels
                        best_kmeans_score = silhouette
                        best_kmeans_metrics = {
                            'silhouette_score': silhouette,
                            'calinski_harabasz_score': calinski_harabasz,
                            'davies_bouldin_score': davies_bouldin
                        }
                except Exception as e:
                    logger.warning(f"Error calculating metrics for K-means: {str(e)}")
                    continue

            if best_kmeans is not None:
                results['kmeans'] = best_kmeans
                scores['kmeans'] = best_kmeans_metrics
                logger.info(f"K-means clustering successful with silhouette score: {best_kmeans_score}")

        except Exception as e:
            logger.error(f"Error in K-means clustering: {str(e)}")

        # Hierarchical clustering
        try:
            logger.info("Attempting hierarchical clustering...")
            best_hierarchical = None
            best_hierarchical_metrics = {}
            best_hierarchical_score = float('-inf')

            for n_clusters in self.clustering_config['hierarchical']['n_clusters_range']:
                logger.info(f"Trying hierarchical with {n_clusters} clusters")

                hierarchical = AgglomerativeClustering(
                    n_clusters=n_clusters,
                    linkage='ward'
                )

                labels = hierarchical.fit_predict(vectors)

                # Ensure we have at least 2 clusters
                unique_labels = np.unique(labels)
                if len(unique_labels) < 2:
                    logger.warning(f"Hierarchical produced only {len(unique_labels)} clusters")
                    continue

                try:
                    # Compute multiple metrics
                    silhouette = silhouette_score(vectors, labels)
                    calinski_harabasz = calinski_harabasz_score(vectors, labels)
                    davies_bouldin = davies_bouldin_score(vectors, labels)
                    logger.info(f"Hierarchical with {n_clusters} clusters: silhouette = {silhouette}, calinski_harabasz = {calinski_harabasz}, davies_bouldin = {davies_bouldin}")

                    # Use silhouette score as the main criterion for best clustering
                    if silhouette > best_hierarchical_score:
                        best_hierarchical = labels
                        best_hierarchical_score = silhouette
                        best_hierarchical_metrics = {
                            'silhouette_score': silhouette,
                            'calinski_harabasz_score': calinski_harabasz,
                            'davies_bouldin_score': davies_bouldin
                        }
                except Exception as e:
                    logger.warning(f"Error calculating metrics for hierarchical: {str(e)}")
                    continue

            if best_hierarchical is not None:
                results['hierarchical'] = best_hierarchical
                scores['hierarchical'] = best_hierarchical_metrics
                logger.info(f"Hierarchical clustering successful with silhouette score: {best_hierarchical_score}")

        except Exception as e:
            logger.error(f"Error in hierarchical clustering: {str(e)}")

        # If both methods fail, try DBSCAN as fallback
        if not results:
            try:
                logger.info("Attempting DBSCAN as fallback...")
                dbscan = DBSCAN(
                    eps=0.5,
                    min_samples=5,
                    metric='euclidean'
                )
                labels = dbscan.fit_predict(vectors)

                # Check if DBSCAN produced valid clusters
                unique_labels = np.unique(labels[labels >= 0])
                if len(unique_labels) >= 2:
                    results['dbscan'] = labels
                    try:
                        # Compute multiple metrics
                        silhouette = silhouette_score(vectors, labels, metric='euclidean')
                        calinski_harabasz = calinski_harabasz_score(vectors, labels)
                        davies_bouldin = davies_bouldin_score(vectors, labels)
                        scores['dbscan'] = {
                            'silhouette_score': silhouette,
                            'calinski_harabasz_score': calinski_harabasz,
                            'davies_bouldin_score': davies_bouldin
                        }
                        logger.info("DBSCAN clustering successful")
                    except Exception as e:
                        logger.warning(f"Error calculating metrics for DBSCAN: {str(e)}")
                        scores['dbscan'] = {
                            'silhouette_score': None,
                            'calinski_harabasz_score': None,
                            'davies_bouldin_score': None
                        }
                else:
                    logger.warning("DBSCAN failed to produce valid clusters")

            except Exception as e:
                logger.error(f"Error in DBSCAN clustering: {str(e)}")

        # Final check
        if not results:
            # If all methods fail, create simple binary clustering using median split
            logger.warning("All clustering methods failed, using median split...")
            try:
                # Use first principal component for splitting
                pca = PCA(n_components=1)
                projected = pca.fit_transform(vectors)
                median = np.median(projected)
                labels = (projected > median).astype(int).flatten()

                results['binary'] = labels
                scores['binary'] = {
                    'silhouette_score': None,
                    'calinski_harabasz_score': None,
                    'davies_bouldin_score': None
                }  # No real score for this method
                logger.info("Created binary split as last resort")
            except Exception as e:
                logger.error(f"Error creating binary split: {str(e)}")
                raise ValueError("No clustering method succeeded")

        logger.info(f"Clustering completed with methods: {list(results.keys())}")
        return results, scores

    def analyze_clusters(self, clustering_results, data, valid_indices):
        """Enhanced cluster analysis with comprehensive metrics"""
        logger.info("Analyzing clusters...")
        analysis = {}

        # Get valid data
        valid_data = data.iloc[valid_indices].reset_index(drop=True)

        for method, labels in clustering_results.items():
            try:
                logger.info(f"Analyzing {method} clusters...")
                clusters_analysis = {}

                # Get unique valid labels (excluding noise points marked as -1)
                valid_labels = set(labels[labels >= 0])
                logger.info(f"Found {len(valid_labels)} valid clusters for {method}")

                for label in valid_labels:
                    # Get data for this cluster
                    mask = labels == label
                    cluster_data = valid_data[mask]

                    if len(cluster_data) == 0:
                        logger.warning(f"Empty cluster found for {method}, label {label}")
                        continue

                    # Initialize cluster info dictionary
                    cluster_info = {}

                    # Basic cluster statistics
                    cluster_info['size'] = int(len(cluster_data))

                    # Rating analysis
                    cluster_info['rating_analysis'] = {
                        'mean': float(cluster_data['rating'].mean()),
                        'median': float(cluster_data['rating'].median()),
                        'std': float(cluster_data['rating'].std()),
                        'min': float(cluster_data['rating'].min()),
                        'max': float(cluster_data['rating'].max()),
                        'weighted_avg': float(cluster_data['weighted_rating'].mean())
                    }

                    # Category distribution
                    primary_category_counts = cluster_data['primary_category'].value_counts()
                    cluster_info['category_analysis'] = {
                        'primary_categories': primary_category_counts.to_dict(),
                        'category_entropy': float(entropy(primary_category_counts.values)),
                        'dominant_category': primary_category_counts.idxmax()
                    }

                    # Sentiment distribution
                    sentiment_counts = cluster_data['sentiment'].value_counts()
                    cluster_info['sentiment_analysis'] = {
                        'distribution': sentiment_counts.to_dict(),
                        'sentiment_entropy': float(entropy(sentiment_counts.values)),
                        'dominant_sentiment': sentiment_counts.idxmax()
                    }

                    # Location analysis
                    cluster_info['location_analysis'] = {
                        'cities': {
                            'distribution': cluster_data['city'].value_counts().head(10).to_dict(),
                            'unique_count': int(cluster_data['city'].nunique()),
                            'top_city': cluster_data['city'].mode().iloc[0]
                        },
                        'states': {
                            'distribution': cluster_data['state'].value_counts().head(10).to_dict(),
                            'unique_count': int(cluster_data['state'].nunique()),
                            'top_state': cluster_data['state'].mode().iloc[0]
                        },
                        'countries': {
                            'distribution': cluster_data['country'].value_counts().head(5).to_dict(),
                            'unique_count': int(cluster_data['country'].nunique()),
                            'top_country': cluster_data['country'].mode().iloc[0]
                        }
                    }

                    # Popularity metrics
                    popularity_tier_counts = cluster_data['popularity_tier'].value_counts()
                    cluster_info['popularity_analysis'] = {
                        'tier_distribution': popularity_tier_counts.to_dict(),
                        'dominant_tier': popularity_tier_counts.idxmax(),
                        'avg_weighted_rating': float(cluster_data['weighted_rating'].mean())
                    }

                    # Attraction analysis (Include ATTRACTION_NAME)
                    attraction_counts = cluster_data['attraction_name'].value_counts()
                    cluster_info['attraction_analysis'] = {
                        'attractions': attraction_counts.to_dict(),
                        'unique_count': int(cluster_data['attraction_name'].nunique()),
                        'top_attractions': attraction_counts.head(10).to_dict()
                    }

                    # Cluster quality metrics
                    cluster_info['quality_metrics'] = {
                        'category_cohesion': float(1 - entropy(primary_category_counts.values / primary_category_counts.values.sum())),
                        'sentiment_cohesion': float(1 - entropy(sentiment_counts.values / sentiment_counts.values.sum())),
                        'rating_consistency': float(1 / (1 + cluster_data['rating'].std())),
                        'location_diversity': float(entropy(cluster_data['city'].value_counts(normalize=True).values))
                    }

                    # Calculate overall cluster quality score
                    quality_metrics = cluster_info['quality_metrics']
                    cluster_info['overall_quality_score'] = float(np.mean([
                        quality_metrics['category_cohesion'],
                        quality_metrics['sentiment_cohesion'],
                        quality_metrics['rating_consistency'],
                        1 - (quality_metrics['location_diversity'] / np.log(max(2, cluster_data['city'].nunique())))
                    ]))

                    # Assign cluster info to clusters_analysis
                    clusters_analysis[label] = cluster_info

                if clusters_analysis:
                    # Compute summary statistics
                    sizes = [info['size'] for info in clusters_analysis.values()]
                    quality_scores = [info['overall_quality_score'] for info in clusters_analysis.values()]
                    analysis[method] = {
                        'clusters': clusters_analysis,
                        'summary': {
                            'total_clusters': len(clusters_analysis),
                            'total_points': sum(sizes),
                            'avg_cluster_size': float(np.mean(sizes)),
                            'std_cluster_size': float(np.std(sizes)),
                            'min_cluster_size': min(sizes),
                            'max_cluster_size': max(sizes),
                            'avg_quality_score': float(np.mean(quality_scores))
                        }
                    }
                    logger.info(f"Completed analysis for {method} clustering")
                else:
                    logger.warning(f"No valid clusters found for {method}")

            except Exception as e:
                logger.error(f"Error analyzing {method} clusters: {str(e)}")
                continue

        if not analysis:
            raise ValueError("No clusters could be analyzed")

        logger.info("Cluster analysis completed successfully")
        return analysis

    def calculate_cluster_similarity(self, cluster1_info, cluster2_info):
        """Calculate similarity between two clusters using cosine similarity"""
        # Extract centroid vectors
        centroid1 = cluster1_info['centroid_vector']
        centroid2 = cluster2_info['centroid_vector']

        # Compute cosine similarity
        dot_product = np.dot(centroid1, centroid2)
        norm1 = np.linalg.norm(centroid1)
        norm2 = np.linalg.norm(centroid2)
        if norm1 == 0 or norm2 == 0:
            return 0.0
        similarity = dot_product / (norm1 * norm2)
        return similarity

    def update_cluster_centroids(self, clustering_results, combined_features):
        """Compute and store centroid vectors for each cluster"""
        logger.info("Computing cluster centroids...")
        centroids = {}
        for method, labels in clustering_results.items():
            centroids[method] = {}
            unique_labels = set(labels[labels >= 0])
            for label in unique_labels:
                mask = labels == label
                cluster_vectors = combined_features[mask]
                centroid = np.mean(cluster_vectors, axis=0)
                centroids[method][label] = centroid
        return centroids

    def clean_neo4j(self):
        """Clean existing cluster data from Neo4j"""
        logger.info("Cleaning existing cluster data...")
        with self.neo4j_driver.session() as session:
            session.run("MATCH ()-[r:IN_CLUSTER]->() DELETE r")
            session.run("MATCH ()-[r:RELATED_TO]->() DELETE r")
            session.run("MATCH (c:Cluster) DELETE c")
            session.run("MATCH (a:Attraction) DELETE a")
            logger.info("Existing cluster data cleaned")

    def update_neo4j(self, clustering_results, analysis, data, valid_indices):
        """Update Neo4j with clustering results, including attractions"""
        import json  # Ensure this is imported
        logger.info("Updating Neo4j with results...")
        self.clean_neo4j()

        valid_data = data.iloc[valid_indices].reset_index(drop=True)

        def convert_numpy_types(obj):
            if isinstance(obj, (np.integer, np.int32, np.int64)):
                return int(obj)
            elif isinstance(obj, (np.floating, np.float32, np.float64)):
                return float(obj)
            elif isinstance(obj, np.ndarray):
                return obj.tolist()
            elif isinstance(obj, dict):
                return {k: convert_numpy_types(v) for k, v in obj.items()}
            elif isinstance(obj, list):
                return [convert_numpy_types(v) for v in obj]
            else:
                return obj

        with self.neo4j_driver.session() as session:
            # Create cluster nodes
            for method, labels in clustering_results.items():
                for label in set(labels[labels >= 0]):
                    cluster_info = analysis[method]['clusters'][label]

                    session.run("""
                    CREATE (c:Cluster {
                        cluster_id: $cluster_id,
                        method: $method,
                        size: $size,
                        avg_rating: $avg_rating,
                        categories: $categories,
                        sentiments: $sentiment_dist,
                        locations: $locations,
                        metrics: $metrics,
                        popularity: $popularity,
                        attractions: $attractions,
                        created_at: datetime()
                    })
                    """,
                    cluster_id=f"{method}_{label}",
                    method=method,
                    size=int(cluster_info['size']),
                    avg_rating=float(cluster_info['rating_analysis']['mean']),
                    categories=json.dumps(convert_numpy_types(cluster_info['category_analysis']['primary_categories'])),
                    sentiment_dist=json.dumps(convert_numpy_types(cluster_info['sentiment_analysis']['distribution'])),
                    locations=json.dumps(convert_numpy_types(cluster_info['location_analysis'])),
                    metrics=json.dumps(convert_numpy_types(cluster_info['quality_metrics'])),
                    popularity=json.dumps(convert_numpy_types(cluster_info['popularity_analysis'])),
                    attractions=json.dumps(convert_numpy_types(cluster_info['attraction_analysis']['top_attractions'])),
                    )

            # Create attraction nodes and relationships
            logger.info("Creating attraction nodes and relationships...")
            for idx, row in valid_data.iterrows():
                attraction_id = row['attraction_id']
                attraction_name = row['attraction_name']
                cluster_labels = {}
                for method, labels in clustering_results.items():
                    label = labels[idx]
                    if label >= 0:
                        cluster_id = f"{method}_{label}"
                        cluster_labels[method] = cluster_id

                        # Create attraction node if not exists
                        session.run("""
                        MERGE (a:Attraction {attraction_id: $attraction_id})
                        SET a.name = $attraction_name
                        """,
                        attraction_id=attraction_id,
                        attraction_name=attraction_name
                        )

                        # Create relationship to cluster
                        session.run("""
                        MATCH (a:Attraction {attraction_id: $attraction_id})
                        MATCH (c:Cluster {cluster_id: $cluster_id})
                        MERGE (a)-[:IN_CLUSTER {
                            method: $method,
                            created_at: datetime()
                        }]->(c)
                        """,
                        attraction_id=attraction_id,
                        cluster_id=cluster_id,
                        method=method
                        )

    def create_cluster_relationships(self, session, clustering_results, analysis):
        """Create relationships between clusters using cosine similarity"""
        logger.info("Creating cluster relationships...")

        for method, labels in clustering_results.items():
            unique_labels = set(labels[labels >= 0])

            clusters_info = analysis[method]['clusters']
            for label1 in unique_labels:
                cluster1_info = clusters_info[label1]

                for label2 in unique_labels:
                    if label1 < label2:  # Only process each pair once
                        cluster2_info = clusters_info[label2]

                        similarity = self.calculate_cluster_similarity(
                            cluster1_info,
                            cluster2_info
                        )

                        if similarity > 0.3:
                            session.run("""
                            MATCH (c1:Cluster {cluster_id: $id1})
                            MATCH (c2:Cluster {cluster_id: $id2})
                            CREATE (c1)-[:RELATED_TO {
                                similarity: $similarity,
                                method: $method,
                                created_at: datetime()
                            }]->(c2)
                            """,
                            id1=f"{method}_{label1}",
                            id2=f"{method}_{label2}",
                            similarity=float(similarity),
                            method=method
                            )

    def process_and_cluster(self):
        """Run the whole pipeline end to end and return a summary.

        The steps run in this order: ``fetch_data`` -> ``process_vectors`` ->
        ``create_feature_matrices`` -> ``reduce_dimensions`` ->
        ``combine_features`` -> ``perform_clustering`` ->
        ``update_cluster_centroids`` -> ``analyze_clusters`` ->
        ``update_neo4j`` -> ``create_cluster_relationships``.

        Returns:
            dict: with the keys
                * ``clustering_results`` - first value returned by
                  ``perform_clustering``.
                * ``scores`` - second value returned by ``perform_clustering``.
                * ``analysis`` - result of ``analyze_clusters``; every entry
                  under ``analysis[method]['clusters'][label]`` additionally
                  gets a ``'centroid_vector'`` key taken from
                  ``update_cluster_centroids``.
                * ``stats`` - vector counts and the original / reduced / final
                  feature dimensions.

        Raises:
            Exception: any error raised by a pipeline step is logged and
                re-raised unchanged. An error raised while closing a
                connection also propagates.

        Note:
            Both the Snowflake connection and the Neo4j driver are closed when
            this method exits (success or failure), so a second call on the
            same instance would run against closed connections.
        """
        try:
            # Fetch and process data
            data = self.fetch_data()
            vectors, valid_indices = self.process_vectors(data)

            # Feature engineering
            features = self.create_feature_matrices(data, valid_indices)
            reduced_vectors = self.reduce_dimensions(vectors)
            combined_features = self.combine_features(reduced_vectors, features)

            # Clustering and analysis
            clustering_results, scores = self.perform_clustering(combined_features)

            # Update cluster centroids
            centroids = self.update_cluster_centroids(clustering_results, combined_features)

            # Incorporate centroids into analysis: attach each cluster's
            # centroid to its analysis record (mutates the records in place).
            analysis = self.analyze_clusters(clustering_results, data, valid_indices)
            for method, method_analysis in analysis.items():
                for label, cluster_info in method_analysis['clusters'].items():
                    cluster_info['centroid_vector'] = centroids[method][label]

            # Neo4j updates
            self.update_neo4j(clustering_results, analysis, data, valid_indices)

            # Create relationships in Neo4j
            with self.neo4j_driver.session() as session:
                self.create_cluster_relationships(session, clustering_results, analysis)

            return {
                'clustering_results': clustering_results,
                'scores': scores,
                'analysis': analysis,
                'stats': {
                    # NOTE(review): this is the length of what process_vectors
                    # returned, not len(data) - confirm it is the intended
                    # "total" (it may always equal valid_vectors).
                    'total_vectors': len(vectors),
                    'valid_vectors': len(valid_indices),
                    'dimensions': {
                        'original': self.vector_dim,
                        'reduced': reduced_vectors.shape[1],
                        'final': combined_features.shape[1]
                    }
                }
            }

        except Exception as e:
            logger.error(f"Error during clustering: {str(e)}")
            raise
        finally:
            # Nested try/finally: the Neo4j driver must be closed even when
            # closing the Snowflake connection raises; otherwise it would leak.
            try:
                self.snowflake_conn.close()
            finally:
                self.neo4j_driver.close()

In [14]:
# Set up credentials. Every value is read from the environment (load_dotenv()
# was called in the imports cell), so no secret is hard-coded in the notebook.
snowflake_creds = {
    'user': os.getenv('SNOWFLAKE_USER'),
    'password': os.getenv('SNOWFLAKE_PASSWORD'),
    'account': os.getenv('SNOWFLAKE_ACCOUNT'),
    'warehouse': os.getenv('SNOWFLAKE_WAREHOUSE'),
    'database': os.getenv('SNOWFLAKE_DATABASE'),
    'schema': os.getenv('SNOWFLAKE_SCHEMA')
}

neo4j_creds = {
    'uri': os.getenv('NEO4J_URI'),
    'user': os.getenv('NEO4J_USERNAME'),
    'password': os.getenv('NEO4J_PASSWORD')
}

# Initialize and run clustering. process_and_cluster() closes both connections
# when it finishes, so re-running this cell builds a fresh instance.
clustering = AdvancedMultiDimensionalClustering(snowflake_creds, neo4j_creds)
results = clustering.process_and_cluster()

# Print results
print("=== Clustering Results Summary ===")
print(f"Total vectors processed: {results['stats']['total_vectors']}")
print(f"Valid vectors used: {results['stats']['valid_vectors']}")

# Clustering Scores
for method, metrics in results['scores'].items():
    print(f"\n{method.upper()} Clustering Scores:")
    for metric_name, metric_value in metrics.items():
        if metric_value is not None:
            print(f"{metric_name}: {metric_value:.4f}")
        else:
            print(f"{metric_name}: Not computed")

# Cluster Sizes
print("\nCluster Sizes:")
for method, method_info in results['analysis'].items():
    print(f"\n{method.upper()}:")
    clusters = method_info['clusters']
    sizes = [info['size'] for info in clusters.values()]
    print(f"Number of clusters: {len(clusters)}")
    # Guard the average: a method that produced no clusters would otherwise
    # raise ZeroDivisionError here.
    if sizes:
        print(f"Average cluster size: {sum(sizes)/len(sizes):.2f}")
    else:
        print("Average cluster size: n/a (no clusters)")


INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.12.3, Python Version: 3.12.7, Platform: Windows-11-10.0.22631-SP0
INFO:snowflake.connector.connection:Connecting to GLOBAL Snowflake domain
INFO:snowflake.connector.connection:This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.
INFO:__main__:Fetching data from Snowflake...
INFO:snowflake.connector.cursor:Number of results in first chunk: 7
INFO:__main__:Fetched 5635 rows of data
INFO:__main__:Processing review vectors...
100%|██████████| 5635/5635 [00:08<00:00, 694.60it/s]
INFO:__main__:Processed 5635 valid vectors out of 5635
INFO:__main__:Creating feature matrices...
INFO:__main__:Reducing dimensions...
  warn(
INFO:__main__:Reduced dimensions from 4096 to 100
INFO:__main__:Starting clustering with vector shape: (5635, 142)

=== Clustering Results Summary ===
Total vectors processed: 5635
Valid vectors used: 5635

DBSCAN Clustering Scores:
silhouette_score: 0.3031
calinski_harabasz_score: 214.2646
davies_bouldin_score: 1.2695

Cluster Sizes:

DBSCAN:
Number of clusters: 118
Average cluster size: 44.12
