In [10]:
import json
import sys
import traceback
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import text

# The project-local log_pipeline module lives outside the default import path,
# so its directory must be registered before the import below.
sys.path.append('/Users/sdedeoglu/Desktop/python/raw_logs')
from log_pipeline import ParquetReader, ValidatorProcessor, UpsertWriter

In [11]:
class DataQualityChecker:
    """Run SQL-based data quality checks and collect their failures.

    Every check prints its outcome and returns a bool instead of raising:
    ``True`` when the check passed, ``False`` when it failed or could not be
    executed. Each failure also appends a message to ``self.errors``.

    Table and column names are interpolated into the SQL text as identifiers,
    so they must come from trusted code, never from user input. Range bounds
    are sent as bound parameters.
    """

    def __init__(self, engine):
        """Store the engine used by all checks.

        Args:
            engine: Object whose ``connect()`` returns a context-managed
                connection (used here as a SQLAlchemy engine).
        """
        self.engine = engine
        # Failure messages accumulated by every check run on this instance.
        self.errors = []

    def _fail(self, error_msg: str) -> bool:
        """Record and print a failure message; always returns False."""
        self.errors.append(error_msg)
        print(error_msg)
        return False

    def _fetch_count(self, query: str, params=None) -> int:
        """Execute a single-value query and return the first column of its first row."""
        with self.engine.connect() as conn:
            statement = text(query)
            result = conn.execute(statement, params) if params else conn.execute(statement)
            return result.fetchone()[0]

    @staticmethod
    def _to_bind_value(value):
        """Convert numpy scalars to plain Python values so they can be bound as parameters."""
        return value.item() if isinstance(value, np.generic) else value

    def check_null_values(self, table_name: str, critical_columns: List[str]) -> bool:
        """Check that no row has a NULL in any of the critical columns.

        Args:
            table_name: Table to inspect.
            critical_columns: Columns that must never be NULL.

        Returns:
            True when no row has a NULL in any listed column; False when such
            rows exist, when no columns were given, or when the query failed.
        """
        try:
            if not critical_columns:
                # An empty list would otherwise produce a dangling "WHERE".
                raise ValueError("no critical columns were given")

            null_conditions = " OR ".join(f"{col} IS NULL" for col in critical_columns)
            query = f"SELECT COUNT(*) as null_count FROM {table_name} WHERE {null_conditions}"
            null_count = self._fetch_count(query)
        except Exception as e:
            return self._fail(f" Error checking nulls in {table_name}: {str(e)}")

        if null_count > 0:
            # null_count is the number of rows matching the OR-ed conditions.
            return self._fail(
                f" {table_name}: Found {null_count} null values in critical columns {critical_columns}"
            )

        print(f" {table_name}: No null values in critical columns")
        return True

    def check_duplicates(self, table_name: str, unique_columns: List[str]) -> bool:
        """Check that no combination of the unique columns occurs more than once.

        Args:
            table_name: Table to inspect.
            unique_columns: Columns that together must identify a single row.

        Returns:
            True when no duplicated key was found; False when duplicates exist,
            when no columns were given, or when the query failed.
        """
        try:
            if not unique_columns:
                # An empty list would otherwise produce "GROUP BY" with no columns.
                raise ValueError("no unique columns were given")

            columns_str = ", ".join(unique_columns)
            # LIMIT 10 keeps the sample small, so the reported count is capped
            # at ten duplicated key groups.
            query = f"""
            SELECT {columns_str}, COUNT(*) as cnt 
            FROM {table_name} 
            GROUP BY {columns_str}
            HAVING COUNT(*) > 1
            LIMIT 10
            """

            with self.engine.connect() as conn:
                # Wrapped in text() like every other query in this class.
                df = pd.read_sql(text(query), conn)
        except Exception as e:
            return self._fail(f" Error checking duplicates in {table_name}: {str(e)}")

        if not df.empty:
            self._fail(f" {table_name}: Found {len(df)} duplicate records")
            print(f"Sample duplicates:\n{df.head()}")
            return False

        print(f"{table_name}: No duplicate records found")
        return True

    def check_data_freshness(self, table_name: str, date_column: str = "updated_Date") -> bool:
        """Check that the table holds at least one row dated today.

        "Today" is evaluated by the database through ``CURDATE()``.

        Args:
            table_name: Table to inspect.
            date_column: Date/datetime column compared against ``CURDATE()``.

        Returns:
            True when at least one row is dated today; False when none is or
            when the query failed.
        """
        try:
            query = f"""
            SELECT COUNT(*) as today_count 
            FROM {table_name} 
            WHERE DATE({date_column}) = CURDATE()
            """
            today_count = self._fetch_count(query)
        except Exception as e:
            return self._fail(f"Error checking freshness in {table_name}: {str(e)}")

        if today_count == 0:
            return self._fail(f"{table_name}: No fresh data found for today")

        print(f" {table_name}: Found {today_count} fresh records for today")
        return True

    def check_schema_validation(self, df: pd.DataFrame, required_columns: List[str]) -> bool:
        """Check that a DataFrame contains every required column.

        Args:
            df: Frame to validate.
            required_columns: Column names that must be present in ``df``.

        Returns:
            True when all required columns are present, False otherwise.
        """
        try:
            missing_cols = [col for col in required_columns if col not in df.columns]
        except Exception as e:
            return self._fail(f" Schema validation error: {str(e)}")

        if missing_cols:
            return self._fail(f" Schema validation failed: Missing columns {missing_cols}")

        print(f" Schema validation passed: All required columns present")
        return True

    def check_data_ranges(self, table_name: str, column_ranges: Dict[str, Dict]) -> bool:
        """Check that column values stay within their configured bounds.

        Args:
            table_name: Table to inspect.
            column_ranges: Maps a column name to a dict with optional ``'min'``
                and ``'max'`` keys. A missing or ``None`` bound is not checked;
                a column with neither bound is skipped.

        Returns:
            True when every checked column has no out-of-range value; False
            when any column has one or when a query failed.
        """
        try:
            all_passed = True

            for column, ranges in column_ranges.items():
                min_val = ranges.get('min')
                max_val = ranges.get('max')

                # Bounds travel as bound parameters rather than SQL text, so
                # date or string bounds are quoted correctly by the driver.
                conditions = []
                params = {}
                if min_val is not None:
                    conditions.append(f"{column} < :min_val")
                    params["min_val"] = self._to_bind_value(min_val)
                if max_val is not None:
                    conditions.append(f"{column} > :max_val")
                    params["max_val"] = self._to_bind_value(max_val)

                if not conditions:
                    continue

                query = f"SELECT COUNT(*) as invalid_count FROM {table_name} WHERE {' OR '.join(conditions)}"
                invalid_count = self._fetch_count(query, params)

                if invalid_count > 0:
                    self._fail(
                        f" {table_name}.{column}: Found {invalid_count} values outside range [{min_val}, {max_val}]"
                    )
                    all_passed = False
                else:
                    print(f" {table_name}.{column}: All values within valid range")

            return all_passed

        except Exception as e:
            return self._fail(f" Error checking ranges in {table_name}: {str(e)}")

    def run_all_checks(self) -> bool:
        """Run the fixed set of checks for every table and print a summary.

        Returns:
            True only when every individual check passed.
        """
        print("🔍 Starting Data Quality Checks...")
        print("=" * 50)

        # Errors recorded before this call (earlier runs, standalone checks)
        # stay in self.errors but are excluded from this run's summary.
        first_new_error = len(self.errors)

        # Built as a list so every check runs even after an earlier one fails.
        checks = [
            # Users table checks
            self.check_null_values("users", ["user_id"]),
            self.check_duplicates("users", ["user_id"]),
            self.check_data_freshness("users"),

            # Sessions table checks
            self.check_null_values("sessions", ["session_id", "user_id"]),
            self.check_duplicates("sessions", ["session_id"]),
            self.check_data_freshness("sessions"),

            # Events table checks
            self.check_null_values("events", ["request_id", "session_id", "hotel_id", "funnel_id"]),
            self.check_duplicates("events", ["request_id", "session_id", "hotel_id", "funnel_id"]),
            self.check_data_freshness("events"),

            # Hotels table checks
            self.check_null_values("hotels", ["hotel_id"]),
            self.check_duplicates("hotels", ["hotel_id"]),
            self.check_data_ranges("hotels", {"hotel_price": {"min": 0, "max": 50000}}),

            # Payments table checks
            self.check_null_values("payments", ["request_id"]),
            self.check_duplicates("payments", ["request_id"]),
        ]

        all_passed = all(checks)
        new_errors = self.errors[first_new_error:]

        print("=" * 50)
        if all_passed:
            print("ALL DATA QUALITY CHECKS PASSED!")
        else:
            print(f" {len(new_errors)} DATA QUALITY ISSUES FOUND:")
            for error in new_errors:
                print(f"  • {error}")

        return all_passed

    def get_report(self) -> Dict[str, Any]:
        """Return a summary of every error recorded on this instance so far.

        Returns:
            Dict with ``timestamp`` (ISO-8601 string of the local time),
            ``total_errors``, ``errors`` (a copy of the recorded messages) and
            ``status`` (``"PASSED"`` when no error was recorded, else ``"FAILED"``).
        """
        # Copy the list so later checks cannot alter a report already handed out.
        errors = list(self.errors)
        return {
            "timestamp": datetime.now().isoformat(),
            "total_errors": len(errors),
            "errors": errors,
            "status": "PASSED" if not errors else "FAILED"
        }

In [12]:
class LogPipelineWithQuality:
    """Load raw logs, write the derived tables to the database, then run quality checks.

    The pipeline reads a frame through ``ParquetReader``, splits it into the
    users / sessions / events / hotels / payments tables with
    ``ValidatorProcessor``, writes each with ``UpsertWriter`` and finally runs
    ``DataQualityChecker.run_all_checks`` against the database.
    """

    # Columns every raw row must carry; rows missing any of them are dropped.
    REQUIRED_COLUMNS = ("user_id", "session_id", "hotel_id", "request_id", "funnel_id")

    def __init__(self, parquet_path: str, engine):
        """Wire up the reader, validator, writer and quality checker.

        Args:
            parquet_path: Path handed to ``ParquetReader``.
            engine: Database engine shared by the writer and the quality checker.
        """
        self.reader = ParquetReader(parquet_path)
        self.validator = ValidatorProcessor()
        self.writer = UpsertWriter(engine)
        self.quality_checker = DataQualityChecker(engine)
        # Stamp written into the updated_Date column of every non-empty table.
        # It is taken once, at construction time.
        self.now = datetime.now()

    def clean_nan_values(self, df):
        """Return a copy of ``df`` with missing values replaced by ``None``.

        The input frame is not modified.

        Args:
            df: Frame to clean.

        Returns:
            A new DataFrame in which missing entries are ``None`` where the
            column dtype can hold it.
        """
        df_cleaned = df.copy()

        # float32/float64 columns get an explicit NaN -> None replacement first.
        for col in df_cleaned.columns:
            if df_cleaned[col].dtype in ['float64', 'float32']:
                df_cleaned[col] = df_cleaned[col].replace({np.nan: None})

        # Second pass covers the remaining columns (e.g. object dtype).
        df_cleaned = df_cleaned.where(pd.notnull(df_cleaned), None)

        return df_cleaned

    def run_once(self):
        """Run the pipeline end to end once.

        Returns:
            The result of ``run_all_checks()``: True when every post-write
            quality check passed.

        Raises:
            ValueError: If the raw frame lacks a required column.
            Exception: Any error raised while writing is printed and re-raised.
        """
        print("Starting Log Pipeline with Data Quality...")
        required_columns = list(self.REQUIRED_COLUMNS)

        # Read the raw data.
        raw_df = self.reader.read_all()

        # Validate the schema before touching the required columns; dropping
        # rows first would raise KeyError when one of them is missing.
        if not self.quality_checker.check_schema_validation(raw_df, required_columns):
            raise ValueError("Schema validation failed!")

        # Non-inplace drop, so the frame owned by the reader is left untouched.
        df = raw_df.dropna(subset=required_columns)

        # Validate and split into the per-table frames.
        users, sessions, events, hotels, payments = self.validator.validate(df)
        tables = {
            "users": users,
            "sessions": sessions,
            "events": events,
            "hotels": hotels,
            "payments": payments,
        }

        # Stamp every non-empty table; empty ones are passed through unchanged.
        stamped = {
            table_name: table if table.empty else table.assign(updated_Date=self.now)
            for table_name, table in tables.items()
        }

        # Write to the database, converting missing values to None on the way.
        print("\n Writing to db")
        try:
            for table_name, table in stamped.items():
                self.writer.upsert_df(self.clean_nan_values(table), table_name)
        except Exception as e:
            print(f" Database write error: {str(e)}")
            raise

        # Post-write quality checks run against the database contents.
        print("\n Running post-processing data quality checks...")
        quality_passed = self.quality_checker.run_all_checks()

        if not quality_passed:
            print("\n  DATA QUALITY ISSUES DETECTED!")
            report = self.quality_checker.get_report()
            print(f"Quality Report: {report}")

        print("\n Pipeline completed")
        return quality_passed

In [13]:
# Test pipeline: run the full load once, then repeat the quality checks on a
# fresh checker so both results can be compared.
CONFIG_PATH = Path("/Users/sdedeoglu/Desktop/python/config.json")
PARQUET_PATH = "/Users/sdedeoglu/Desktop/python/case_data.parquet.gzip"

engine = None
try:
    # Load the database configuration.
    cfg = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))

    # URL-escape the credentials so characters such as '@', '/' or ':' in the
    # user name or password cannot corrupt the connection URL.
    # NOTE(review): assumes config.json stores the raw (not already
    # percent-encoded) credentials - confirm.
    db_user = quote_plus(str(cfg['kullanici']))
    db_password = quote_plus(str(cfg['sifre']))
    engine = create_engine(
        f"mysql+pymysql://{db_user}:{db_password}@{cfg['host']}:{cfg['port']}/{cfg['veritabani']}"
    )

    # Run the pipeline (includes the post-write data quality checks).
    pipeline = LogPipelineWithQuality(PARQUET_PATH, engine)
    quality_passed = pipeline.run_once()

    # Standalone data quality check
    print("\n" + "="*60)
    print(" STANDALONE DATA QUALITY CHECK")
    print("="*60)

    quality_checker = DataQualityChecker(engine)
    standalone_result = quality_checker.run_all_checks()

    print(f"\nPipeline Quality Result: {' PASSED' if quality_passed else ' FAILED'}")
    print(f"Standalone Quality Result: {' PASSED' if standalone_result else ' FAILED'}")

except Exception as e:
    # Report the failure with its traceback instead of aborting the cell.
    print(f" Pipeline Error: {str(e)}")
    traceback.print_exc()
finally:
    # Release pooled connections; the engine can still be reused afterwards.
    if engine is not None:
        engine.dispose()

Starting Log Pipeline with Data Quality...
 Schema validation passed: All required columns present

 Writing to db

 Running post-processing data quality checks...
🔍 Starting Data Quality Checks...
 users: No null values in critical columns
users: No duplicate records found
 users: Found 4001 fresh records for today
 sessions: No null values in critical columns
sessions: No duplicate records found
 sessions: Found 99971 fresh records for today
 events: No null values in critical columns
events: No duplicate records found
 events: Found 350590 fresh records for today
 hotels: No null values in critical columns
hotels: No duplicate records found
 hotels.hotel_price: All values within valid range
 payments: No null values in critical columns
payments: No duplicate records found
ALL DATA QUALITY CHECKS PASSED!

 Pipeline completed

 STANDALONE DATA QUALITY CHECK
🔍 Starting Data Quality Checks...
 users: No null values in critical columns
users: No duplicate records found
 users: Found 4001