In [15]:
import json
from datetime import datetime
from pathlib import Path
from typing import Optional
from urllib.parse import quote

import pandas as pd
from pydantic import BaseModel, field_validator
from sqlalchemy import create_engine

# Captured once at start-up; later assigned to the `updated_Date` column of
# every output frame, so all tables share the same value.
now = datetime.now()

In [None]:
# Load the raw input frame from a local parquet file.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_parquet("//Users//sdedeoglu//Desktop//python//case_data.parquet.gzip")

In [18]:
# Database connection
# NOTE: despite its name, `path` holds the *contents* of config.json (JSON text),
# not a filesystem path.
path = Path("//Users//sdedeoglu//Desktop//python//config.json").read_text(encoding="utf-8")
config = json.loads(path)

kullanici = config['kullanici']    # database user name
sifre = config['sifre']            # database password
host = config['host']
port = config['port']
veritabani = config['veritabani']  # database name

# Percent-encode the credentials before placing them in the URL: characters such
# as "@", ":" or "/" in a user name or password would otherwise be parsed as URL
# delimiters. `safe=""` also encodes "/" and encodes a space as %20 rather than "+".
kullanici_url = quote(str(kullanici), safe="")
sifre_url = quote(str(sifre), safe="")
engine = create_engine(f"mysql+pymysql://{kullanici_url}:{sifre_url}@{host}:{port}/{veritabani}")

# Pydantic

- `@field_validator("alan1", "alan2", mode="before")` — alan değerleri, Pydantic tip dönüşümü uygulanmadan önce bu fonksiyondan geçirilir.
- `mode="after"` — önce Pydantic tipi uygulanır, sonra validator çalışır.
- Birden fazla alan aynı validator ile hedeflenebilir.
- Döndürülen değer, o alanın nihai değeri olur; hata fırlatılırsa validation hatası oluşur.

In [None]:
# Lower-cased string tokens that the permission validator maps to True / False.
true_set = {"1", "true", "yes", "y", "evet"}
false_set = {"0", "false", "no", "n", "hayir", "hayır"}

# Timestamp layouts handed to datetime.strptime by the event timestamp validator,
# listed in the order they are tried.
formats = [
    "%Y-%m-%d %H:%M:%S.%f",
    "%Y-%m-%d %H:%M:%S",
    "%Y-%m-%dT%H:%M:%S.%fZ",
    "%Y-%m-%dT%H:%M:%S",
    "%d/%m/%Y %H:%M:%S",
    "%d-%m-%Y %H:%M:%S",
]

# Raw payment-status aliases -> canonical status value.
status = {
    **dict.fromkeys(("done", "ok", "paid"), "success"),
    **dict.fromkeys(("fail", "error"), "failed"),
}

In [20]:
class UserModel(BaseModel):
    """User-level columns of one input row.

    The two contact-permission fields accept loosely typed input and are
    normalised to ``True`` / ``False`` / ``None`` by :meth:`to_bool` before
    pydantic applies the ``Optional[bool]`` type.
    """

    # Field types
    user_id: Optional[float]
    subscriber_id: Optional[float]
    country: str
    has_email_contact_permission: Optional[bool]
    has_phone_contact_permission: Optional[bool]

    @field_validator("has_email_contact_permission", "has_phone_contact_permission", mode="before")
    @classmethod
    def to_bool(cls, v):
        """Convert a raw permission value to ``bool`` or ``None``.

        Args:
            v: Raw value taken from the input row.

        Returns:
            ``v`` unchanged when it is already a ``bool``; ``True`` / ``False``
            for a string whose stripped, lower-cased form is in ``true_set`` /
            ``false_set``, or for a numeric 1 / 0; ``None`` for everything else
            (``None``, unrecognised strings, other numbers including NaN).
        """
        # Real booleans must survive: previously every non-string value,
        # including True/False, was collapsed to None.
        if isinstance(v, bool):
            return v
        if isinstance(v, str):
            s = v.strip().lower()
            if s in true_set:
                return True
            if s in false_set:
                return False
            return None
        # Numeric flags, consistent with the "1" / "0" string tokens.
        # NaN compares unequal to both 0 and 1, so it falls through to None.
        if isinstance(v, (int, float)) and v in (0, 1):
            return bool(v)
        return None

In [21]:
class EventModel(BaseModel):
    """Event-level columns of one input row.

    ``timestamp`` is required; string values are parsed against the layouts
    in ``formats`` by :meth:`parse_timestamp`.
    """

    request_id: str
    session_id: str
    funnel_id: str
    timestamp: datetime
    page_name: str
    search_query: Optional[str]
    destination_id: Optional[float]
    num_guests: Optional[float]

    @field_validator("timestamp", mode="before")
    @classmethod
    def parse_timestamp(cls, v):
        """Turn a raw timestamp into a ``datetime``.

        Args:
            v: Raw value; a ``datetime`` or a string in one of ``formats``.

        Returns:
            ``v`` itself when it is already a ``datetime``, otherwise the
            result of the first layout in ``formats`` that parses the
            whitespace-stripped string.

        Raises:
            ValueError: if ``v`` is ``None``, is neither a ``datetime`` nor a
                string, or matches none of the layouts.
        """
        if isinstance(v, datetime):
            return v
        if v is None:
            raise ValueError("timestamp boş olamaz")
        if isinstance(v, str):
            text = v.strip()
            # The try/except must sit inside the loop: with it wrapped around
            # the loop, a mismatch on the first layout aborted parsing and the
            # remaining layouts were never attempted.
            for fmt in formats:
                try:
                    return datetime.strptime(text, fmt)
                except ValueError:
                    continue
        # No layout matched, or the value is of an unsupported type.
        raise ValueError("timestamp geçersiz")

In [22]:
class HotelModel(BaseModel):
    """Hotel-level columns of one input row."""

    hotel_id: int
    hotel_price: Optional[float]
    currency: str

    # Strip stray characters from hotel prices before the float conversion.
    @field_validator("hotel_price", mode="before")
    def clean_price(cls, v):
        """Return the price as ``float``, or ``None`` when it cannot be converted.

        For strings, "," is replaced by ".", "$" is removed and surrounding
        whitespace is stripped before calling ``float``.
        """
        if v is None:
            return None

        candidate = v
        if isinstance(candidate, str):
            candidate = candidate.replace(",", ".").replace("$", "").strip()

        try:
            price = float(candidate)
        except (ValueError, TypeError):
            price = None
        return price

In [23]:
class PaymentModel(BaseModel):
    """Payment-level columns of one input row.

    ``payment_status`` is first normalised (:meth:`normalize_status`) and then
    checked against the allowed values (:meth:`check_valid_values`).
    """

    request_id: str
    payment_status: Optional[str]
    confirmation_number: Optional[str]

    # Map the raw payment status onto its canonical value.
    @field_validator("payment_status", mode="before")
    @classmethod
    def normalize_status(cls, v):
        """Strip and lower-case the status and map known aliases via ``status``.

        Args:
            v: Raw status value.

        Returns:
            The canonical status for a known alias, the cleaned string for
            anything else, and ``v`` unchanged when it is not a string.
        """
        if not isinstance(v, str):
            # None means "no status". Any other non-string is returned as-is so
            # that the field's own str validation rejects it with a regular
            # validation error; calling .strip() on it raised AttributeError.
            return v
        v = v.strip().lower()
        return status.get(v, v)

    # Content check on the normalised value.
    @field_validator("payment_status")
    @classmethod
    def check_valid_values(cls, v):
        """Accept only "pending", "success", "failed" or ``None``.

        Raises:
            ValueError: for any other value.
        """
        allowed = {"pending", "success", "failed", None}
        if v not in allowed:
            raise ValueError(f"Invalid payment status: {v}")
        return v

In [24]:
class SessionModel(BaseModel):
    """Session-level columns of one input row; ``session_id`` must be non-empty."""

    session_id: str
    user_id: Optional[float]
    user_agent: Optional[str]
    device_type: Optional[str]
    ip_address: Optional[str]
    utm_source: Optional[str]

    @field_validator("session_id", mode="before")
    def session_id_not_empty(cls, v):
        """Reject missing or blank session ids.

        Strings are returned stripped; any other non-None value is converted
        with ``str``. ``None`` and whitespace-only strings raise ``ValueError``.
        """
        if v is None:
            raise ValueError("session_id boş olamaz")
        if not isinstance(v, str):
            return str(v)

        trimmed = v.strip()
        if trimmed == "":
            raise ValueError("session_id boş olamaz")
        return trimmed

In [None]:
#Validation & Normalization

def validate_with_pydantic(df):
    """Validate every row of ``df`` against the five models and split it into tables.

    Each row is passed, in order, to ``UserModel``, ``SessionModel``,
    ``EventModel``, ``HotelModel`` and ``PaymentModel``. The first failure for
    a row is printed and the remaining models are skipped for that row;
    records already appended for the row are kept.

    Args:
        df: Input frame; anything exposing ``to_dict(orient="records")``.

    Returns:
        A 5-tuple of DataFrames ``(users, sessions, events, hotels, payments)``.
        ``users``, ``sessions`` and ``hotels`` are de-duplicated on ``user_id``,
        ``session_id`` and ``hotel_id`` respectively when that column exists.
    """

    def _dedupe(records, key):
        """Build a frame from ``records`` and drop duplicate ``key`` values."""
        frame = pd.DataFrame(records)
        # `exclude_none=True` omits None fields, so the key column is missing
        # when no record carried it, and the frame has no columns at all when
        # nothing validated. drop_duplicates raises KeyError in both cases.
        if key not in frame.columns:
            return frame
        return frame.drop_duplicates(subset=[key])

    users, sessions, events, hotels, payments = [], [], [], [], []

    for row in df.to_dict(orient="records"):
        try:
            users.append(UserModel(**row).model_dump(exclude_none=True))
            sessions.append(SessionModel(**row).model_dump(exclude_none=True))
            events.append(EventModel(**row).model_dump(exclude_none=True))
            hotels.append(HotelModel(**row).model_dump(exclude_none=True))
            payments.append(PaymentModel(**row).model_dump(exclude_none=True))
        except Exception as e:
            # Best-effort: report the failing row and carry on with the next one.
            print(f"Validation error: {e}")

    return (
        _dedupe(users, "user_id"),
        _dedupe(sessions, "session_id"),
        pd.DataFrame(events),
        _dedupe(hotels, "hotel_id"),
        pd.DataFrame(payments),
    )

# Validate the loaded frame and unpack the five resulting tables.
users, sessions, events, hotels, payments = validate_with_pydantic(df)

In [26]:
# Stamp every output table with the shared timestamp held in `now`.
for _table in (users, sessions, events, hotels, payments):
    _table["updated_Date"] = now

In [None]:
# Database load, currently disabled: the to_sql calls are wrapped in a string
# literal, so this cell executes nothing. Each call would append one frame to
# the table of the same name in chunks of 10000 rows, without the index.
'''users.to_sql(name='users', con=engine,if_exists='append',chunksize=10000, index=False)
sessions.to_sql(name='sessions', con=engine,if_exists='append',chunksize=10000, index=False)
events.to_sql(name='events', con=engine,if_exists='append',chunksize=10000, index=False)
hotels.to_sql(name='hotels', con=engine,if_exists='append',chunksize=10000, index=False)
payments.to_sql(name='payments', con=engine,if_exists='append',chunksize=10000, index=False)'''

350690