In [11]:
import json
import shutil
from datetime import datetime
from pathlib import Path
from typing import Optional

import pandas as pd
from pydantic import BaseModel, field_validator
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
# The input carries a ".gzip" suffix but, per the original note, is not actually
# gzip-compressed. It is duplicated under a plain ".parquet" name and the raw
# frame is loaded from that copy.
source_name, parquet_name = "case_data.parquet.gzip", "case_data.parquet"
shutil.copy(source_name, parquet_name)
df = pd.read_parquet(parquet_name)

In [None]:
# Load the database connection settings from a local JSON file and build the
# SQLAlchemy engine used by the rest of the notebook.
# NOTE(review): this is an absolute, user-specific path, so the notebook only
# runs on the author's machine — consider making it configurable.
p = Path("//Users//sdedeoglu//Desktop//python//config.json")

text = p.read_text(encoding="utf-8")
data = json.loads(text)

# The JSON keys are Turkish: kullanici = user name, sifre = password,
# veritabani = database name.
kullanici = data['kullanici']
sifre = data['sifre']
host = data['host']
port = data['port']
veritabani = data['veritabani']

# Build the URL from its components instead of interpolating them into a
# string: a password (or user name) containing characters such as "@", "/",
# ":" or "%" would otherwise produce a URL that is parsed incorrectly.
# str() mirrors the stringification the previous f-string applied.
engine = create_engine(
    URL.create(
        drivername="mysql+pymysql",
        username=str(kullanici),
        password=str(sifre),
        host=str(host),
        port=port,
        database=str(veritabani),
    )
)

In [16]:
# Read every row of logs.ornek_tablo through the engine created above.
daaf = pd.read_sql(sql="SELECT * FROM logs.ornek_tablo;", con=engine)

In [5]:
# === 1️⃣ Pydantic Models (V2 Style) ===

class UserModel(BaseModel):
    """User-level attributes taken from one raw row.

    Both contact-permission fields are coerced to ``bool`` (or ``None``)
    before Pydantic's own type validation runs; see :meth:`to_bool`.
    """

    user_id: float
    subscriber_id: Optional[float]
    country: Optional[str]
    has_email_contact_permission: Optional[bool]
    has_phone_contact_permission: Optional[bool]

    @field_validator("has_email_contact_permission", "has_phone_contact_permission", mode="before")
    @classmethod
    def to_bool(cls, v):
        """Coerce a raw permission value to ``True``/``False``/``None``.

        Args:
            v: Raw value as found in the row (string, number, bool, ``None``
               or NaN).

        Returns:
            ``None`` for missing values (``None`` or NaN); for strings,
            ``True`` only when the trimmed, lower-cased text is "yes", "true"
            or "1"; otherwise ``bool(v)``.
        """
        if v is None:
            return None
        if isinstance(v, str):
            # Trim first so padded values such as " yes " are not read as False.
            return v.strip().lower() in ("yes", "true", "1")
        if isinstance(v, float) and v != v:
            # NaN is pandas' missing-value marker; bool(nan) would be True and
            # would silently record a permission that was never given.
            return None
        return bool(v)


class SessionModel(BaseModel):
    """Session-level attributes taken from one raw row.

    validate_with_pydantic builds one record per row from this model and then
    de-duplicates the resulting frame on ``session_id``.

    NOTE(review): under Pydantic v2 an ``Optional[...]`` annotation without a
    default still makes the key required (its value may be ``None``) — confirm
    that this is intended.
    """

    # Session identifier; the only non-nullable field.
    session_id: str
    # Nullable link to the user the session belongs to.
    user_id: Optional[float]
    # Client details as recorded in the raw row.
    user_agent: Optional[str]
    device_type: Optional[str]
    ip_address: Optional[str]
    utm_source: Optional[str]


class EventModel(BaseModel):
    """Event-level attributes taken from one raw row.

    validate_with_pydantic keeps one record per row from this model and does
    not de-duplicate them.

    NOTE(review): the schema diagram in this notebook lists ``hotel_id (FK)``
    under EVENTS, but this model does not declare it, so the events frame has
    no link to hotels — confirm whether that is intended.
    """

    # Identifiers: the request itself, its session and its funnel.
    request_id: str
    session_id: str
    funnel_id: str
    # When the event happened; parsed into a datetime by Pydantic.
    timestamp: datetime
    page_name: str
    # Nullable search details.
    search_query: Optional[str]
    destination_id: Optional[float]
    num_guests: Optional[float]


class HotelModel(BaseModel):
    """Hotel attributes taken from one raw row.

    ``hotel_price`` is cleaned by :meth:`clean_price` before Pydantic's own
    float validation runs.
    """

    hotel_id: int
    hotel_price: Optional[float]
    currency: Optional[str]

    @field_validator("hotel_price", mode="before")
    @classmethod
    def clean_price(cls, v):
        """Parse a raw price into a ``float``, or ``None`` when unparsable.

        Handles a decimal comma ("93,71"), currency symbols ("$", "€", "£")
        and thousands separators in either convention ("1.234,56" and
        "1,234.56").

        Args:
            v: Raw price (string, number or ``None``).

        Returns:
            The price as ``float``, or ``None`` when the value is ``None`` or
            its text cannot be parsed as a number.
        """
        if v is None:
            return None
        if isinstance(v, str):
            text = v
            for symbol in ("$", "€", "£"):
                text = text.replace(symbol, "")
            text = text.strip()
            last_comma, last_dot = text.rfind(","), text.rfind(".")
            if last_comma != -1 and last_dot != -1:
                # Both marks present: the right-most one is the decimal mark,
                # the other is a thousands separator and is dropped.
                grouping = "." if last_comma > last_dot else ","
                text = text.replace(grouping, "")
            # A lone comma is treated as the decimal mark.
            v = text.replace(",", ".")
        try:
            return float(v)
        except ValueError:
            # Best effort: an unparsable price becomes missing instead of
            # rejecting the whole row.
            return None


class PaymentModel(BaseModel):
    """Payment attributes taken from one raw row.

    ``payment_status`` is first normalised to a canonical spelling
    (:meth:`normalize_status`) and then checked against the allowed set
    (:meth:`check_valid_values`).
    """

    request_id: str
    # Default to None so rows that carry no payment columns at all (such as
    # the sample records in this notebook) validate instead of failing with
    # "Field required".
    payment_status: Optional[str] = None
    confirmation_number: Optional[str] = None

    @field_validator("payment_status", mode="before")
    @classmethod
    def normalize_status(cls, v):
        """Map raw status spellings onto the canonical status names.

        Args:
            v: Raw status value.

        Returns:
            ``None`` for missing values (``None`` or NaN); for strings, the
            trimmed, lower-cased status with known synonyms mapped to
            "completed" or "failed"; any other value unchanged.
        """
        if v is None or (isinstance(v, float) and v != v):
            # NaN is pandas' missing-value marker; treat it like None.
            return None
        if not isinstance(v, str):
            # Leave other types to Pydantic's own str validation, which
            # reports a proper validation error instead of an AttributeError
            # from calling .strip() on a non-string.
            return v
        v = v.strip().lower()
        mapping = {
            "success": "completed",
            "done": "completed",
            "ok": "completed",
            "paid": "completed",
            "fail": "failed",
            "error": "failed"
        }
        return mapping.get(v, v)

    @field_validator("payment_status")
    @classmethod
    def check_valid_values(cls, v):
        """Reject any status outside the allowed set.

        Raises:
            ValueError: if ``v`` is not "pending", "completed", "failed",
                "refunded" or ``None``.
        """
        allowed = {"pending", "completed", "failed", "refunded", None}
        if v not in allowed:
            raise ValueError(f"Invalid payment status: {v}")
        return v

In [15]:
# === 3️⃣ Validation & Normalization ===

def validate_with_pydantic(df):
    """Validate every row of ``df`` against the five models and split it into tables.

    Each row is validated against each model independently, so a failure in
    one model (e.g. a missing hotel id) no longer silently skips the models
    that come after it for the same row. Failures are printed and that
    model's record for the row is skipped.

    Args:
        df: Raw frame whose columns supply the model fields.

    Returns:
        A 5-tuple of DataFrames ``(users, sessions, events, hotels, payments)``.
        ``users``, ``sessions`` and ``hotels`` are de-duplicated on
        ``user_id``, ``session_id`` and ``hotel_id`` respectively. Every frame
        is empty (instead of raising) when no record validated.
    """
    # (model, de-duplication key) in the order the frames are returned.
    targets = (
        (UserModel, "user_id"),
        (SessionModel, "session_id"),
        (EventModel, None),
        (HotelModel, "hotel_id"),
        (PaymentModel, None),
    )
    buckets = [[] for _ in targets]

    for row in df.to_dict(orient="records"):
        for bucket, (model, _key) in zip(buckets, targets):
            try:
                bucket.append(model(**row).model_dump(exclude_none=True))
            except Exception as e:
                # Broad on purpose: "before" validators may raise errors other
                # than pydantic's ValidationError.
                print(f"Validation error: {e}")

    return tuple(
        _records_to_frame(bucket, key)
        for bucket, (_model, key) in zip(buckets, targets)
    )


def _records_to_frame(records, dedup_key=None):
    """Build a DataFrame from records, de-duplicated on ``dedup_key`` when present."""
    frame = pd.DataFrame(records)
    # With no records the key column does not exist and drop_duplicates would
    # raise KeyError, so only de-duplicate when the column is there.
    if dedup_key is not None and dedup_key in frame.columns:
        frame = frame.drop_duplicates(subset=[dedup_key])
    return frame

# Run the validation over the raw frame `df` and unpack the five tables that
# validate_with_pydantic returns, in its return order.
users, sessions, events, hotels, payments = validate_with_pydantic(df)

          ┌───────────────────┐
          │      USERS        │
          │-------------------│
          │ user_id (PK)      │
          │ subscriber_id     │
          │ country           │
          │ email_permission  │
          │ phone_permission  │
          └─────────┬─────────┘
                    │ 1-to-many
                    ▼
          ┌───────────────────┐
          │     SESSIONS      │
          │-------------------│
          │ session_id (PK)   │
          │ user_id (FK)      │
          │ device_type       │
          │ user_agent        │
          │ ip_address        │
          │ utm_source        │
          └─────────┬─────────┘
                    │ 1-to-many
                    ▼
          ┌───────────────────┐
          │      EVENTS       │
          │-------------------│
          │ request_id (PK)   │
          │ session_id (FK)   │
          │ hotel_id (FK)     │
          │ funnel_id         │
          │ page_name         │
          │ timestamp         │
          │ search_query      │
          │ destination_id    │
          └─────────┬─────────┘
                    │ 1-to-one
                    ▼
          ┌───────────────────┐
          │     PAYMENTS      │
          │-------------------│
          │ confirmation_num  │
          │ request_id (FK)   │
          │ payment_status    │
          └───────────────────┘

          ┌───────────────────┐
          │      HOTELS       │
          │-------------------│
          │ hotel_id (PK)     │
          │ hotel_price       │
          │ currency          │
          └───────────────────┘

In [18]:
payments

Unnamed: 0,request_id,payment_status,confirmation_number
0,AmyAWHLkTnLIOLNZ,,
1,CfqNJ2Ejg6LvRMKi,,
2,nxC8LGnrNofisZwa,,
3,63tXuafqdH7w5vet,,
4,4Q5ZPnhIcPJXz1eU,,
...,...,...,...
350685,a6J4UVq7Hn7VO5UG,,
350686,e8PJnJEJNTj5h9g9,,
350687,Y6db20pQs3otMNrF,,
350688,6BOeeLgI0xW04279,pending,


The data has been created. It now needs to be loaded into the database so that the MERGE INTO step can be demonstrated.

In [None]:
# Append the validated hotel rows to the `hotels` table.
# `hotels` is the frame unpacked from validate_with_pydantic; the previous name
# `hotels_df` is not defined anywhere in the visible notebook, so the cell
# failed with NameError on a fresh kernel.
# NOTE: if_exists='append' inserts the rows again on every run of this cell.
hotels.to_sql(name='hotels', con=engine, if_exists='append', index=False)

101

In [None]:

# === 2️⃣ Sample Data (Your Data) ===

def _sample_event(request_id, timestamp, page_name, hotel_price,
                  search_query=None, num_guests=None, destination_id=None):
    """Return one raw record of the demo session.

    All three sample rows share the same user, session, device and hotel; only
    the arguments differ between them. Key order matches the raw column order.
    """
    return {
        'request_id': request_id,
        'funnel_id': 'afe5v8jbLSw6',
        'session_id': 'LE9SLAMUggaTTAjS',
        'user_id': 3009.0,
        'user_agent': 'Chrome/105 (Windows)',
        'device_type': 'mobile',
        'ip_address': '255.135.228.144',
        'timestamp': pd.Timestamp(timestamp),
        'page_name': page_name,
        'subscriber_id': 105.0,
        'has_email_contact_permission': 'no',
        'has_phone_contact_permission': None,
        'hotel_price': hotel_price,
        'hotel_id': 183,
        'currency': 'EUR',
        'country': 'GB',
        'utm_source': None,
        'search_query': search_query,
        'num_guests': num_guests,
        'destination_id': destination_id,
    }


# One funnel in three steps: search -> detail_search -> reservation.
sample_data = [
    _sample_event(
        'AmyAWHLkTnLIOLNZ', '2025-09-07 12:07:59.514928', 'search', '93,71',
        search_query='query_56', num_guests=4.0, destination_id=1.0,
    ),
    _sample_event(
        'CfqNJ2Ejg6LvRMKi', '2025-09-07 12:09:59.514928', 'detail_search', '326,71',
    ),
    _sample_event(
        'nxC8LGnrNofisZwa', '2025-09-07 12:14:59.514928', 'reservation', '326,71',
    ),
]

df_raw = pd.DataFrame(sample_data)
print("Ham Veri:")
print(df_raw.head(), "\n")