# 1. Data Collection and Integration

In [2]:
import pandas as pd
import numpy as np
import io
import re
import os
from google.cloud import storage
import time
from sklearn.model_selection import train_test_split

# 1 - Connect to Google Cloud Storage
print("Setting up connection to Google Cloud Storage...")
client = storage.Client()
bucket = client.get_bucket("new_times")

# List everything under raw_data/ and keep only the Excel/CSV objects
blobs = list(bucket.list_blobs(prefix="raw_data/"))
file_paths = [
    name
    for name in (entry.name for entry in blobs)
    if name.endswith(('.xlsx', '.xls', '.csv'))
]
print(f"Found {len(file_paths)} files")

# 2 - Prepare the containers used to categorise files by type and launch
print("\nCategorizing files by type and launch...")
survey_files, buyer_files, utm_files = [], [], []
# One (initially empty) list per launch, L16 through L21
all_files_by_launch = dict((f"L{number}", []) for number in range(16, 22))

# Extract the launch ID (e.g. "L16") from a file name
def extract_launch_id(filename):
    """Return the launch tag (``"L<digits>"``) found in *filename*, or ``None``.

    The candidate regexes are tried in the order listed; the first one that
    matches anywhere in *filename* decides the result. Matching is
    case-sensitive (only an upper-case ``L`` is recognised).
    """
    candidate_regexes = (
        r'L(\d+)[_\s\-]',  # L16 followed by "_", whitespace or "-"
        r'[_\s\-]L(\d+)',  # L16 preceded by "_", whitespace or "-"
        r'L(\d+)\.csv',    # L16.csv
        r'L(\d+)\.xls',    # L16.xls (also the prefix of L16.xlsx)
        r'L(\d+)$'         # name ends with L16
    )

    # Lazy generator: later regexes are only evaluated if earlier ones miss.
    attempts = (re.search(regex, filename) for regex in candidate_regexes)
    first_hit = next((hit for hit in attempts if hit), None)
    if first_hit is None:
        return None

    return "L" + first_hit.group(1)

# Categorise files: each keyword group feeds one list, first matching group wins
type_rules = (
    (('pesquisa', 'survey', 'respuestas', 'ayudame'), survey_files),
    (('comprador', 'mario'), buyer_files),
    (('utm',), utm_files),
)

for file_path in file_paths:
    # File type, decided on the lower-cased path
    lowered = file_path.lower()
    for keywords, destination in type_rules:
        if any(keyword in lowered for keyword in keywords):
            destination.append(file_path)
            break

    # Launch: only launches that already have an entry are tracked
    launch_id = extract_launch_id(file_path)
    if launch_id in all_files_by_launch:
        all_files_by_launch[launch_id].append(file_path)

# Categorisation summary
print(f"Survey files: {len(survey_files)}")
print(f"Buyer files: {len(buyer_files)}")
print(f"UTM files: {len(utm_files)}")

for launch_id, files in all_files_by_launch.items():
    if not files:
        continue
    print(f"{launch_id} files: {len(files)}")

# 3 - Email normalisation helper
# Known domain typos and the domain they are corrected to.
_EMAIL_DOMAIN_CORRECTIONS = {
    'gmial.com': 'gmail.com', 'gmail.con': 'gmail.com', 'hotmial.com': 'hotmail.com',
    'outlook.con': 'outlook.com', 'yahoo.con': 'yahoo.com'
}


def normalize_email(email):
    """Return a canonical form of *email* for matching, or ``None`` if unusable.

    The value is lower-cased, stripped and has its spaces removed. Known
    domain typos are corrected, and for ``gmail.com`` addresses the dots in
    the local part are dropped.

    Returns ``None`` for missing values (``None``/NaN), for values without
    an ``@``, and when the part before or after the first ``@`` is empty.
    """
    if pd.isna(email):
        return None

    email_str = str(email).lower().strip().replace(" ", "")
    username, separator, domain = email_str.partition('@')
    if not separator or not username or not domain:
        return None

    # Correct the domain first so that a mistyped Gmail domain
    # (e.g. "gmial.com") also gets the Gmail-specific handling below.
    domain = _EMAIL_DOMAIN_CORRECTIONS.get(domain, domain)

    # Gmail-specific normalisation: dots in the local part are removed
    if domain == 'gmail.com':
        username = username.replace('.', '')

    return f"{username}@{domain}"

# 4 - Read and process the data
survey_dfs = []
buyer_dfs = []
utm_dfs = []
# Per-launch slots for the loaded frames, pre-populated for the known launches
launch_data = {launch_id: {'survey': None, 'buyer': None, 'utm': None} for launch_id in all_files_by_launch}

# Load survey files
print("\nLoading survey files...")
for file_path in survey_files:
    try:
        blob = bucket.blob(file_path)
        content = blob.download_as_bytes()

        # Launch this file belongs to (None when it cannot be determined)
        launch_id = extract_launch_id(file_path)

        if file_path.endswith('.csv'):
            df = pd.read_csv(io.BytesIO(content))
        else:  # Excel
            df = pd.read_excel(io.BytesIO(content))

        # Find the email column; str() guards against non-string headers
        # (e.g. numeric Excel headers), which would otherwise abort the file.
        email_cols = [col for col in df.columns if any(pattern in str(col).lower() for pattern in
                     ['email', 'e-mail', 'correo', '@', 'mail'])]

        if email_cols:
            df = df.rename(columns={email_cols[0]: 'email'})
        else:
            print(f"  - Warning: No email column found in {file_path}. Available columns: {', '.join(map(str, df.columns[:5]))}...")

        # Tag rows with the launch when it could be determined
        if launch_id:
            df['lançamento'] = launch_id
            # setdefault: a launch outside the pre-populated range must not
            # raise KeyError and discard a file that was read successfully.
            # A later file for the same launch replaces the earlier frame.
            launch_data.setdefault(launch_id, {'survey': None, 'buyer': None, 'utm': None})['survey'] = df

        survey_dfs.append(df)
        launch_info = f" ({launch_id})" if launch_id else ""
        print(f"  - Loaded: {file_path}{launch_info}, {df.shape[0]} rows, {df.shape[1]} columns")
    except Exception as e:
        # Best effort: report the failure and continue with the next file
        print(f"  - Error loading {file_path}: {str(e)}")

# Load buyer files
print("\nLoading buyer files...")
for file_path in buyer_files:
    try:
        blob = bucket.blob(file_path)
        content = blob.download_as_bytes()

        # Launch this file belongs to (None when it cannot be determined)
        launch_id = extract_launch_id(file_path)

        if file_path.endswith('.csv'):
            df = pd.read_csv(io.BytesIO(content))
        else:  # Excel
            df = pd.read_excel(io.BytesIO(content), engine='openpyxl')

        # Find the email column; str() guards against non-string headers
        # (e.g. numeric Excel headers), which would otherwise abort the file.
        email_cols = [col for col in df.columns if any(pattern in str(col).lower() for pattern in
                     ['email', 'e-mail', 'correo', '@', 'mail'])]

        if email_cols:
            df = df.rename(columns={email_cols[0]: 'email'})
        else:
            print(f"  - Warning: No email column found in {file_path}. Available columns: {', '.join(map(str, df.columns[:5]))}...")

        # Tag rows with the launch when it could be determined
        if launch_id:
            df['lançamento'] = launch_id
            # setdefault: a launch without a pre-existing entry must not raise
            # KeyError and discard a file that was read successfully.
            # A later file for the same launch replaces the earlier frame.
            launch_data.setdefault(launch_id, {'survey': None, 'buyer': None, 'utm': None})['buyer'] = df

        buyer_dfs.append(df)
        launch_info = f" ({launch_id})" if launch_id else ""
        print(f"  - Loaded: {file_path}{launch_info}, {df.shape[0]} rows, {df.shape[1]} columns")
    except Exception as e:
        # Best effort: report the failure and continue with the next file
        print(f"  - Error loading {file_path}: {str(e)}")

# 5 - Load UTM files with automatic encoding and delimiter detection
print("\nLoading UTM files...")
for file_path in utm_files:
    try:
        blob = bucket.blob(file_path)
        content = blob.download_as_bytes()

        # Launch this file belongs to (None when it cannot be determined)
        launch_id = extract_launch_id(file_path)

        # 1. Pick the encoding: UTF-8 first, latin-1 as the fallback
        try:
            content_str = content.decode('utf-8')
            encoding = 'utf-8'
        except UnicodeDecodeError:
            print(f"  - Unicode error. Trying latin-1 encoding for {file_path}")
            content_str = content.decode('latin-1')
            encoding = 'latin-1'

        # 2. Guess the delimiter from the header line: whichever of ";" and
        #    "," occurs more often (ties go to ",")
        header_line = content_str.split('\n', 1)[0]
        delimiter = ';' if header_line.count(';') > header_line.count(',') else ','
        print(f"  - Detected delimiter '{delimiter}' for {file_path}")

        # 3. Parse with the detected encoding and delimiter
        df = pd.read_csv(
            io.BytesIO(content),
            sep=delimiter,
            encoding=encoding,
            on_bad_lines='skip',  # skip malformed lines
            low_memory=False      # avoid mixed-dtype warnings
        )

        # 4. Sanity check, applied for both encodings: with only 1-2 columns
        #    the delimiter guess was probably wrong, so try the other one.
        if df.shape[1] <= 2:
            print(f"  - Warning: Only {df.shape[1]} columns detected. Trying alternative delimiter...")

            alt_delimiter = ';' if delimiter == ',' else ','
            alt_df = pd.read_csv(
                io.BytesIO(content),
                sep=alt_delimiter,
                encoding=encoding,
                on_bad_lines='skip',
                low_memory=False
            )

            print(f"  - After using delimiter '{alt_delimiter}': {alt_df.shape[1]} columns detected")
            # Keep the alternative only when it actually splits into more
            # columns; otherwise the first parse was the better one.
            if alt_df.shape[1] > df.shape[1]:
                df = alt_df

        # Find the email column; str() guards against non-string headers
        email_cols = [col for col in df.columns if any(pattern in str(col).lower() for pattern in
                     ['email', 'e-mail', 'correo', '@', 'mail'])]

        if email_cols:
            df = df.rename(columns={email_cols[0]: 'email'})
        else:
            print(f"  - Warning: No email column found in {file_path}. Available columns: {', '.join(map(str, df.columns[:5]))}...")

        # Tag rows with the launch when it could be determined
        if launch_id:
            df['lançamento'] = launch_id
            # setdefault: a launch without a pre-existing entry must not raise
            # KeyError and discard a file that was read successfully.
            # A later file for the same launch replaces the earlier frame.
            launch_data.setdefault(launch_id, {'survey': None, 'buyer': None, 'utm': None})['utm'] = df

        utm_dfs.append(df)
        launch_info = f" ({launch_id})" if launch_id else ""
        print(f"  - Loaded: {file_path}{launch_info}, {df.shape[0]} rows, {df.shape[1]} columns")
    except Exception as e:
        # Best effort: report the failure and continue with the next file
        print(f"  - Error loading {file_path}: {str(e)}")

# 6 - Combine the per-file frames into one frame per source
print("\nCombining datasets...")
if survey_dfs:
    surveys = pd.concat(survey_dfs, ignore_index=True)
else:
    surveys = pd.DataFrame()

if buyer_dfs:
    buyers = pd.concat(buyer_dfs, ignore_index=True)
else:
    buyers = pd.DataFrame()

if utm_dfs:
    utms = pd.concat(utm_dfs, ignore_index=True)
else:
    utms = pd.DataFrame()

print(f"Survey data: {surveys.shape[0]} rows, {surveys.shape[1]} columns")
print(f"Buyer data: {buyers.shape[0]} rows, {buyers.shape[1]} columns")
print(f"UTM data: {utms.shape[0]} rows, {utms.shape[1]} columns")

# 7 - Document the structure of every non-empty combined frame
print("\nDocumenting data structure...")
for name, df in (("Surveys", surveys), ("Buyers", buyers), ("UTMs", utms)):
    if df.empty:
        continue
    dtype_summary = df.dtypes.value_counts().to_dict()
    leading_columns = ', '.join(df.columns[:5])
    print(f"\n{name} structure:")
    print(f"  - Columns: {df.shape[1]}")
    print(f"  - Data types: {dtype_summary}")
    print(f"  - Sample columns: {leading_columns}")

# 8 - Normalise emails
print("\nNormalizing email addresses...")

# Surveys: guarded like the other two sources so that an empty frame or a
# missing email column produces a warning instead of a KeyError.
if not surveys.empty and 'email' in surveys.columns:
    surveys['email_norm'] = surveys['email'].apply(normalize_email)
else:
    print("Warning: Cannot normalize emails in surveys dataframe - empty or missing email column")
    # Later steps read surveys['email_norm'], so the column must exist
    surveys['email_norm'] = None

# Buyers
if not buyers.empty and 'email' in buyers.columns:
    buyers['email_norm'] = buyers['email'].apply(normalize_email)
else:
    print("Warning: Cannot normalize emails in buyers dataframe - empty or missing email column")
    if buyers.empty:
        buyers = pd.DataFrame(columns=['email', 'email_norm'])

# UTMs
if not utms.empty and 'email' in utms.columns:
    utms['email_norm'] = utms['email'].apply(normalize_email)
else:
    print("Warning: Cannot normalize emails in UTM dataframe - empty or missing email column")

# 9 - Match surveys against buyer records
print("\nMatching surveys with buyers...")
start_time = time.time()

# Matching needs both frames populated and normalised buyer emails
can_match = not (surveys.empty or buyers.empty) and 'email_norm' in buyers.columns

if can_match:
    # Lookup structures for fast matching (email -> survey row index)
    survey_emails_dict = dict(zip(surveys['email_norm'], surveys.index))
    survey_emails_set = set(surveys['email_norm'].dropna())

    # Exact matching on the normalised email
    buyers_with_exact_match = buyers[buyers['email_norm'].isin(survey_emails_set)]
    launch_known = 'lançamento' in buyers_with_exact_match.columns

    matches = []
    for idx, buyer in buyers_with_exact_match.iterrows():
        buyer_email = buyer['email_norm']
        survey_idx = None if pd.isna(buyer_email) else survey_emails_dict.get(buyer_email)
        if survey_idx is None:
            continue

        match_data = dict(buyer_id=idx, survey_id=survey_idx, match_type='exact', score=1.0)

        # Carry the buyer's launch along when it is available
        if launch_known and not pd.isna(buyer['lançamento']):
            match_data['lançamento'] = buyer['lançamento']

        matches.append(match_data)

    matches_df = pd.DataFrame(matches)
    print(f"Total matches found: {len(matches_df)}")
else:
    print("Warning: Cannot perform matching. Either surveys or buyers data is empty or missing email_norm column.")
    matches_df = pd.DataFrame(columns=['buyer_id', 'survey_id', 'match_type', 'score'])

# 10 - Create the target variable (1 = survey row matched to a buyer)
print("\nCreating target variable...")
surveys['target'] = 0

if matches_df.empty or 'survey_id' not in matches_df.columns:
    print("No matches found - target variable will be all zeros")
else:
    match_survey_ids = set(matches_df['survey_id'])
    is_matched = surveys.index.isin(match_survey_ids)
    surveys.loc[is_matched, 'target'] = 1

    # Copy the launch recorded on each match onto the matched survey row
    if 'lançamento' in matches_df.columns:
        if 'lançamento' not in surveys.columns:
            surveys['lançamento'] = None

        # Applied in match order, so a later match overwrites an earlier one
        for survey_row, launch in zip(matches_df['survey_id'], matches_df['lançamento']):
            if pd.isna(launch):
                continue
            surveys.loc[survey_row, 'lançamento'] = launch

    conversion_rate = surveys['target'].mean() * 100
    print(f"Conversion rate: {conversion_rate:.2f}%")

# 11 - Merge surveys with UTM data
merged_data = surveys
if not utms.empty and 'email_norm' in utms.columns:
    # pandas matches null join keys with each other, so UTM rows without a
    # normalised email are excluded; otherwise every survey row with a null
    # email_norm would be joined to all of them.
    # NOTE(review): several UTM rows sharing one email_norm still duplicate
    # the survey row in the result - confirm whether that is intended.
    utms_with_email = utms[utms['email_norm'].notna()]
    merged_data = pd.merge(
        surveys,
        utms_with_email,
        on='email_norm',
        how='left',
        suffixes=('', '_utm')
    )
    print("Merged survey data with UTM data")

# 12 - Add launch information from buyers if it has not been added yet
if ('lançamento' not in merged_data.columns
        and 'lançamento' in buyers.columns
        and 'email_norm' in buyers.columns):
    # Map email_norm -> launch, ignoring buyers without a usable email so
    # that a null key can never be used for the lookup
    known_buyers = buyers.dropna(subset=['email_norm'])
    email_to_launch = dict(zip(known_buyers['email_norm'], known_buyers['lançamento']))

    # Add the launch column to the merged frame
    merged_data['lançamento'] = merged_data['email_norm'].map(email_to_launch)
    print("Added launch information from buyer data")

print(f"\nFinal merged dataset: {merged_data.shape[0]} rows, {merged_data.shape[1]} columns")

# 13 - Launch statistics: row count per launch, including unidentified rows
if 'lançamento' in merged_data.columns:
    launch_counts = merged_data['lançamento'].value_counts(dropna=False)
    print("\nRegistros por lançamento:")
    for launch, count in launch_counts.items():
        if pd.isna(launch):
            launch_str = "Sem lançamento identificado"
        else:
            launch_str = launch
        print(f"  - {launch_str}: {count} registros")

# 14 - Make sure the output directory for the split files exists
data_dir = "../data/split"
os.makedirs(data_dir, exist_ok=True)

# 15 - Split the data into train, validation and test sets
print("\nSplitting data into train, validation, and test sets...")


def _stratify_labels(frame):
    """Return ``frame['target']`` if it can be used to stratify, else ``None``.

    ``train_test_split`` raises ``ValueError`` when a class has fewer than two
    rows, so in that case (or when there is no target column) the split falls
    back to a plain random split.
    """
    if 'target' not in frame.columns:
        return None
    if frame['target'].value_counts().min() < 2:
        print("Warning: a target class has fewer than 2 rows - splitting without stratification")
        return None
    return frame['target']


# Check that there is enough data to split
if merged_data.shape[0] < 100:
    print("Warning: Dataset too small for splitting. Saving all data as train set.")
    merged_data.to_csv(f"{data_dir}/train.csv", index=False)
else:
    # First hold out the test set (15% of all rows)
    train_val, test = train_test_split(
        merged_data,
        test_size=0.15,
        random_state=42,
        stratify=_stratify_labels(merged_data)
    )

    # Then split the remaining 85% into train (~72.25% of the total) and
    # validation (15% of the remainder, ~12.75% of the total)
    train, val = train_test_split(
        train_val,
        test_size=0.15,
        random_state=42,
        stratify=_stratify_labels(train_val)
    )

    # Save the three sets
    train.to_csv(f"{data_dir}/train.csv", index=False)
    val.to_csv(f"{data_dir}/validation.csv", index=False)
    test.to_csv(f"{data_dir}/test.csv", index=False)

    print(f"Train set: {train.shape[0]} rows, {train.shape[1]} columns")
    print(f"Validation set: {val.shape[0]} rows, {val.shape[1]} columns")
    print(f"Test set: {test.shape[0]} rows, {test.shape[1]} columns")

    # Report the share of positive targets in each set
    if 'target' in merged_data.columns:
        print("\nTarget distribution:")
        print(f"  - Train: {train['target'].mean()*100:.2f}% positive")
        print(f"  - Validation: {val['target'].mean()*100:.2f}% positive")
        print(f"  - Test: {test['target'].mean()*100:.2f}% positive")

Setting up connection to Google Cloud Storage...
Found 18 files

Categorizing files by type and launch...
Survey files: 6
Buyer files: 6
UTM files: 6
L16 files: 3
L17 files: 3
L18 files: 3
L19 files: 3
L20 files: 3
L21 files: 3

Loading survey files...
  - Loaded: raw_data/Pesquisa_L16.xlsx (L16), 26732 rows, 20 columns
  - Loaded: raw_data/Pesquisa_L17.xlsx (L17), 21020 rows, 21 columns
  - Loaded: raw_data/Pesquisa_L18.xlsx (L18), 15610 rows, 21 columns
  - Loaded: raw_data/Pesquisa_L19.xlsx (L19), 15156 rows, 21 columns
  - Loaded: raw_data/Pesquisa_L20.xlsx (L20), 17257 rows, 21 columns
  - Loaded: raw_data/Pesquisa_L21.xlsx (L21), 28581 rows, 21 columns

Loading buyer files...
  - Loaded: raw_data/compradores_mario_L16.csv (L16), 756 rows, 61 columns
  - Loaded: raw_data/compradores_mario_L17.csv (L17), 796 rows, 61 columns
  - Loaded: raw_data/compradores_mario_L18.csv (L18), 463 rows, 61 columns
  - Loaded: raw_data/compradores_mario_L19.csv (L19), 436 rows, 61 columns
  - Loade

## 2. Exploratory Data Analysis

### i. Data Quality Assessment
1. Check dataset sizes and completeness  
2. Count null values and missing data  
3. Identify duplicates  
4. List *unique* values and their percentages  
5. Flag extremely low/unusual values  

### ii. Univariate Analysis
1. Analyze distributions for each variable  
2. Identify outliers  

### iii. Relationship Analysis
1. Correlation analysis between features  
2. Feature-target relationships  
3. Initial feature importance assessment  

### iv. Segment Analysis
1. Country-specific patterns  
2. UTM source/campaign analysis  
3. Temporal patterns (day, time)  
4. Time-to-conversion metrics  
5. Launch/cohort analysis (L16–L21)  
6. Leads that buy in other launches  

### v. Check for category shifts over time


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import os
import warnings
import sys
from io import StringIO
!pip install -U kaleido
# Silence all warnings for the rest of the cell
warnings.filterwarnings('ignore')

# Plot defaults: ggplot base style, seaborn whitegrid theme, 12x8 figures
plt.style.use('ggplot')
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Redirect stdout into an in-memory buffer so the printed report can be
# captured; the original stream is kept in old_stdout
old_stdout, report_content = sys.stdout, StringIO()
sys.stdout = report_content

# Load the merged dataset
# NOTE(review): the visible collection step only writes train/validation/test
# CSVs under ../data/split - confirm which step produces the files read here.
print("Loading merged dataset...")
# Only load failures are handled: OSError covers a missing/unreadable file,
# ValueError covers pandas' EmptyDataError/ParserError and decoding errors.
# A bare except would also swallow KeyboardInterrupt and SystemExit.
try:
    # Try to load the data with launch information from the previous step
    merged_data = pd.read_csv('01_data_collection_and_integration.csv')
except (OSError, ValueError):
    try:
        # Fallback to original processed data
        merged_data = pd.read_csv('smart_ads_processed_data.csv')
    except (OSError, ValueError):
        print("ERROR: No processed dataset found. Please run the data collection and integration step first.")
        # An empty frame makes the analysis below skip itself
        print("Creating dummy dataset for demonstration...")
        merged_data = pd.DataFrame()
    else:
        print(f"Loaded processed dataset: {merged_data.shape[0]} rows, {merged_data.shape[1]} columns")
else:
    print(f"Loaded dataset with launch information: {merged_data.shape[0]} rows, {merged_data.shape[1]} columns")

if not merged_data.empty:
    # Create a results directory if it doesn't exist
    os.makedirs('eda_results', exist_ok=True)
    
    print("\n=============================================")
    print("1. DATA QUALITY ASSESSMENT")
    print("=============================================\n")
    
    # Dataset sizes and completeness
    print("Dataset Overview:")
    print(f"Total records: {len(merged_data)}")
    print(f"Total features: {merged_data.shape[1]}")
    
    # Data types
    print("\nData types:")
    dtype_counts = merged_data.dtypes.value_counts()
    for dtype, count in dtype_counts.items():
        print(f"  {dtype}: {count} columns")
    
    # Completion Rate
    completion_rates = (merged_data.count() / len(merged_data) * 100).sort_values()
    print("\nCompletion rates (lowest 10):")
    for col, rate in completion_rates.head(10).items():
        print(f"  {col}: {rate:.2f}%")
    
    # Missing values analysis
    missing_vals = merged_data.isnull().sum()
    missing_vals = missing_vals[missing_vals > 0].sort_values(ascending=False)
    
    print(f"\nColumns with missing values: {len(missing_vals)} out of {len(merged_data.columns)}")
    if len(missing_vals) > 0:
        print("Top 10 columns with most missing values:")
        for col, count in missing_vals.head(10).items():
            print(f"  {col}: {count} missing ({count/len(merged_data)*100:.2f}%)")
            
        # Plot missing values
        plt.figure(figsize=(10, 6))
        ax = missing_vals.head(15).plot(kind='bar')
        plt.title('Missing Values by Column (Top 15)')
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig('eda_results/missing_values.png')
    
    # Check for duplicates in ID columns only
    id_cols = [col for col in merged_data.columns if ('id' in col.lower() or 'email' in col.lower()) 
               and not any(text in col.lower() for text in ['message', 'mensaje', 'fluid', 'opportunities', 'learn'])]
    
    if id_cols:
        print("\nChecking for duplicates in ID columns:")
        for col in id_cols:
            if col in merged_data.columns:
                unique_count = merged_data[col].nunique()
                duplicate_pct = (1 - unique_count / merged_data[col].count()) * 100
                print(f"  {col}: {unique_count} unique values, {duplicate_pct:.2f}% duplicates")
    
    # Full duplicate rows
    duplicate_rows = merged_data.duplicated().sum()
    print(f"\nFull duplicate rows: {duplicate_rows} ({duplicate_rows/len(merged_data)*100:.2f}%)")
    
    # Identify free text columns
    free_text_cols = [col for col in merged_data.columns if any(pattern in col.lower() for pattern in 
                    ['mensaje', 'message', 'cambiar', 'esperas', 'oportunidades', 'fluido', 'fluidez'])]
    
    if free_text_cols:
        print("\nFree text columns identified (excluded from duplicates analysis):")
        for col in free_text_cols:
            # Show basic statistics for text fields
            not_null = merged_data[col].notna().sum()
            avg_length = merged_data[col].astype(str).str.len().mean()
            print(f"  {col}: {not_null} non-null values, avg length: {avg_length:.1f} characters")
    
    # Unique values analysis for categorical variables
    categorical_cols = [col for col in merged_data.select_dtypes(include=['object']).columns 
                      if col not in free_text_cols]
    
    print("\nUnique value analysis for categorical columns:")
    high_cardinality_cols = []
    
    for col in categorical_cols:
        unique_vals = merged_data[col].nunique()
        not_null_count = merged_data[col].count()
        unique_pct = unique_vals / not_null_count * 100 if not_null_count > 0 else 0
        
        # Detect high cardinality columns (excluding free text fields)
        if unique_pct > 50 and unique_vals > 100 and col not in free_text_cols:
            high_cardinality_cols.append((col, unique_vals, unique_pct))
        
        # Print details for columns with reasonable cardinality
        if unique_vals < 50:
            print(f"\n  {col}: {unique_vals} unique values")
            # Show value distribution
            val_counts = merged_data[col].value_counts(normalize=True, dropna=False).head(5)
            for val, pct in val_counts.items():
                val_display = str(val)[:30] + '...' if isinstance(val, str) and len(str(val)) > 30 else val
                print(f"    - {val_display}: {pct*100:.2f}%")
    
    # Report high cardinality columns (excluding known free text fields)
    if high_cardinality_cols:
        print("\nHigh cardinality columns (likely IDs or unique identifiers):")
        for col, unique_vals, unique_pct in sorted(high_cardinality_cols, key=lambda x: x[1], reverse=True):
            print(f"  {col}: {unique_vals} unique values ({unique_pct:.2f}% unique)")
    
    # Identify extremely low/unusual values in numeric columns
    numeric_cols = merged_data.select_dtypes(include=['number']).columns
    
    print("\nChecking for unusual values in numeric columns:")
    for col in numeric_cols:
        # Skip likely ID columns
        if 'id' in col.lower() or col.endswith('_id'):
            continue
            
        # Get basic stats
        col_min = merged_data[col].min()
        col_max = merged_data[col].max()
        col_mean = merged_data[col].mean()
        zeros_count = (merged_data[col] == 0).sum()
        zeros_pct = zeros_count / merged_data[col].count() * 100
        
        # Check for unusual patterns
        if zeros_pct > 90:
            print(f"  {col}: {zeros_pct:.2f}% zeros (potential issue)")
        elif col_min < 0 and col_max > 0 and abs(col_min) > col_max * 10:
            print(f"  {col}: Extreme negative value ({col_min}) compared to max ({col_max})")
        elif col_max > col_mean * 100:
            print(f"  {col}: Extreme positive value ({col_max}) compared to mean ({col_mean:.2f})")
    
    # Target variable analysis
    if 'target' in merged_data.columns:
        target_counts = merged_data['target'].value_counts()
        target_pct = target_counts / target_counts.sum() * 100
        
        print("\nTarget variable distribution:")
        for val, count in target_counts.items():
            print(f"  {val}: {count} records ({target_pct[val]:.2f}%)")
        
        print("Note: Low conversion rate (~1.5%) is expected for this type of digital campaign data")
        
        # Plot target distribution
        plt.figure(figsize=(8, 5))
        sns.countplot(x='target', data=merged_data)
        plt.title('Target Variable Distribution')
        plt.xlabel('Target (0=No Conversion, 1=Conversion)')
        plt.ylabel('Count')
        plt.savefig('eda_results/target_distribution.png')
        
    print("\n=============================================")
    print("2. UNIVARIATE ANALYSIS")
    print("=============================================\n")
    
    # Numeric variable distributions
    numeric_cols = merged_data.select_dtypes(include=['number']).columns
    print(f"Analyzing distributions for {len(numeric_cols)} numeric variables...")
    
    # Exclude ID columns and the target
    analysis_cols = [col for col in numeric_cols if not ('id' in col.lower() or col == 'target')]
    
    # Create histograms for numeric variables
    if analysis_cols:
        print("Creating distribution plots for numeric variables...")
        for i, col in enumerate(analysis_cols[:10]):  # Limit to first 10 for brevity
            plt.figure(figsize=(10, 6))
            sns.histplot(merged_data[col].dropna(), kde=True)
            plt.title(f'Distribution of {col}')
            plt.tight_layout()
            plt.savefig(f'eda_results/dist_{col}.png')
            
            # Print basic statistics
            stats = merged_data[col].describe()
            print(f"\n{col} statistics:")
            print(f"  Mean: {stats['mean']:.2f}")
            print(f"  Std: {stats['std']:.2f}")
            print(f"  Min: {stats['min']:.2f}")
            print(f"  25%: {stats['25%']:.2f}")
            print(f"  Median: {stats['50%']:.2f}")
            print(f"  75%: {stats['75%']:.2f}")
            print(f"  Max: {stats['max']:.2f}")
            
            # Detect outliers using IQR method
            Q1 = stats['25%']
            Q3 = stats['75%']
            IQR = Q3 - Q1
            outlier_low = Q1 - 1.5 * IQR
            outlier_high = Q3 + 1.5 * IQR
            outliers = merged_data[(merged_data[col] < outlier_low) | (merged_data[col] > outlier_high)][col]
            print(f"  Outliers: {len(outliers)} ({len(outliers)/merged_data[col].count()*100:.2f}%)")
    
    # Categorical variable distributions
    cat_cols = [col for col in categorical_cols if merged_data[col].nunique() < 30]  # Exclude high cardinality
    
    if cat_cols:
        print("\nAnalyzing distributions for categorical variables...")
        for col in cat_cols[:5]:  # Limit to first 5 for brevity
            plt.figure(figsize=(12, 6))
            val_counts = merged_data[col].value_counts().head(15)  # Top 15 categories
            ax = val_counts.plot(kind='bar')
            plt.title(f'Distribution of {col} (Top 15 values)')
            plt.tight_layout()
            plt.xticks(rotation=45, ha='right')
            plt.savefig(f'eda_results/cat_dist_{col}.png')
            
            # Print distribution statistics
            print(f"\n{col} - top 5 values:")
            top_vals = merged_data[col].value_counts(normalize=True).head(5)
            for val, pct in top_vals.items():
                val_display = str(val)[:30] + '...' if isinstance(val, str) and len(str(val)) > 30 else val
                print(f"  {val_display}: {pct*100:.2f}%")
    
    print("\n=============================================")
    print("3. RELATIONSHIP ANALYSIS")
    print("=============================================\n")
    
    # Target-related analysis (if target exists)
    if 'target' in merged_data.columns:
        print("Analyzing relationships with target variable...")
        
        # Numeric features vs target
        numeric_features = [col for col in numeric_cols if col != 'target' and not ('id' in col.lower())]
        
        if numeric_features:
            # Calculate correlation with target
            target_corr = merged_data[numeric_features + ['target']].corr()['target'].sort_values(ascending=False)
            
            print("\nCorrelation of numeric features with target:")
            for feature, corr in target_corr.items():
                if feature != 'target':
                    print(f"  {feature}: {corr:.4f}")
            
            # Plot top correlations with target
            plt.figure(figsize=(10, 8))
            top_corrs = target_corr[1:11]  # Skip target itself, take top 10
            top_corrs.plot(kind='bar')
            plt.title('Top 10 Correlations with Target')
            plt.tight_layout()
            plt.savefig('eda_results/target_correlation.png')
        
        # Categorical features vs target
        cat_features_for_analysis = [col for col in categorical_cols 
                                  if merged_data[col].nunique() < 20 and merged_data[col].nunique() > 1]
        
        if cat_features_for_analysis:
            print("\nCategorical features vs target:")
            for col in cat_features_for_analysis[:5]:  # Limit to first 5
                cross_tab = pd.crosstab(
                    merged_data[col], 
                    merged_data['target'], 
                    normalize='index'
                )
                
                # Print conversion rate by category
                if 1 in cross_tab.columns:  # If there are positive conversions
                    print(f"\n  {col} - conversion rates:")
                    conversion_rates = cross_tab[1].sort_values(ascending=False)
                    for cat, rate in conversion_rates.head(5).items():
                        cat_display = str(cat)[:30] + '...' if isinstance(cat, str) and len(str(cat)) > 30 else cat
                        print(f"    {cat_display}: {rate*100:.2f}%")
                
                # Plot categorical feature vs target
                plt.figure(figsize=(12, 6))
                pd.crosstab(merged_data[col], merged_data['target']).plot(kind='bar', stacked=True)
                plt.title(f'{col} vs Target')
                plt.ylabel('Count')
                plt.xticks(rotation=45, ha='right')
                plt.tight_layout()
                plt.savefig(f'eda_results/{col}_vs_target.png')
        
        # Feature importance using Random Forest
        print("\nCalculating feature importance using Random Forest...")
        
        # Prepare data for modeling
        features_for_rf = []
        
        # Add numeric features
        for col in numeric_features:
            if merged_data[col].isnull().sum() < len(merged_data) * 0.3:  # Less than 30% missing
                features_for_rf.append(col)
        
        # Add encoded categorical features (low cardinality)
        cat_for_rf = [col for col in categorical_cols 
                    if merged_data[col].nunique() < 20 and merged_data[col].nunique() > 1 
                    and merged_data[col].isnull().sum() < len(merged_data) * 0.3]
        
        # Encode categorical features
        X_rf = merged_data[features_for_rf].copy()
        encoders = {}
        
        for col in cat_for_rf:
            le = LabelEncoder()
            # Fit only on non-null values
            valid_idx = ~merged_data[col].isnull()
            X_rf[col] = pd.Series(index=merged_data.index)
            X_rf.loc[valid_idx, col] = le.fit_transform(merged_data.loc[valid_idx, col])
            encoders[col] = le
            features_for_rf.append(col)
        
        # Fill missing values with median/mode
        for col in features_for_rf:
            if X_rf[col].isnull().any():
                if X_rf[col].dtype.kind in 'ifc':  # numeric
                    X_rf[col].fillna(X_rf[col].median(), inplace=True)
                else:
                    X_rf[col].fillna(X_rf[col].mode()[0], inplace=True)
        
        # Train a simple RF model to get feature importance
        if len(features_for_rf) > 0 and 'target' in merged_data.columns:
            try:
                rf = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
                rf.fit(X_rf[features_for_rf], merged_data['target'])
                
                # Get feature importances
                importances = pd.Series(rf.feature_importances_, index=features_for_rf)
                top_features = importances.sort_values(ascending=False)
                
                print("\nTop 10 important features:")
                for feature, importance in top_features.head(10).items():
                    print(f"  {feature}: {importance:.4f}")
                
                # Plot feature importances
                plt.figure(figsize=(12, 8))
                top_features.head(15).plot(kind='bar')
                plt.title('Feature Importance')
                plt.tight_layout()
                plt.savefig('eda_results/feature_importance.png')
            except Exception as e:
                print(f"Error calculating feature importance: {e}")
    
    # Correlation matrix for numeric features
    corr_features = [col for col in numeric_cols if not ('id' in col.lower())]
    if len(corr_features) > 1:
        print("\nCalculating correlation matrix for numeric features...")
        corr_matrix = merged_data[corr_features].corr()
        
        # Plot correlation matrix
        plt.figure(figsize=(14, 10))
        sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0)
        plt.title('Correlation Matrix')
        plt.tight_layout()
        plt.savefig('eda_results/correlation_matrix.png')
        
        # Find highly correlated features
        high_corr_threshold = 0.7
        high_corr_pairs = []
        
        for i in range(len(corr_features)):
            for j in range(i+1, len(corr_features)):
                if abs(corr_matrix.iloc[i, j]) > high_corr_threshold:
                    high_corr_pairs.append((
                        corr_features[i], 
                        corr_features[j], 
                        corr_matrix.iloc[i, j]
                    ))
        
        if high_corr_pairs:
            print("\nHighly correlated feature pairs:")
            for feat1, feat2, corr in sorted(high_corr_pairs, key=lambda x: abs(x[2]), reverse=True):
                print(f"  {feat1} — {feat2}: {corr:.4f}")
    
    print("\n=============================================")
    print("4. SEGMENT ANALYSIS")
    print("=============================================\n")
    
    # Country-specific patterns
    country_col = next((col for col in merged_data.columns 
                      if any(pattern in col.lower() for pattern in ['pais', 'país', 'country'])), None)
    
    if country_col and 'target' in merged_data.columns:
        print("Analyzing country-specific patterns...")
        
        # Display raw counts by country
        country_counts = merged_data[country_col].value_counts()
        print("\nDistribution of records by country:")
        for country, count in country_counts.head(10).items():
            print(f"  {country}: {count} records ({count/len(merged_data)*100:.2f}%)")
        
        # Conversion rate by country
        country_conversion = merged_data.groupby(country_col)['target'].agg(['sum', 'count', 'mean'])
        country_conversion['conversion_rate'] = country_conversion['mean'] * 100
        country_conversion = country_conversion.sort_values('conversion_rate', ascending=False)
        
        # Filter to countries with at least 100 records
        country_conversion_filtered = country_conversion[country_conversion['count'] >= 100]
        
        print("\nConversion rates by country (min 100 records):")
        for country, data in country_conversion_filtered.iterrows():
            print(f"  {country}: {data['conversion_rate']:.2f}% ({data['sum']} conversions out of {data['count']} records)")
        
        # Plot conversion rate by country
        plt.figure(figsize=(12, 8))
        country_conversion_filtered['conversion_rate'].plot(kind='bar')
        plt.title('Conversion Rate by Country')
        plt.ylabel('Conversion Rate (%)')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig('eda_results/country_conversion.png')
        
        # Double check Spain and USA specifically
        print("\nDetailed analysis for Spain and USA:")
        for country in ['España', 'Spain', 'Estados Unidos', 'USA', 'United States']:
            if country in merged_data[country_col].values:
                conversions = merged_data[merged_data[country_col] == country]['target'].sum()
                total = merged_data[merged_data[country_col] == country].shape[0]
                rate = conversions / total * 100 if total > 0 else 0
                print(f"  {country}: {conversions} conversions out of {total} records ({rate:.2f}%)")
    
    # UTM source/campaign analysis
    utm_cols = [col for col in merged_data.columns 
               if any(pattern in col.lower() for pattern in ['utm', 'source', 'campaign', 'medium'])]
    
    if utm_cols and 'target' in merged_data.columns:
        print("\nAnalyzing UTM parameters...")
    
    for utm_col in utm_cols:
        if merged_data[utm_col].nunique() < 100:  # Skip if too many unique values
            try:
                # Calculate conversion rate by UTM value
                # Mudando a forma de criar o DataFrame agregado
                utm_agg = merged_data.groupby(utm_col)['target'].agg(['sum', 'count'])
                utm_agg['mean'] = utm_agg['sum'] / utm_agg['count']
                utm_agg['conversion_rate'] = utm_agg['mean'] * 100
                
                # Filter to values with at least 50 records
                utm_filtered = utm_agg[utm_agg['count'] >= 50]
                utm_filtered = utm_filtered.sort_values('conversion_rate', ascending=False)
                
                # Plot top UTM values by conversion rate
                plt.figure(figsize=(14, 8))
                utm_filtered['conversion_rate'].head(10).plot(kind='bar')
                plt.title(f'Conversion Rate by {utm_col} (Top 10)')
                plt.ylabel('Conversion Rate (%)')
                plt.xticks(rotation=45, ha='right')
                plt.tight_layout()
                plt.savefig(f'eda_results/{utm_col}_conversion.png')
            except Exception as e:
                print(f"Error analyzing UTM column {utm_col}: {e}")
                
    # Alternativa caso a abordagem anterior falhe
    # Esta é uma abordagem mais robusta para processamento de UTMs
    def analyze_utm_safely(df, utm_col):
        """Print conversion rates for each value of one UTM column.

        Groups ``df`` by ``utm_col``, counts the rows and sums ``target`` in
        every group, keeps the groups with at least 50 rows and prints the ten
        with the highest conversion rate. Nothing is returned.

        The call does nothing when ``utm_col`` is not a column of ``df`` or
        when that column has 100 or more distinct values. Any exception raised
        while aggregating is reported on stdout instead of propagating.

        Args:
            df: DataFrame containing ``utm_col`` and a ``target`` column
                (presumably a 0/1 conversion flag; verify against caller).
            utm_col: Name of the column to group by.
        """
        if utm_col not in df.columns or df[utm_col].nunique() >= 100:
            return

        try:
            grouped = df.groupby(utm_col)

            # Build the per-value metrics explicitly from the group object.
            result_df = pd.DataFrame({
                'count': grouped.size(),
                'conversions': grouped['target'].sum()
            })
            result_df['conversion_rate'] = (result_df['conversions'] / result_df['count']) * 100

            # Drop thinly populated values, then rank by conversion rate.
            filtered = result_df[result_df['count'] >= 50]
            filtered = filtered.sort_values('conversion_rate', ascending=False)

            if not filtered.empty:
                print(f"\nConversion rates by {utm_col} (alternative method, min 50 records):")
                for utm_val, row in filtered.head(10).iterrows():
                    utm_display = str(utm_val)[:30] + '...' if isinstance(utm_val, str) and len(str(utm_val)) > 30 else utm_val
                    if pd.isna(utm_display):
                        utm_display = 'NaN'
                    # iterrows() upcasts the row to float, so format the
                    # counts without decimals instead of printing "30.0".
                    print(f"  {utm_display}: {row['conversion_rate']:.2f}% ({row['conversions']:.0f} conversions out of {row['count']:.0f} records)")
        except Exception as e:
            # Best-effort reporting: log the failure and return. The handler
            # must only use names local to this function; the previous version
            # read `utm_filtered` from the enclosing scope, which could be
            # undefined or belong to a different column.
            print(f"Alternative UTM analysis for {utm_col} failed: {e}")
    
    # Temporal patterns analysis
    date_cols = [col for col in merged_data.columns 
                if any(pattern in col.lower() for pattern in ['date', 'time', 'fecha', 'data', 'timestamp', 'marca'])]
    
    if date_cols:
        print("\nAnalyzing temporal patterns...")
        
        # Process timestamp columns
        for date_col in date_cols:
            try:
                # Try to convert to datetime
                if merged_data[date_col].dtype == 'object':
                    merged_data[f'{date_col}_dt'] = pd.to_datetime(merged_data[date_col], errors='coerce')
                else:
                    # If already numeric, might be a timestamp
                    merged_data[f'{date_col}_dt'] = pd.to_datetime(merged_data[date_col], unit='s', errors='coerce')
                
                if merged_data[f'{date_col}_dt'].notna().any():
                    print(f"Successfully converted {date_col} to datetime")
                    
                    # Extract time components
                    merged_data[f'{date_col}_hour'] = merged_data[f'{date_col}_dt'].dt.hour
                    merged_data[f'{date_col}_day'] = merged_data[f'{date_col}_dt'].dt.day_name()
                    merged_data[f'{date_col}_month'] = merged_data[f'{date_col}_dt'].dt.month_name()
                    
                    # Analyze hourly pattern
                    if 'target' in merged_data.columns:
                        hour_conversion = merged_data.groupby(f'{date_col}_hour')['target'].agg(['sum', 'count', 'mean'])
                        hour_conversion['conversion_rate'] = hour_conversion['mean'] * 100
                        
                        print(f"\nHourly conversion rates from {date_col}:")
                        for hour in sorted(hour_conversion.index):
                            data = hour_conversion.loc[hour]
                            print(f"  Hour {hour}: {data['conversion_rate']:.2f}% ({data['sum']} conversions out of {data['count']} records)")
                        
                        # Plot hourly conversion rate
                        plt.figure(figsize=(12, 6))
                        hour_conversion['conversion_rate'].plot()
                        plt.title(f'Conversion Rate by Hour ({date_col})')
                        plt.xlabel('Hour of Day')
                        plt.ylabel('Conversion Rate (%)')
                        plt.xticks(range(0, 24, 2))
                        plt.grid(True)
                        plt.tight_layout()
                        plt.savefig(f'eda_results/{date_col}_hourly_conversion.png')
                        
                        # Analyze day of week pattern
                        day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
                        day_conversion = merged_data.groupby(f'{date_col}_day')['target'].agg(['sum', 'count', 'mean'])
                        day_conversion['conversion_rate'] = day_conversion['mean'] * 100
                        
                        # Reindex to get days in correct order
                        day_conversion = day_conversion.reindex(day_order)
                        
                        print(f"\nDay of week conversion rates from {date_col}:")
                        for day in day_order:
                            if day in day_conversion.index:
                                data = day_conversion.loc[day]
                                print(f"  {day}: {data['conversion_rate']:.2f}% ({data['sum']} conversions out of {data['count']} records)")
                        
                        # Plot day of week conversion rate
                        plt.figure(figsize=(12, 6))
                        day_conversion['conversion_rate'].plot(kind='bar')
                        plt.title(f'Conversion Rate by Day of Week ({date_col})')
                        plt.ylabel('Conversion Rate (%)')
                        plt.tight_layout()
                        plt.savefig(f'eda_results/{date_col}_daily_conversion.png')
            except Exception as e:
                print(f"Error processing date column {date_col}: {e}")
    
    # Enhanced Launch Analysis (L16-L21)
    print("\n=============================================")
    print("5. ENHANCED LAUNCH/COHORT ANALYSIS")
    print("=============================================\n")
    
    # Check if we have the launch column from previous processing
    launch_col = None
    if 'lançamento' in merged_data.columns:
        launch_col = 'lançamento'
    elif 'lancamento' in merged_data.columns:
        launch_col = 'lancamento'
    
    if launch_col is None:
        # Try to infer launch information from other columns
        # Look for patterns L16, L17, etc. in columns
        merged_data['launch_cohort'] = None
        launch_patterns = ['L16', 'L17', 'L18', 'L19', 'L20', 'L21']
        
        for pattern in launch_patterns:
            pattern_mask = False
            for col in merged_data.columns:
                if merged_data[col].dtype == 'object':
                    pattern_mask |= merged_data[col].astype(str).str.contains(pattern, case=False, regex=False).fillna(False)
            
            if pattern_mask.any():
                merged_data.loc[pattern_mask, 'launch_cohort'] = pattern
        
        launch_col = 'launch_cohort'
    
    # Ensure we have a valid launch column with data
    if launch_col in merged_data.columns and merged_data[launch_col].notna().any():
        print(f"Using launch column: {launch_col}")
        
        # Basic statistics by launch
        launch_stats = merged_data.groupby(launch_col).size().reset_index(name='records')
        launch_stats = launch_stats.sort_values(by=launch_col)
        
        print("\nRecord distribution by launch:")
        for _, row in launch_stats.iterrows():
            launch = row[launch_col]
            count = row['records']
            if pd.notna(launch):
                print(f"  {launch}: {count} records ({count/len(merged_data)*100:.2f}%)")
        
        # Plot launch distribution
        plt.figure(figsize=(10, 6))
        ax = sns.barplot(x=launch_col, y='records', data=launch_stats[launch_stats[launch_col].notna()])
        plt.title('Record Distribution by Launch')
        plt.ylabel('Number of Records')
        plt.tight_layout()
        plt.savefig('eda_results/launch_distribution.png')
        
        # Target conversion by launch
        if 'target' in merged_data.columns:
            launch_conversion = merged_data.groupby(launch_col)['target'].agg(['sum', 'count', 'mean'])
            launch_conversion['conversion_rate'] = launch_conversion['mean'] * 100
            
            print("\nConversion rates by launch:")
            for launch, data in launch_conversion.iterrows():
                if pd.notna(launch):
                    print(f"  {launch}: {data['conversion_rate']:.2f}% ({data['sum']} conversions out of {data['count']} records)")
            
            # Plot conversion rate by launch
            plt.figure(figsize=(10, 6))
            launch_conversion = launch_conversion.sort_index()
            launch_conversion['conversion_rate'].plot(kind='bar')
            plt.title('Conversion Rate by Launch')
            plt.ylabel('Conversion Rate (%)')
            plt.tight_layout()
            plt.savefig('eda_results/launch_conversion_rate.png')
        
        # Category shifts over time (across launches)
        print("\nAnalyzing category shifts across launches...")
        
        # Select categorical columns for trend analysis
        trend_cols = [col for col in categorical_cols if merged_data[col].nunique() > 1 and merged_data[col].nunique() < 20]
        
        for col in trend_cols[:5]:  # Limit to top 5 to avoid too many charts
            try:
                # Calculate value distribution by launch
                cat_by_launch = pd.crosstab(merged_data[launch_col], merged_data[col], normalize='index') * 100
                
                # Skip if too sparse
                if cat_by_launch.shape[1] <= 1 or cat_by_launch.dropna().empty:
                    continue
                    
                print(f"\nCategory shifts for {col} across launches:")
                
                # Print top categories for each launch
                for launch in cat_by_launch.index:
                    if pd.isna(launch):
                        continue
                    
                    top_cats = cat_by_launch.loc[launch].nlargest(5)
                    print(f"  {launch} top categories:")
                    for cat, pct in top_cats.items():
                        cat_display = str(cat)[:30] + '...' if isinstance(cat, str) and len(str(cat)) > 30 else cat
                        print(f"    - {cat_display}: {pct:.2f}%")
                
                # Plot trend of top categories
                plt.figure(figsize=(12, 8))
                for cat in cat_by_launch.columns[:5]:  # Top 5 categories
                    cat_by_launch[cat].plot(marker='o', label=str(cat)[:20])
                
                plt.title(f'Category Shift: {col} Distribution by Launch')
                plt.ylabel('Percentage (%)')
                plt.grid(True, linestyle='--', alpha=0.7)
                plt.legend(title='Categories')
                plt.tight_layout()
                plt.savefig(f'eda_results/category_shift_{col}.png')
            except Exception as e:
                print(f"Error analyzing category shift for {col}: {e}")
        
        # Profile differences between launches
        print("\nProfiling differences between launches...")
        
        numeric_by_launch = {}
        for col in numeric_cols:
            if col != 'target' and not ('id' in col.lower()):
                try:
                    # Calculate mean of numeric column by launch
                    means = merged_data.groupby(launch_col)[col].mean()
                    if not means.isna().all():
                        numeric_by_launch[col] = means
                except Exception as e:
                    pass
        
        if numeric_by_launch:
            numeric_trends = pd.DataFrame(numeric_by_launch)
            
            # Print significant shifts
            print("\nSignificant numeric shifts between launches:")
            for col in numeric_trends.columns:
                min_val = numeric_trends[col].min()
                max_val = numeric_trends[col].max()
                
                # Check if there's meaningful variation (avoid near-constant values)
                if max_val - min_val > 0.1 * min_val:
                    print(f"\n  {col}:")
                    for launch, val in numeric_trends[col].items():
                        if pd.notna(launch) and pd.notna(val):
                            print(f"    - {launch}: {val:.2f}")
            
            # Create a heatmap of numeric trends
            plt.figure(figsize=(14, 10))
            
            # Normalize data for better visualization
            norm_data = (numeric_trends - numeric_trends.min()) / (numeric_trends.max() - numeric_trends.min())
            
            sns.heatmap(norm_data.T, annot=False, cmap='viridis', cbar_kws={'label': 'Normalized Value'})
            plt.title('Heatmap of Numeric Features by Launch')
            plt.tight_layout()
            plt.savefig('eda_results/launch_numeric_heatmap.png')
        
        # Time-to-conversion metrics
        if 'target' in merged_data.columns and date_cols:
            print("\nAnalyzing time-to-conversion metrics...")
            
            # Find survey date and purchase date columns
            survey_date_col = None
            purchase_date_col = None
            
            # Try to identify appropriate date columns
            for col in date_cols:
                if any(term in col.lower() for term in ['survey', 'pesquisa', 'respuesta']):
                    survey_date_col = f"{col}_dt" if f"{col}_dt" in merged_data.columns else col
                elif any(term in col.lower() for term in ['purchase', 'venda', 'compra']):
                    purchase_date_col = f"{col}_dt" if f"{col}_dt" in merged_data.columns else col
            
            # If we couldn't identify specifically, use the first date column as survey date
            if survey_date_col is None and date_cols:
                survey_date_col = f"{date_cols[0]}_dt" if f"{date_cols[0]}_dt" in merged_data.columns else date_cols[0]
            
            # Calculate time to conversion if we have both dates
            if survey_date_col and purchase_date_col and merged_data['target'].sum() > 0:
                # Make sure both are datetime
                if merged_data[survey_date_col].dtype != 'datetime64[ns]':
                    merged_data[survey_date_col] = pd.to_datetime(merged_data[survey_date_col], errors='coerce')
                if merged_data[purchase_date_col].dtype != 'datetime64[ns]':
                    merged_data[purchase_date_col] = pd.to_datetime(merged_data[purchase_date_col], errors='coerce')
                
                # Calculate time to conversion for converted users
                converters = merged_data[merged_data['target'] == 1].copy()
                converters['time_to_conversion'] = (converters[purchase_date_col] - converters[survey_date_col]).dt.total_seconds() / (3600 * 24)  # in days
                
                # Filter out invalid values
                valid_ttc = converters[converters['time_to_conversion'] >= 0]
                
                if len(valid_ttc) > 0:
                    ttc_stats = valid_ttc['time_to_conversion'].describe()
                    
                    print("\nTime to conversion statistics (in days):")
                    print(f"  Mean: {ttc_stats['mean']:.2f}")
                    print(f"  Median: {ttc_stats['50%']:.2f}")
                    print(f"  Min: {ttc_stats['min']:.2f}")
                    print(f"  Max: {ttc_stats['max']:.2f}")
                    
                    # Plot time to conversion distribution
                    plt.figure(figsize=(10, 6))
                    sns.histplot(valid_ttc['time_to_conversion'], bins=20, kde=True)
                    plt.title('Time to Conversion Distribution (Days)')
                    plt.xlabel('Days')
                    plt.xlim(0, min(ttc_stats['75%'] * 3, ttc_stats['max']))  # Limit x-axis to 3x the 75th percentile
                    plt.tight_layout()
                    plt.savefig('eda_results/time_to_conversion.png')
                    
                    # Time to conversion by launch
                    if launch_col in valid_ttc.columns:
                        ttc_by_launch = valid_ttc.groupby(launch_col)['time_to_conversion'].agg(['mean', 'median', 'count'])
                        
                        print("\nTime to conversion by launch (in days):")
                        for launch, data in ttc_by_launch.iterrows():
                            if pd.notna(launch) and data['count'] >= 5:  # At least 5 conversions
                                print(f"  {launch}: Mean = {data['mean']:.2f}, Median = {data['median']:.2f}, Count = {data['count']}")
                        
                        # Plot median time to conversion by launch
                        plt.figure(figsize=(10, 6))
                        ttc_by_launch = ttc_by_launch.sort_index()
                        ttc_by_launch['median'].plot(kind='bar')
                        plt.title('Median Time to Conversion by Launch')
                        plt.ylabel('Days')
                        plt.tight_layout()
                        plt.savefig('eda_results/ttc_by_launch.png')
        
        # Lead-flow analysis between launches using a Sankey diagram
        print("\nAnalisando fluxo de leads entre lançamentos...")

        # Verificar se temos informações suficientes para a análise
        if 'email_norm' in merged_data.columns and 'lançamento' in merged_data.columns and 'target' in merged_data.columns:
            try:
                import plotly.graph_objects as go
                import networkx as nx

                # 1. Identificar primeira aparição de cada lead
                # Ordenar dados para garantir que encontramos a primeira aparição
                # Assumindo que temos alguma coluna de data ou que os índices são cronológicos
                if date_cols:
                    date_col = date_cols[0]
                    if f"{date_col}_dt" in merged_data.columns:
                        date_col = f"{date_col}_dt"

                    # Criar coluna para ordenação
                    if merged_data[date_col].dtype != 'datetime64[ns]':
                        merged_data['temp_date'] = pd.to_datetime(merged_data[date_col], errors='coerce')
                    else:
                        merged_data['temp_date'] = merged_data[date_col]
                else:
                    # Se não temos data, usamos o índice como proxy para ordem cronológica
                    merged_data['temp_date'] = merged_data.index

                # Ordenar por email e data para pegar a primeira e última aparição de cada lead
                sorted_data = merged_data.sort_values(by=['email_norm', 'temp_date'])

                # Pegar a primeira aparição de cada lead em qualquer lançamento
                first_appearance = sorted_data.drop_duplicates(subset='email_norm', keep='first')[['email_norm', 'lançamento']]
                first_appearance.columns = ['email_norm', 'primeiro_lancamento']

                # Identificar onde cada lead converteu
                conversions = merged_data[merged_data['target'] == 1][['email_norm', 'lançamento']]
                conversions.columns = ['email_norm', 'lancamento_conversao']

                # Juntar informações de primeira aparição e conversão
                flow_data = pd.merge(first_appearance, conversions, on='email_norm', how='inner')

                # 2. Criar matriz de fluxo
                flow_matrix = pd.crosstab(
                    flow_data['primeiro_lancamento'], 
                    flow_data['lancamento_conversao'], 
                    dropna=False
                )

                # Preencher com zeros onde estiver faltando
                launch_list = sorted(merged_data['lançamento'].dropna().unique())
                for launch in launch_list:
                    if launch not in flow_matrix.index:
                        flow_matrix.loc[launch] = 0
                    if launch not in flow_matrix.columns:
                        flow_matrix[launch] = 0

                # Ordenar índices e colunas para melhor visualização
                flow_matrix = flow_matrix.reindex(index=launch_list, columns=launch_list, fill_value=0)

                # Mostrar matriz de fluxo
                print("\nMatriz de fluxo de leads entre lançamentos (origem → conversão):")
                print(flow_matrix)

                # 3. Criar dados para o diagrama Sankey
                source = []  # Lançamento de origem
                target = []  # Lançamento de conversão
                value = []   # Quantidade de leads

                # Criar mapeamento de lançamentos para índices
                launch_to_idx = {launch: i for i, launch in enumerate(launch_list)}

                # Preencher listas para o Sankey
                for origem in flow_matrix.index:
                    for destino in flow_matrix.columns:
                        if flow_matrix.loc[origem, destino] > 0:
                            source.append(launch_to_idx[origem])
                            target.append(launch_to_idx[destino] + len(launch_list))  # Offset para os nós de destino
                            value.append(float(flow_matrix.loc[origem, destino]))

                # Criar rótulos: primeiro os lançamentos de origem, depois os de destino
                labels = [f"{l} (Origem)" for l in launch_list] + [f"{l} (Conversão)" for l in launch_list]

                # Criar figura Sankey
                fig = go.Figure(data=[go.Sankey(
                    node=dict(
                        pad=15,
                        thickness=20,
                        line=dict(color="black", width=0.5),
                        label=labels,
                        color="blue"
                    ),
                    link=dict(
                        source=source,
                        target=target,
                        value=value
                    )
                )])

                fig.update_layout(
                    title_text="Fluxo de Leads entre Lançamentos",
                    font_size=10,
                    width=1000,
                    height=600
                )

                # Save the interactive version as HTML.
                sankey_html_path = "eda_results/lead_flow_sankey.html"
                sankey_png_path = "eda_results/lead_flow_sankey.png"
                fig.write_html(sankey_html_path)

                # The static image is best-effort: any failure is reported and the
                # run continues with only the interactive HTML available.
                try:
                    fig.write_image(sankey_png_path)
                    print("Diagrama Sankey salvo como 'eda_results/lead_flow_sankey.html' (interativo) e '.png' (estático)")
                except Exception as export_error:
                    print(f"Não foi possível salvar a imagem estática do diagrama Sankey: {export_error}")
                    print("O diagrama interativo foi salvo como 'eda_results/lead_flow_sankey.html'")
                    
                # 4. Additional analysis of conversions across launches
                print("\nResumo de conversões entre lançamentos:")

                # Conversions that happened in the launch of origin (main diagonal).
                # The matrix is not assumed to be square: only labels present on
                # both axes have a diagonal cell.
                same_launch_conv = sum(
                    flow_matrix.at[launch, launch]
                    for launch in flow_matrix.columns
                    if launch in flow_matrix.index
                )

                # Everything off the diagonal converted in a different launch.
                diff_launch_conv = flow_matrix.sum().sum() - same_launch_conv

                print(f"- Leads que converteram no mesmo lançamento: {same_launch_conv}")
                print(f"- Leads que converteram em um lançamento diferente: {diff_launch_conv}")

                if diff_launch_conv > 0:
                    # For each origin launch, show where its leads converted.
                    for origem in launch_list:
                        # A launch that only appears as a conversion destination has
                        # no row in the matrix; `.loc` would raise KeyError for it.
                        if origem not in flow_matrix.index:
                            continue

                        destinos = flow_matrix.loc[origem]
                        total_from_origin = destinos.sum()

                        if total_from_origin > 0:
                            # Keep only conversions that landed in other launches.
                            other_launches = destinos[destinos.index != origem]
                            moved_leads = other_launches.sum()
                            if moved_leads > 0:
                                print(f"\nDe {origem}, {moved_leads} lead(s) converteram em outros lançamentos:")
                                for destino, count in other_launches.items():
                                    if count > 0:
                                        # Share relative to all leads of this origin.
                                        pct = (count / total_from_origin) * 100
                                        print(f"  → {destino}: {count} leads ({pct:.1f}% dos leads de {origem})")

            except ImportError as e:
                print(f"Visualização Sankey requer bibliotecas adicionais: {e}")
                print("Execute 'pip install plotly' para habilitar esta visualização")

                # Simplified fallback without the Sankey diagram: only build and
                # print the origin -> conversion flow matrix.
                origin_launch = flow_data['primeiro_lancamento']
                conversion_launch = flow_data['lancamento_conversao']
                flow_matrix = pd.crosstab(origin_launch, conversion_launch, dropna=False)
                print("\nMatriz de fluxo de leads entre lançamentos (origem → conversão):")
                print(flow_matrix)

        else:
            print("Faltam colunas necessárias (email_norm, lançamento ou target) para análise de fluxo de leads.")
    
    # Closing banner of the EDA run, printed one line per call.
    for summary_line in (
        "\n=============================================",
        "EDA SUMMARY",
        "=============================================\n",
        "Exploratory Data Analysis completed successfully!",
        "Generated visualizations and insights saved to 'eda_results' directory.",
    ):
        print(summary_line)
    
    if 'target' in merged_data.columns:
        # Overall conversion figures: mean of `target` as a percentage, and its sum.
        conv_rate = merged_data['target'].mean() * 100
        print(f"\nOverall conversion rate: {conv_rate:.2f}%")
        print(f"Total conversions: {merged_data['target'].sum()} out of {len(merged_data)} records")

        # Summarize key insights.
        # The `in locals()` guards below check for result variables that are
        # presumably created by earlier analysis sections and may be absent.
        print("\nKey insights:")

        # Three strongest positive correlations with the target.
        if 'target_corr' in locals() and not target_corr.empty:
            top_positive = target_corr[target_corr > 0].drop('target', errors='ignore').nlargest(3)
            if not top_positive.empty:
                print("Top positive correlations with conversion:")
                for feat, corr in top_positive.items():
                    print(f"  - {feat}: {corr:.4f}")

        # Country analysis: first 5 rows of the filtered country table.
        if 'country_conversion_filtered' in locals() and not country_conversion_filtered.empty:
            top_countries = country_conversion_filtered.head(5)
            if not top_countries.empty:
                print("\nPaíses com melhor conversão (top 5):")
                for country, data in top_countries.iterrows():
                    # iterrows() upcasts each row to one dtype, so integer counts
                    # can arrive as floats; ':.0f' keeps them printed as whole numbers.
                    print(f"  - {country}: {data['conversion_rate']:.2f}% ({data['sum']:.0f} conversões em {data['count']:.0f} registros)")

        # Summarize UTM insights: take the first row of each "<utm_col>_conversion"
        # table that exists and is non-empty.
        utm_insights = []
        for utm_col in utm_cols:
            utm_data = locals().get(f"{utm_col}_conversion")
            if utm_data is None or utm_data.empty:
                continue
            for utm_val, data in utm_data.head(1).iterrows():
                utm_insights.append((utm_col, utm_val, data['conversion_rate'], data['count']))

        if utm_insights:
            print("\nTop converting UTM values:")
            # Rank the collected rows by conversion rate and show the best three.
            for utm_col, utm_val, rate, count in sorted(utm_insights, key=lambda x: x[2], reverse=True)[:3]:
                # Truncate long string values so each insight fits on one line.
                if isinstance(utm_val, str) and len(utm_val) > 30:
                    utm_display = utm_val[:30] + '...'
                else:
                    utm_display = utm_val
                print(f"  - {utm_col}: {utm_display} ({rate:.2f}%, {count:.0f} records)")

        # Summarize launch insights: three launches with the highest conversion rate.
        if 'launch_col' in locals() and launch_col in merged_data.columns:
            if 'launch_conversion' in locals() and not launch_conversion.empty:
                top_launches = launch_conversion.sort_values('conversion_rate', ascending=False).head(3)

                print("\nBest converting launches:")
                for launch, data in top_launches.iterrows():
                    # Skip rows with a missing launch label or a missing rate.
                    if pd.notna(launch) and pd.notna(data['conversion_rate']):
                        print(f"  - {launch}: {data['conversion_rate']:.2f}% ({data['sum']:.0f} conversions out of {data['count']:.0f} records)")
else:
    print("No data to analyze. Please ensure the dataset is loaded properly.")

# Restore the original stdout and save the report
sys.stdout = old_stdout

# Save the report as a text file. The output directory is created if it does
# not exist yet, so the write cannot fail just because no earlier step made it.
os.makedirs('eda_results', exist_ok=True)
with open('eda_results/eda_report.txt', 'w', encoding='utf-8') as report_file:
    report_file.write(report_content.getvalue())

print("EDA Report saved to 'eda_results/eda_report.txt'")