In [None]:
import unittest
import pandas as pd
from datetime import datetime
from unittest.mock import Mock, patch, MagicMock
from log_pipeline import ParquetReader, ValidatorProcessor, UpsertWriter, LogPipeline

In [None]:
# ---------------------------------------------------------------------------
# Shared DataFrame fixtures used by the test cases in this notebook.
# ---------------------------------------------------------------------------

# A single fixed timestamp keeps the fixtures identical on every run and on
# every row; calling datetime.now() per row yields values that differ by
# microseconds and change between executions.
_FIXTURE_UPDATED_AT = datetime(2024, 10, 19, 10, 0, 0)

# Row count of the large upsert fixture (test_data5).
_LARGE_FIXTURE_ROWS = 2500

# One complete log record with every column populated. test_data_1 and
# test_data_2 are both built from it so the two cannot drift apart.
_complete_record = {
    'user_id': [1.0],
    'subscriber_id': [123.0],
    'session_id': ['session_1'],
    'hotel_id': [101],
    'request_id': ['req_1'],
    'funnel_id': ['funnel_1'],
    'timestamp': ['2024-10-19 10:00:00'],
    'country': ['TR'],
    'hotel_price': [100.50],
    'currency': ['USD'],
    'payment_status': ['success'],
    'confirmation_number': ['CONF123'],
    'user_agent': ['Mozilla/5.0'],
    'device_type': ['desktop'],
    'ip_address': ['192.168.1.1'],
    'utm_source': ['google'],
    'page_name': ['search'],
    'search_query': ['istanbul hotel'],
    'destination_id': [456.0],
    'num_guests': [2.0],
    'has_email_contact_permission': [True],
    'has_phone_contact_permission': [False],
}

# Fully populated single-row record.
test_data_1 = pd.DataFrame(_complete_record)

# Same record as test_data_1, except the timestamp is not a date string.
# Overriding an existing key keeps the column in its original position.
test_data_2 = pd.DataFrame({**_complete_record, 'timestamp': ['invalid_date']})

# Reduced record: identifiers and timestamp only, no subscriber, country,
# permission, payment or device columns.
test_data3 = pd.DataFrame({
    'user_id': [1.0],
    'session_id': ['session_1'],
    'hotel_id': [101],
    'request_id': ['req_1'],
    'funnel_id': ['funnel_1'],
    'timestamp': ['2024-10-19 10:00:00'],
})

# Small two-row frame for the basic upsert test.
test_data_4 = pd.DataFrame({
    'user_id': [1, 2],
    'country': ['TR', 'US'],
    'updated_Date': [_FIXTURE_UPDATED_AT, _FIXTURE_UPDATED_AT],
})

# Large frame (2500 rows, user_id 1..2500) for the chunked upsert test.
test_data5 = pd.DataFrame({
    'user_id': list(range(1, _LARGE_FIXTURE_ROWS + 1)),
    'country': ['TR'] * _LARGE_FIXTURE_ROWS,
    'updated_Date': [_FIXTURE_UPDATED_AT] * _LARGE_FIXTURE_ROWS,
})

In [None]:
class TestParquetReader(unittest.TestCase):
    
    @patch('pandas.read_parquet')
    def test_read_all(self, mock_read_parquet):
        # Mock data
        mock_df = pd.DataFrame({
            'user_id': [1, 2],
            'session_id': ['s1', 's2'],
            'hotel_id': [101, 102]
        })
        mock_read_parquet.return_value = mock_df
        
        reader = ParquetReader('dummy_path.parquet')
        result = reader.read_all()
        
        self.assertEqual(len(result), 2)
        mock_read_parquet.assert_called_once_with('dummy_path.parquet')

    @patch('pandas.read_parquet')
    def test_read_in_batches(self, mock_read_parquet):
        # Mock data
        mock_df = pd.DataFrame({
            'user_id': [1, 2, 3, 4, 5],
            'session_id': ['s1', 's2', 's3', 's4', 's5'],
            'hotel_id': [101, 102, 103, 104, 105]
        })
        mock_read_parquet.return_value = mock_df
        
        reader = ParquetReader('dummy_path.parquet')
        batches = list(reader.read_in_batches(batch_size=2))
        
        self.assertEqual(len(batches), 3)  # 5 records in batches of 2 = 3 batches
        self.assertEqual(len(batches[0]), 2)
        self.assertEqual(len(batches[1]), 2) 
        self.assertEqual(len(batches[2]), 1)

class TestValidatorProcessor(unittest.TestCase):
    
    def setUp(self):
        self.validator = ValidatorProcessor()
        
    def test_validate_valid_data(self):
        # Valid test data 
        users, sessions, events, hotels, payments = self.validator.validate(test_data_1)
        
        self.assertEqual(len(users), 1)
        self.assertEqual(len(sessions), 1)
        self.assertEqual(len(events), 1)
        self.assertEqual(len(hotels), 1)
        self.assertEqual(len(payments), 1)
        
    def test_validate_invalid_timestamp(self):
        # Invalid timestamp should be filtered out
        users, sessions, events, hotels, payments = self.validator.validate(test_data_2)
        # Should have no valid events due to invalid timestamp
        self.assertEqual(len(events), 0)
        # But other tables should still have data
        self.assertEqual(len(users), 1)
        self.assertEqual(len(sessions), 1)

    def test_validate_minimal_data(self):
        # Test with minimal required fields only
        users, sessions, events, hotels, payments = self.validator.validate(test_data3)
        # Should fail validation due to missing required fields
        self.assertEqual(len(users), 0)  # Missing subscriber_id, permissions
        self.assertEqual(len(events), 0)  # Missing some fields
        self.assertEqual(len(sessions), 0)  # Missing some fields

class TestUpsertWriter(unittest.TestCase):
    
    def setUp(self):
        self.mock_engine = Mock()
        self.writer = UpsertWriter(self.mock_engine)
        
    def test_upsert_empty_dataframe(self):
        # Empty DataFrame should not execute any SQL
        empty_df = pd.DataFrame()
        self.writer.upsert_df(empty_df, 'users')
        self.mock_engine.begin.assert_not_called()
        
    def test_upsert_valid_dataframe(self):
        # Valid DataFrame should execute SQL
        # Properly mock the context manager
        mock_conn = Mock()
        mock_context = MagicMock()
        mock_context.__enter__.return_value = mock_conn
        mock_context.__exit__.return_value = None
        self.mock_engine.begin.return_value = mock_context
        self.writer.upsert_df(test_data_4, 'users')
        self.mock_engine.begin.assert_called_once()
        mock_conn.execute.assert_called()
        
    def test_upsert_with_chunking(self):
        # Test chunking functionality
        # Create a DataFrame larger than chunk size
        mock_conn = Mock()
        mock_context = MagicMock()
        mock_context.__enter__.return_value = mock_conn
        mock_context.__exit__.return_value = None
        self.mock_engine.begin.return_value = mock_context
        self.writer.upsert_df(test_data5, 'users', chunk_size=1000)
        # Should call execute 3 times (2500 / 1000 = 3 chunks)
        self.assertEqual(mock_conn.execute.call_count, 3)

class TestLogPipeline(unittest.TestCase):
    
    def setUp(self):
        self.mock_engine = Mock()
        
    @patch('log_pipeline.ParquetReader')
    @patch('log_pipeline.ValidatorProcessor')
    @patch('log_pipeline.UpsertWriter')
    def test_run_once(self, mock_writer_class, mock_validator_class, mock_reader_class):
        # Setup mocks
        mock_reader = Mock()
        mock_validator = Mock()
        mock_writer = Mock()
        
        mock_reader_class.return_value = mock_reader
        mock_validator_class.return_value = mock_validator
        mock_writer_class.return_value = mock_writer
        
        # Mock data
        mock_df = pd.DataFrame({
            'user_id': [1.0],
            'session_id': ['s1'],
            'hotel_id': [101],
            'request_id': ['req1'],
            'funnel_id': ['f1']
        })
        mock_reader.read_all.return_value = mock_df
        
        # Mock validator returns
        mock_validator.validate.return_value = (
            pd.DataFrame({'user_id': [1]}),  # users
            pd.DataFrame({'session_id': ['s1']}),  # sessions  
            pd.DataFrame({'request_id': ['req1']}),  # events
            pd.DataFrame({'hotel_id': [101]}),  # hotels
            pd.DataFrame({'request_id': ['req1']})  # payments
        )
        
        # Create pipeline and run
        pipeline = LogPipeline('dummy_path.parquet', self.mock_engine)
        pipeline.run_once()
        
        # Verify calls
        mock_reader.read_all.assert_called_once()
        mock_validator.validate.assert_called_once()
        self.assertEqual(mock_writer.upsert_df.call_count, 5)  # 5 tables

if __name__ == '__main__':
    # Run every TestCase defined above from inside the notebook kernel.
    runner_options = {
        'argv': [''],      # use this argv instead of the process's own sys.argv
        'exit': False,     # report results without calling sys.exit()
        'verbosity': 2,    # print one line per test
    }
    unittest.main(**runner_options)

test_run_once (__main__.TestLogPipeline) ... ok
test_read_all (__main__.TestParquetReader) ... ok
test_read_in_batches (__main__.TestParquetReader) ... ok
test_upsert_empty_dataframe (__main__.TestUpsertWriter) ... ok
test_upsert_valid_dataframe (__main__.TestUpsertWriter) ... ok
test_upsert_with_chunking (__main__.TestUpsertWriter) ... ok
test_validate_invalid_timestamp (__main__.TestValidatorProcessor) ... ok
test_validate_minimal_data (__main__.TestValidatorProcessor) ... ok
test_validate_valid_data (__main__.TestValidatorProcessor) ... 

Validation error: 1 validation error for EventModel
timestamp
  Value error, timestamp geçersiz [type=value_error, input_value='invalid_date', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/value_error
Validation error: 4 validation errors for UserModel
subscriber_id
  Field required [type=missing, input_value={'user_id': 1.0, 'session...: '2024-10-19 10:00:00'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
country
  Field required [type=missing, input_value={'user_id': 1.0, 'session...: '2024-10-19 10:00:00'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
has_email_contact_permission
  Field required [type=missing, input_value={'user_id': 1.0, 'session...: '2024-10-19 10:00:00'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
has_phone_contact_permission
  Field required [type=missing, input_value={'user_i

ok

----------------------------------------------------------------------
Ran 9 tests in 0.017s

OK
