<a href="https://colab.research.google.com/github/samer-glitch/Trustworthy-AI-Data-Pipeline-Framework/blob/main/7_Data_Logging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import logging
import os

# Configure the root logger so that every later cell writes INFO-level
# records into a single plain-text report file.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Detach AND close handlers left over from an earlier run of this cell.
# Clearing the handler list alone would leave the previous FileHandler's
# stream open, leaking one file handle per re-run; it would also risk
# duplicated or differently formatted log lines.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# File handler for the report; mode='w' truncates the file on every run.
log_file_path = '/content/data_pipeline_report.txt'
file_handler = logging.FileHandler(log_file_path, mode='w')
file_handler.setLevel(logging.INFO)

# Record layout: "<timestamp> - <level> - <message>", second precision.
formatter = logging.Formatter(
    '%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
file_handler.setFormatter(formatter)

# Attach the handler so records actually reach the file.
logger.addHandler(file_handler)

# First record: marks the start of the pipeline log.
logger.info('Data Preprocessing Pipeline Log')

In [None]:
import pandas as pd

# Read the Titanic CSV straight from GitHub and record its origin and size.
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(url)
logger.info('Dataset Loaded')
# Arguments are passed separately so the logging module does the formatting.
logger.info('Dataset loaded from: %s', url)
logger.info('Initial dataset shape: %s', str(df.shape))

# Log the leading rows (without the index column) as a quick sanity check.
logger.info('Initial Data Snapshot')
initial_snapshot = df.head(4).to_string(index=False)
logger.info('\n%s', initial_snapshot)

In [None]:
# Impute missing 'Age' values with the column mean, logging the null count
# before and after so the effect of the step is visible in the report.
missing_values_before = df['Age'].isna().sum()
mean_age_value = df['Age'].mean()  # computed before filling, so NaNs are excluded

logger.info('Handling Missing Values')
logger.info('Initial missing values in "Age" column: %d', missing_values_before)

df['Age'] = df['Age'].fillna(mean_age_value)
logger.info('Missing values in "Age" column filled with mean value: %.2f', mean_age_value)

missing_values_after = df['Age'].isna().sum()
logger.info('Remaining missing values in "Age" column after imputation: %d', missing_values_after)

In [None]:
# Drop fully duplicated rows and log how many rows the step removed.
initial_row_count = len(df)
df = df.drop_duplicates()
final_row_count = len(df)
duplicates_removed = initial_row_count - final_row_count

logger.info('Removing Duplicates')
logger.info(
    'Duplicates removed: %d. Rows before: %d, Rows after: %d',
    duplicates_removed,
    initial_row_count,
    final_row_count,
)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Replace the categorical columns 'Sex', 'Embarked' and 'Cabin' with
# one-hot indicator columns.
logger.info('One-Hot Encoding Categorical Columns')
categorical_columns = ['Sex', 'Embarked', 'Cabin']

# Dense output, returned as a DataFrame with named columns.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.set_output(transform='pandas')

# Fit on the categorical slice, then cast the float indicators to
# integers for a cleaner representation.
encoded_df = encoder.fit_transform(df[categorical_columns]).astype(int)

# Record which indicator columns the encoder produced.
logger.info('Columns after one-hot encoding: %s', encoded_df.columns.tolist())

# Swap the raw categorical columns for their encoded counterparts.
remaining_df = df.drop(columns=categorical_columns)
df = pd.concat([remaining_df, encoded_df], axis=1)

# Log the leading rows of the encoded dataset.
logger.info('Dataset after one-hot encoding:\n%s', df.head(4).to_string(index=False))

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise 'Age' and 'Fare' with StandardScaler, logging summary
# statistics on both sides of the transformation.
numerical_columns = ['Age', 'Fare']
logger.info('Scaling Numerical Features')

logger.info('Summary statistics before scaling:')
stats_before = df[numerical_columns].describe().to_string()
logger.info('\n%s', stats_before)

# Fit on the two columns and write the scaled values back in place.
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
logger.info('Numerical features scaled using StandardScaler.')

logger.info('Summary statistics after scaling:')
stats_after = df[numerical_columns].describe().to_string()
logger.info('\n%s', stats_after)

In [None]:
from imblearn.over_sampling import SMOTE

# Balance the 'Survived' classes with SMOTE and log the class counts
# before and after resampling.

# Free-text columns cannot be fed to SMOTE, so they are removed first.
non_numerical_columns = ['Name', 'Ticket']
df = df.drop(columns=non_numerical_columns)

logger.info('Handling Class Imbalance with SMOTE')

# Split into feature matrix and target vector.
# NOTE(review): every remaining column except the target ends up in X,
# including 'PassengerId' — confirm it is meant to be used as a feature.
target = 'Survived'
y = df[target]
X = df.drop(columns=[target])

# Class counts before resampling.
logger.info('Original target class distribution:\n%s', y.value_counts().to_string())

# Fixed seed keeps the synthetic samples reproducible.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Class counts after resampling.
resampled_counts = pd.Series(y_resampled).value_counts()
logger.info('Resampled target class distribution:\n%s', resampled_counts.to_string())

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Tune a random forest over a small grid (3 x 3 combinations) using
# 5-fold cross-validated accuracy, then log the winning configuration.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10],
}
rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    rf_classifier,
    param_grid,
    cv=5,
    scoring='accuracy',
)

# NOTE(review): the search is fitted on the SMOTE-resampled data, so the
# validation folds may contain synthetic rows — confirm this is acceptable.
grid_search.fit(X_resampled, y_resampled)

logger.info('Hyperparameter Tuning and Cross-Validation')
logger.info('Performed hyperparameter tuning using GridSearchCV.')
logger.info('Best parameters found: %s', str(grid_search.best_params_))
# best_score_ is a fraction; report it as a percentage.
logger.info('Best cross-validation score: %.2f%%', grid_search.best_score_ * 100)

In [None]:
# Final step: flush the log to disk, echo it in the notebook output, and
# offer it as a browser download when running inside Google Colab.

# Push any buffered records to disk so the read below sees the whole log.
for handler in logger.handlers:
    handler.flush()

# The download helper only exists inside Colab. Guard the import so the
# cell still prints the log (instead of raising) in any other environment.
try:
    from google.colab import files
except ImportError:
    files = None

# Check once; both the display and the download depend on the same answer.
log_file_found = os.path.exists(log_file_path)

# Read and print the content of the log file.
if log_file_found:
    with open(log_file_path, 'r') as file:
        log_content = file.read()
    print("----- Log File Content -----")
    print(log_content)
else:
    print("Log file not found")

# Attempt to download the log file if it exists.
if not log_file_found:
    print("Cannot download the log file as it wasn't found.")
elif files is None:
    print("google.colab is not available; skipping log file download.")
else:
    files.download(log_file_path)

----- Log File Content -----
2024-10-30 07:42:38 - INFO - Data Preprocessing Pipeline Log
2024-10-30 07:43:17 - INFO - Dataset Loaded
2024-10-30 07:43:17 - INFO - Dataset loaded from: https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv
2024-10-30 07:43:17 - INFO - Initial dataset shape: (891, 12)
2024-10-30 07:43:17 - INFO - Initial Data Snapshot
2024-10-30 07:43:17 - INFO - 
 PassengerId  Survived  Pclass                                                Name    Sex  Age  SibSp  Parch           Ticket    Fare Cabin Embarked
           1         0       3                             Braund, Mr. Owen Harris   male 22.0      1      0        A/5 21171  7.2500   NaN        S
           2         1       1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38.0      1      0         PC 17599 71.2833   C85        C
           3         1       3                              Heikkinen, Miss. Laina female 26.0      0      0 STON/O2. 3101282  7.9250   NaN        S

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>