# End-to-End Machine Learning Workflow with KizenML, XAI, and Cloud Deployment

# Data preprocessing & Visualization using Auto EDA

In [None]:
!pip uninstall pylint

In [None]:
!pip install --upgrade pandas
!pip install seaborn missingno
!pip install sweetviz
!pip install --upgrade autoviz
!pip install --upgrade jinja2
!pip install featuretools
!pip install tidypy
!pip install pylint==2.11.1



In [None]:
# Import necessary libraries
import pandas as pd
import sweetviz as sv
import numpy as np
import warnings
import tidypy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import seaborn as sns
from autoviz.AutoViz_Class import AutoViz_Class
import missingno as msno

  from pandas.core import (


In [None]:
# Hide all warnings
warnings.filterwarnings('ignore')

# Data Collection and Preprocessing (4 Marks):

In [None]:
# Load the Titanic dataset (CSV hosted in the datasciencedojo/datasets GitHub repository)
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic_df = pd.read_csv(url)

# Inspect the data: first rows, then the per-column summary
print(titanic_df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted an extra "None" line.
titanic_df.info()



# Detecting issues using TidyPy

In [None]:
# Run tidypy to check the dataset for any issues
# NOTE(review): TidyPy is, as far as I know, a linter aggregator for Python project
# source code rather than a data-quality checker — confirm that `tidypy.check` exists
# in the installed version and accepts a list of paths.
# NOTE(review): no cell visible in this notebook writes 'titanic_df.csv' to disk (the
# data is read from a URL into `titanic_df`) — verify this path exists at run time.
issues = tidypy.check(['titanic_df.csv'])

# Print detected issues
print(issues)

# Data Cleaning

In [None]:
# Drop columns that won't be used for prediction.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are already
# gone and a plain drop() would raise KeyError.
titanic_df = titanic_df.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],
    errors='ignore',
)

# Column groups consumed by the preprocessing pipelines defined later in the notebook.
# No imputation happens in this cell: the pipelines impute numerical columns with the
# median and categorical columns with the most frequent value.
numerical_features = ['Age', 'Fare']
categorical_features = ['Embarked', 'Sex']

# Feature Engineering

In [None]:
# Create new features (e.g., family size)
titanic_df['FamilySize'] = titanic_df['SibSp'] + titanic_df['Parch']

# Drop SibSp and Parch as FamilySize is more informative
titanic_df = titanic_df.drop(columns=['SibSp', 'Parch'])


# Preprocessing Pipelines

In [None]:

# Numerical pipeline: impute missing values with the median, then standardise
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' lets transform() accept categories that were not seen in fit().
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine both pipelines using ColumnTransformer.
# NOTE(review): `remainder` is left at its default ('drop'), so any column of X that is
# not listed in numerical_features / categorical_features — e.g. the engineered
# FamilySize — is discarded here. Confirm that this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

# Split the data into features (X) and target (y)
X = titanic_df.drop(columns=['Survived'])
y = titanic_df['Survived']

# Split BEFORE fitting the preprocessor. Fitting on the full dataset would let the
# imputation values and scaling statistics be computed from test rows (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn imputation/scaling/encoding from the training rows only, then apply the fitted
# transformer unchanged to the held-out rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept so later cells that reference X_preprocessed still work: the whole dataset
# transformed with the train-fitted preprocessor.
X_preprocessed = preprocessor.transform(X)

# Output the transformed features
print("Preprocessed Train Data Shape:", X_train.shape)
print("Preprocessed Test Data Shape:", X_test.shape)

# Auto EDA using Sweetviz

In [None]:
# Generate a Sweetviz report
report = sv.analyze(titanic_df)

# Save the report to an HTML file
report.show_html("titanic_sweetviz_report.html")

# Auto EDA using AutoViz

In [None]:
# Run AutoViz on the in-memory dataframe: the filename is left empty and the frame is
# passed through `dfte`; no dependent variable is named (depVar='').
autoviz = AutoViz_Class()
autoviz_report = autoviz.AutoViz(
    filename='',
    dfte=titanic_df,
    depVar='',
    verbose=0,
)

# Automated Feature Engineering with Featuretools

**Automated feature engineering:** the code below uses Featuretools to generate new features automatically from the dataset, based on the relationships between its columns.

- **EntitySet:** structures the data for feature synthesis.
- **Deep Feature Synthesis (DFS):** automatically creates new features that are combinations or transformations of the original data.

In [None]:
import featuretools as ft
import pandas as pd

# Read the CSV again: this rebinds titanic_df to a freshly loaded frame that still has
# the PassengerId column, which is used as the EntitySet index below.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic_df = pd.read_csv(url)

# Report the column count and names of the freshly loaded frame
num_features = len(titanic_df.columns)
print(f"Number of features: {num_features}")
print("Column names:", titanic_df.columns)

# Create the EntitySet and register the dataframe in it, indexed by PassengerId
es = ft.EntitySet(id='data').add_dataframe(
    dataframe_name='data',
    dataframe=titanic_df,
    index='PassengerId',
)

# Run Deep Feature Synthesis against the registered dataframe
feature_matrix, feature_defs = ft.dfs(
    target_dataframe_name='data',
    entityset=es,
)

# Report the column count and names of the DFS output
print('After selecting features')
num_features = len(feature_matrix.columns)
print(f"Number of features: {num_features}")
print("Column names:", feature_matrix.columns)
