In [5]:
# Install the necessary libraries
!pip install pandas sqlalchemy openpyxl pymysql

import pandas as pd
import os
from sqlalchemy import create_engine

# Build one small schedule table per zone and persist each one in a different
# storage format (CSV, Excel, SQLite) so the loaders below have input to read.
_SCHEDULE_COLUMNS = [
    'Train Number',
    'Train Name',
    'Departure Time',
    'Arrival Time',
    'Source',
    'Destination',
]

# Zone 1 -> CSV file
csv_data = pd.DataFrame(
    [
        (12345, 'Rajdhani Express', '06:00', '18:00', 'Delhi', 'Mumbai'),
        (67890, 'Shatabdi Express', '08:30', '14:00', 'Mumbai', 'Chennai'),
        (13579, 'Duronto Express', '22:15', '06:45', 'Kolkata', 'Delhi'),
    ],
    columns=_SCHEDULE_COLUMNS,
)
csv_data.to_csv("train_schedule_zone1.csv", index=False)

# Zone 2 -> Excel workbook
excel_data = pd.DataFrame(
    [
        (24680, 'Garib Rath', '09:15', '21:30', 'Bangalore', 'Kolkata'),
        (11223, 'Jan Shatabdi', '11:45', '19:20', 'Pune', 'Delhi'),
        (44556, 'Tejas Express', '17:30', '23:50', 'Hyderabad', 'Mumbai'),
    ],
    columns=_SCHEDULE_COLUMNS,
)
excel_data.to_excel("train_schedule_zone2.xlsx", index=False)

# Zone 3 -> table in a local SQLite database (replaced if it already exists)
engine = create_engine("sqlite:///train_schedule.db")
sql_data = pd.DataFrame(
    [
        (55678, 'Vande Bharat', '05:45', '15:20', 'Jaipur', 'Lucknow'),
        (99887, 'Humsafar Express', '12:10', '22:40', 'Ahmedabad', 'Bhopal'),
        (33445, 'Mahamana Express', '19:55', '04:30', 'Chennai', 'Varanasi'),
    ],
    columns=_SCHEDULE_COLUMNS,
)
sql_data.to_sql(
    "train_schedule_zone3",
    con=engine,
    if_exists="replace",
    index=False,
)

# Function to read CSV files
def read_csv(file_path):
    """Load a CSV source into a DataFrame using pandas' default settings.

    Args:
        file_path: Path (or any other source ``pandas.read_csv`` accepts)
            of the CSV data to load.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(file_path)
    return frame

# Function to read Excel files
def read_excel(file_path):
    """Load an Excel workbook into a DataFrame using pandas' default settings.

    Args:
        file_path: Path (or any other source ``pandas.read_excel`` accepts)
            of the workbook to load.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_excel(file_path)
    return frame

# Function to read SQL database
def read_sql(db_connection_string, table_name):
    """Load every row of a database table into a DataFrame.

    A fresh SQLAlchemy engine is created for the call and disposed again
    before returning, so no pooled connection (or SQLite file handle) is
    left open behind the caller's back.

    Args:
        db_connection_string: SQLAlchemy database URL, e.g.
            ``"sqlite:///train_schedule.db"``.
        table_name: Name of the table to read. NOTE: the name is
            interpolated directly into the SQL text, so it must come from
            a trusted source and never from user input.

    Returns:
        A ``pandas.DataFrame`` holding the result of
        ``SELECT * FROM <table_name>``.
    """
    engine = create_engine(db_connection_string)
    try:
        return pd.read_sql(f"SELECT * FROM {table_name}", con=engine)
    finally:
        # Release the engine's connection pool even when the query fails.
        engine.dispose()

# Helper: render one train number as a zero-padded string
def _format_train_number(value):
    """Return *value* as a string left-padded with zeros to five characters.

    Whole-number floats (which pandas produces when a numeric column once
    contained a missing value) are converted to int first, so 12345.0
    becomes '12345' rather than '12345.0'.
    """
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).zfill(5)


# Function to clean and standardize data
def clean_data(df):
    """Return a cleaned, standardized copy of a schedule DataFrame.

    Steps, in order:
      1. Column names are stripped, lower-cased and spaces become
         underscores.
      2. Exact duplicate rows are removed.
      3. Rows containing any missing value are removed.
      4. If a ``train_number`` column exists, its values are turned into
         strings zero-padded to five characters.

    The input frame is left untouched; all changes are made on a copy.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        A new ``pandas.DataFrame`` with the steps above applied. The
        surviving rows keep their original index labels.
    """
    # Work on a copy so the caller's frame is not renamed or truncated.
    cleaned = df.copy()
    cleaned.columns = (
        cleaned.columns.str.strip().str.lower().str.replace(" ", "_")
    )
    cleaned = cleaned.drop_duplicates().dropna()
    if 'train_number' in cleaned.columns:
        cleaned['train_number'] = cleaned['train_number'].map(
            _format_train_number
        )
    return cleaned

# Function to merge datasets
def merge_datasets(dfs):
    """Stack several DataFrames vertically into a single DataFrame.

    Args:
        dfs: The DataFrames to combine, in the order they should appear.

    Returns:
        One ``pandas.DataFrame`` containing all rows, with a fresh
        0..n-1 index replacing the inputs' own indexes.
    """
    combined = pd.concat(objs=dfs, ignore_index=True)
    return combined

# Extract + transform: read each zone's schedule from its own storage format
# and standardize it straight away.
csv_data = clean_data(read_csv("train_schedule_zone1.csv"))
excel_data = clean_data(read_excel("train_schedule_zone2.xlsx"))
sql_data = clean_data(
    read_sql("sqlite:///train_schedule.db", "train_schedule_zone3")
)

# Combine the three cleaned zone tables into one dataset
final_dataset = merge_datasets([csv_data, excel_data, sql_data])

# Load: write the unified schedule out as a single CSV file
final_dataset.to_csv("standardized_train_schedule.csv", index=False)

print("ETL pipeline executed successfully. Standardized train schedule saved.")

ETL pipeline executed successfully. Standardized train schedule saved.
