**Extract (Data Ingestion)**

In [1]:
import pandas as pd

def load_dataset(data):
  file_extension = data.split('.')[-1].lower()

  if file_extension == 'csv':
    df = pd.read_csv(data)
  elif file_extension in['xls', 'xlsx']:
    df = pd.read_excel(data)
  elif file_extension == 'json':
    df = pd.read_json(data)
  else :
    raise ValueError(f"Unsupported file extension")

  return df



**Transform( Data Preprocessing & Feature Engineering)**

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA


def preprocess_data(df):
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    # Initial Data Validation
    print(f"Initial data shape: {df.shape}")
    print(f"Numerical columns: {numerical_cols}")
    print(f"Categorical columns: {categorical_cols}")

    # Handle missing values
    imputer = SimpleImputer(strategy='mean')
    df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

    # Validate after imputation
    print("Missing values after imputation:", df.isnull().sum().sum())

    # Scale features of numerical columns
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # Encode categorical features
    df = pd.get_dummies(df, columns=categorical_cols)

    # Validate after preprocessing
    print(f"Data shape after preprocessing: {df.shape}")
    return df

def feature_engineering(df, target_column=None):
    # Separate input and target variables
    if target_column:
        X = df.drop(columns=[target_column])
        y = df[target_column]
    else:
        X = df
        y = None

    # Adding polynomial features
    poly_features = X ** 2
    poly_features.columns = [f"{col}^2" for col in X.columns]
    df = pd.concat([df, poly_features], axis=1)

    # Validate after adding polynomial features
    print(f"Number of features after polynomial expansion: {df.shape[1]}")

    # Dimensionality Reduction using PCA
    if X.shape[1] > 1:  # PCA requires more than one feature
        pca = PCA(n_components=2)
        pca_components = pca.fit_transform(X)
        df['PCA1'] = pca_components[:, 0]
        df['PCA2'] = pca_components[:, 1]
        print(f"Explained variance ratio by PCA: {pca.explained_variance_ratio_}")
    else:
        print("Not enough features for PCA.")

    # Validate after feature engineering
    print(f"Data shape after feature engineering: {df.shape}")
    return df


**Load Transformed Data**

In [3]:
import pandas as pd

def save_data(df, output):
    file_extension = output.split('.')[-1].lower()

    if file_extension == 'csv':
        df.to_csv(output, index=False)
    elif file_extension in ['xls', 'xlsx']:
        df.to_excel(output, index=False)
    elif file_extension == 'json':
        df.to_json(output, orient='records')
    else:
        raise ValueError(f"Unsupported file extension: {file_extension}")

    print(f"Data saved successfully to {output}")



**Orchestrate ETL Pipeline**

In [4]:
def etl_pipeline(input_data, output_data):

  df = load_dataset(input_data)
  df = preprocess_data(df)
  df = feature_engineering(df)
  save_data(df, output_data)
  print("ETL pipeline completed successfully")

**Usage**

In [5]:

input_data = "/content/cleaned_star_data.csv"
output_data = "/content/test_dataset1.csv"

etl_pipeline(input_data, output_data)


Initial data shape: (240, 7)
Numerical columns: Index(['Star type'], dtype='object')
Categorical columns: Index(['Temperature (K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)',
       'Absolute magnitude(Mv)', 'Star color', 'Spectral Class'],
      dtype='object')
Missing values after imputation: 6
Data shape after preprocessing: (240, 880)
Number of features after polynomial expansion: 1760
Explained variance ratio by PCA: [0.19562405 0.07167648]
Data shape after feature engineering: (240, 1762)
Data saved successfully to /content/test_dataset1.csv
ETL pipeline completed successfully


**Input Dataset**

In [6]:
import pandas as pd
import numpy as np

df = pd.read_csv("/content/cleaned_star_data.csv")
df.head()

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,,,,,,,
1,3042.0,0.0005,0.1542,16.6,0.0,Red,M
2,2600.0,0.0003,0.102,18.7,0.0,Red,M
3,2800.0,0.0002,,16.65,0.0,Red,M
4,1939.0,0.000138,0.103,20.06,0.0,Red,M


In [7]:
df.isnull().sum()

Unnamed: 0,0
Temperature (K),1
Luminosity(L/Lo),1
Radius(R/Ro),1
Absolute magnitude(Mv),1
Star type,1
Star color,1
Spectral Class,1


**Output Dataset**

In [8]:
import pandas as pd
import numpy as np

df = pd.read_csv("/content/test_dataset1.csv")
df.head()

Unnamed: 0,Star type,Temperature (K)_,Temperature (K)_10012,Temperature (K)_10574,Temperature (K)_10930,Temperature (K)_10980,Temperature (K)_11000,Temperature (K)_11096,Temperature (K)_11250,Temperature (K)_11567,...,Spectral Class_ ^2,Spectral Class_A^2,Spectral Class_B^2,Spectral Class_F^2,Spectral Class_G^2,Spectral Class_K^2,Spectral Class_M^2,Spectral Class_O^2,PCA1,PCA2
0,0.0,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0.181036,-0.308857
1,-1.476609,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,1,0,-1.705404,0.011683
2,-1.476609,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,1,0,-1.711066,0.017356
3,-1.476609,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,1,0,-1.699906,0.01282
4,-1.476609,False,False,False,False,False,False,False,False,False,...,0,0,0,0,0,0,1,0,-1.705245,0.017165


In [9]:
print(df.isnull().sum())


Star type                0
Temperature (K)_         0
Temperature (K)_10012    0
Temperature (K)_10574    0
Temperature (K)_10930    0
                        ..
Spectral Class_K^2       0
Spectral Class_M^2       0
Spectral Class_O^2       0
PCA1                     0
PCA2                     0
Length: 1762, dtype: int64
