In [1]:
import os

In [2]:
%pwd

'e:\\Projects for portfolio\\Exoplanet Chatbot\\research'

In [3]:
#Since I want to get into the root directory i.e Text Summarization using Hugging Face API
os.chdir("../")

In [None]:
%pwd

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig: # defined for the config components present in artifacts for data transformation
    root_dir : Path 
    data_path : Path
    data_path_transformed : Path

In [5]:
# Configuration manager
from exoplanet_chatbot.constants import *
from exoplanet_chatbot.utils.common import read_yaml,create_directories

class ConfigurationManager:
    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH):
    # Here we are reading the yaml file and we can now use the file paths present inside pararms and config.yaml        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root]) # Here we are calling the artifacts_root key values using '.' , which was the purpose of @ensure_annotations

    def get_data_transformation_config(self) -> DataTransformationConfig: # Here we are using the entity to specify the return type classes to make sure proper output is returned
        config= self.config.data_transformation # Calling the data_validation dictionary created in config.yaml file

        create_directories([config.root_dir]) # Creating a directory using the root directory

        data_transformation_config = DataTransformationConfig( # Extracting the values from the config.yaml to here inside data_ingestion_config
            root_dir=config.root_dir,
            data_path=config.data_path,
            data_path_transformed=config.data_path_transformed
        )

        return data_transformation_config

In [6]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras
import tensorflow as tf
from exoplanet_chatbot.logging import logger

In [10]:
class DataTransformation:
    def __init__(self,config:DataTransformationConfig): # It will take the configuration from DataIngestionConfig defined earlier , which will in turn use Configuration Manager to take data from config.yaml
        self.config = config
    
    def feature_selection(self,data):

        columns_to_include = ['Planet Name','Host Name','Number of Stars','Number of Planets','Number of Moons','Circumbinary Flag','Discovery Method', 'Discovery Year',
       'Discovery Publication Date','Discovery Facilty','Discovery Telescope','Planet Radius [Earth Radius]',
       'Planet Mass [Earth Mass]','Planet Density [g/cm**3]','Equilibrium Temperature [K]','Orbit Semi-Major Axis [au]',
       'Radial Velocity Amplitude [m/s]','Stellar Effective Temperature [K]','Stellar Radius [Solar Radius]','Stellar Mass [Solar mass]']
        
        return data[columns_to_include]

    def feature_engineering(self,data):

        data.rename(columns={'Circumbinary Flag' : 'Binary System'},inplace=True)
        data['Binary System'] = data['Binary System'].map({0 : 'Binary System', 1 : 'Not Binary System'})
        return data
    
    def null_value_handling(self,data):

        data.fillna("Not found Yet!!",inplace=True)
        return data

    def context_generator(self,row):

        context = (
        f"The exoplanet {row['Planet Name']} orbits the host star {row['Host Name']}. "
        f"The system containing the exoplanet {row['Planet Name']} also contains {row['Number of Stars']} stars, {row['Number of Planets']} planets, and {row['Number of Moons']} moons. "
        f"The exoplanet lies in a {row['Binary System']}. "
        f"The planet was discovered using the {row['Discovery Method']} method in {row['Discovery Year']}. "
        f"The discovery was published on {row['Discovery Publication Date']} and facilitated by {row['Discovery Facilty']} using the {row['Discovery Telescope']}. "
        f"The planet has a radius of {row['Planet Radius [Earth Radius]']} Earth radii, a mass of {row['Planet Mass [Earth Mass]']} Earth masses, "
        f"and a density of {row['Planet Density [g/cm**3]']} g/cm³. "
        f"The equilibrium temperature is {row['Equilibrium Temperature [K]']} K. "
        f"The semi-major axis of its orbit is {row['Orbit Semi-Major Axis [au]']} AU. "
        f"The radial velocity amplitude is {row['Radial Velocity Amplitude [m/s]']} m/s. "
        f"The host star has an effective temperature of {row['Stellar Effective Temperature [K]']} K, "
        f"a radius of {row['Stellar Radius [Solar Radius]']} solar radii, and a mass of {row['Stellar Mass [Solar mass]']} solar masses."
    )
        return context
    
    def instruction_pair_generator(self,data):

        instruction_context_response_pairs = []

        for index, row in data.iterrows():
            planet_name = row['Planet Name']
            context = row['Context']
            features = {
                'Number of Stars': row['Number of Stars'],
                'Number of Planets': row['Number of Planets'],
                'Number of Moons': row['Number of Moons'],
                'Binary System': row['Binary System'],
                'Discovery Method': row['Discovery Method'],
                'Discovery Year': row['Discovery Year'],
                'Discovery Publication Date': row['Discovery Publication Date'],
                'Discovery Facility': row['Discovery Facilty'],
                'Discovery Telescope': row['Discovery Telescope'],
                'Planet Radius': row['Planet Radius [Earth Radius]'],
                'Planet Mass': row['Planet Mass [Earth Mass]'],
                'Planet Density': row['Planet Density [g/cm**3]'],
                'Equilibrium Temperature': row['Equilibrium Temperature [K]'],
                'Orbit Semi-Major Axis': row['Orbit Semi-Major Axis [au]'],
                'Radial Velocity Amplitude': row['Radial Velocity Amplitude [m/s]'],
                'Stellar Effective Temperature': row['Stellar Effective Temperature [K]'],
                'Stellar Radius': row['Stellar Radius [Solar Radius]'],
                'Stellar Mass': row['Stellar Mass [Solar mass]']
            }

            for feature, value in features.items():
                instruction = f"What is the {feature.lower().replace('_', ' ')} of {planet_name}?"
                response = f"The {feature.lower().replace('_', ' ')} of {planet_name} is {value}."
                instruction_context_response_pairs.append({
                    "instruction": instruction,
                    "input": context,
                    "output": response
                })
        
        return pd.DataFrame(instruction_context_response_pairs)

    def transform(self):

        # Reading the data
        data = pd.read_csv(self.config.data_path)

        # Feature selection
        feature_selected_data = self.feature_selection(data)

        # Null value handling
        null_value_handled_data = self.null_value_handling(feature_selected_data)

        # Feature Engineering
        feature_engineered_data = self.feature_engineering(null_value_handled_data)

        # Context generation
        feature_engineered_data['Context'] = feature_engineered_data.apply(self.context_generator, axis=1)

        # Instruction pair generation
        instruction_context_response_pairs = self.instruction_pair_generator(feature_engineered_data)

        # Saving the data
        instruction_context_response_pairs.to_csv(self.config.data_path_transformed, index=False)


In [11]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config() # Storing the configuration
    data_transformation = DataTransformation(config=data_transformation_config) # Using the configuration saved earlier to call model_transformation
    data_transformation.transform()
except Exception as e:
    raise e

[2024-06-16 00:02:44,769: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-16 00:02:44,770: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-16 00:02:44,772: INFO: common: created directory at: artifacts]
[2024-06-16 00:02:44,772: INFO: common: created directory at: artifacts/data_transformation]


  data.fillna("Not found Yet!!",inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna("Not found Yet!!",inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns={'Circumbinary Flag' : 'Binary System'},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Binary System'] = data['Binary System'].map({0 : 'Binary System', 1 : 'Not Binary System'})
A value is trying to be set on a copy of a slice from a D