## Data Splitting Trial-runs 

In [17]:
import os 

In [18]:
os.chdir(r"C:\Users\USER\Desktop\MLDefaults\Rising-Village-Prediction-Model")

## Trial-runs for entity_config file 

In [19]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataSplittingConfig:
    root_dir: Path
    processed_data_file: Path
    train_set_path: Path
    test_set_path: Path
    test_size: float
    random_state: int

## Trial-runs for configuration_file

In [20]:
#importing all project paths and modules necessary for project configurations 
from raisingVillage.constants import  *
from raisingVillage.utils.common import read_yaml, create_directories

In [21]:
#Updating the configuration file 
class ConfigurationManager:
    def __init__(
        self, 
        config_filepath = CONFIG_FILE_PATH, 
        params_filepath = PARAMS_FILE_PATH,
        selected_schema_filepath = SELECTED_SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.selected_schema = read_yaml(selected_schema_filepath)
       
        create_directories([self.config.artifacts_root])
    
    def get_data_splitting_config(self) -> DataSplittingConfig:
        config = self.config.data_splitting
        data_transformation_config = self.config.data_transformation
        params = self.params.data_splitting
        
        create_directories([config.root_dir])
        
        data_splitting_config = DataSplittingConfig(
            root_dir=config.root_dir,
            processed_data_file=data_transformation_config.processed_data_file, 
            train_set_path=config.train_set_path, 
            test_set_path=config.test_set_path, 
            test_size=float(params.test_size),
            random_state=int(params.random_state)    
        )
        return data_splitting_config

## Trial-runs for components 

In [22]:
import pandas as pd
import os
from pathlib import Path
from raisingVillage import logger
from sklearn.model_selection import train_test_split

In [23]:
class DataSplitting: 
    def __init__(self, config: DataSplittingConfig):
        self.config = config
        #You can add all the different transformation techniques needed before splitting the data. 
        
    def train_test_splitting(self):
        data = pd.read_csv(self.config.processed_data_file)
    #Split the data into training and test datasets 
        train, test = train_test_split(
            data,
            test_size=self.config.test_size,
            random_state=self.config.random_state 
            )
        train.to_csv(os.path.join(self.config.root_dir, "train_set.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test_set.csv"), index=False)
        
        logger.info("Splitted data into train and test sets")
        logger.info(f"Train set saved to {self.config.train_set_path}")
        logger.info(f"Test set saved to {self.config.test_set_path}")


## trial-runs for pipeline 

In [24]:
try:
    config = ConfigurationManager()
    data_splitting_config =config.get_data_splitting_config()
    data_splitting = DataSplitting(config=data_splitting_config)
    data_splitting.train_test_splitting()
except Exception as e:
    raise e

[2025-05-29 11:57:43,254: INFO: common: yaml_file: config\config.yaml loaded successfully]
[2025-05-29 11:57:43,262: INFO: common: yaml_file: params.yaml loaded successfully]
[2025-05-29 11:57:43,271: INFO: common: yaml_file: selected_schema.yaml loaded successfully]
[2025-05-29 11:57:43,275: INFO: common: Created directory at: artifacts]
[2025-05-29 11:57:43,279: INFO: common: Created directory at: artifacts/data_splitting]


[2025-05-29 11:57:43,335: INFO: 3128332124: Splitted data into train and test sets]
[2025-05-29 11:57:43,339: INFO: 3128332124: Train set saved to artifacts/data_splitting/train_set.csv]
[2025-05-29 11:57:43,343: INFO: 3128332124: Test set saved to artifacts/data_splitting/train_set.csv]
