In [1]:
import os
# Step one directory up from the notebook's location; presumably this makes the
# project root the working directory so the `src.` imports and the relative
# `config/...` / `artifacts/...` paths used in later cells resolve.
# NOTE(review): not idempotent - re-running this cell moves up one more level.
os.chdir("../")

In [2]:
from pydantic import BaseModel
from pathlib import Path

class DataIngestionConfig(BaseModel):
    """Validated settings for the data-ingestion stage.

    Instances are built by ``ConfigurationManager.get_data_ingestion_config``
    from the ``data_ingestion`` section of the loaded config file.
    """
    ## config
    ingestion_dir: Path  # directory created for this stage before the model is built
    collection_name: str  # presumably the MongoDB collection to read - confirm against DataIngestion
    database_name: str  # presumably the MongoDB database name - confirm against DataIngestion
    file_name: str  # presumably the name of the CSV written by the stage - confirm against DataIngestion

In [3]:
from src.NetworkSecurity.constants import *
from src.NetworkSecurity.utils.common import read_yaml,create_directories

## reads the config, params and schema YAML files (config/config.yaml, params.yaml, schema.yaml)
class ConfigurationManager:
    """Load the project's YAML files and hand out the ingestion-stage config.

    Constructing the manager parses the config, params and schema files and
    makes sure the artifacts root directory exists.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Every stage writes below this directory, so create it once up front.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the validated settings object for the ingestion stage.

        Also creates the stage's ``ingestion_dir`` directory.

        Returns:
            A ``DataIngestionConfig`` filled from the ``data_ingestion``
            section of the config file.
        """
        ingestion_section = self.config.data_ingestion

        # The stage's output directory must exist before anything is written to it.
        create_directories([ingestion_section.ingestion_dir])

        # Pydantic validates/coerces the raw YAML values on construction.
        return DataIngestionConfig(
            ingestion_dir=ingestion_section.ingestion_dir,
            collection_name=ingestion_section.collection_name,
            database_name=ingestion_section.database_name,
            file_name=ingestion_section.file_name,
        )

In [5]:
# Instantiate the manager (this loads the YAML files and creates the artifacts
# root) and derive the ingestion-stage settings used by the next cell.
cm = ConfigurationManager()
data_ingestion_config = cm.get_data_ingestion_config()

[2025-12-09 12:38:04,602: INFO: common : Yaml File: config/config.yaml loaded successfully]
[2025-12-09 12:38:04,604: INFO: common : Yaml File: params.yaml loaded successfully]
[2025-12-09 12:38:04,613: INFO: common : Yaml File: schema.yaml loaded successfully]
[2025-12-09 12:38:04,614: INFO: common : created directory at: artifacts]
[2025-12-09 12:38:04,615: INFO: common : created directory at: artifacts/data_ingestion]


In [6]:
from src.NetworkSecurity.components.data_ingestion import DataIngestion

# Run the ingestion component with the settings built above. Per the captured
# log it fetches the MongoDB collection and saves it as a CSV; the value
# returned by download_file() (the saved file's path) is shown as the cell result.
di = DataIngestion(data_ingestion_config)
di.download_file()

[2025-12-09 12:38:10,427: INFO: data_ingestion : Fetching data from MongoDB collection: NetworkData]
[2025-12-09 12:38:18,382: INFO: data_ingestion : ✅ Data successfully downloaded and saved to artifacts/data_ingestion/datafromDB.csv]


'artifacts/data_ingestion/datafromDB.csv'

In [7]:
import pandas as pd
# Reload the CSV produced by the ingestion step and list the dtypes pandas
# inferred for each column as a quick sanity check.
# NOTE(review): the path is hard-coded; it matches the path returned by
# di.download_file() in the previous cell.
data = pd.read_csv("artifacts/data_ingestion/datafromDB.csv")
print(data.dtypes)

NumDots                               float64
SubdomainLevel                        float64
PathLevel                             float64
UrlLength                             float64
NumDash                               float64
NumDashInHostname                     float64
AtSymbol                              float64
TildeSymbol                           float64
NumUnderscore                         float64
NumPercent                            float64
NumQueryComponents                    float64
NumAmpersand                          float64
NumHash                               float64
NumNumericChars                       float64
NoHttps                               float64
RandomString                          float64
IpAddress                             float64
DomainInSubdomains                    float64
DomainInPaths                         float64
HttpsInHostname                       float64
HostnameLength                        float64
PathLength                        

In [8]:
from pathlib import Path
from pydantic import BaseModel

class DataValidationConfig(BaseModel):
    """Validated settings for the data-validation stage.

    Instances are built by ``ConfigurationManager.get_data_validation_config``.
    """

    root_dir: Path  # stage directory, created before the model is built
    ingestion_file: str  # CSV file whose columns are checked
    STATUS_FILE: Path  # file the validation result is written to
    all_schema: dict  # expected columns, taken from the schema file's COLUMNS section

In [11]:
from src.NetworkSecurity.constants import *
from src.NetworkSecurity.utils.common import read_yaml,create_directories

class ConfigurationManager:
    """Load the project's YAML files and hand out the validation-stage config.

    NOTE: this definition replaces any ``ConfigurationManager`` defined by an
    earlier cell in the same kernel; only the getter below is available on
    instances created afterwards.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Every stage writes below this directory, so create it once up front.
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the validated settings object for the validation stage.

        Also creates the stage's ``root_dir`` directory.

        Returns:
            A ``DataValidationConfig`` combining the ``data_validation``
            section of the config file with the schema's ``COLUMNS`` section.
        """
        validation_section = self.config.data_validation
        expected_columns = self.schema.COLUMNS

        # The status file is written inside this directory later on.
        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            ingestion_file=validation_section.ingestion_file,
            STATUS_FILE=validation_section.STATUS_FILE,
            all_schema=expected_columns,
        )

In [12]:
# Re-create the manager from the class defined in the previous cell (this
# reloads the YAML files) and derive the validation-stage settings.
cm = ConfigurationManager()
data_validation_config = cm.get_data_validation_config()

[2025-12-09 12:42:32,412: INFO: common : Yaml File: config/config.yaml loaded successfully]
[2025-12-09 12:42:32,413: INFO: common : Yaml File: params.yaml loaded successfully]
[2025-12-09 12:42:32,417: INFO: common : Yaml File: schema.yaml loaded successfully]
[2025-12-09 12:42:32,418: INFO: common : created directory at: artifacts]
[2025-12-09 12:42:32,419: INFO: common : created directory at: artifacts/data_validation]


In [13]:
from src.NetworkSecurity.logging.logger import logger

class DataValiadtion:
    """Check that the ingested CSV has exactly the columns named in the schema.

    The class name keeps its original spelling because existing callers
    reference it as ``DataValiadtion``.
    """

    # The annotation is a string (forward reference) so defining this class does
    # not require the config model to be defined first.
    def __init__(self, config: "DataValidationConfig"):
        """Store the validation settings.

        Args:
            config: Object providing
                ``ingestion_file`` - path of the CSV to check,
                ``STATUS_FILE``    - path the report is written to,
                ``all_schema``     - mapping whose keys are the expected
                                     column names.
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Compare the CSV's column names with the schema and write a report.

        The status file always receives a ``Validation status: <bool>`` line,
        followed by a ``Missing Columns`` and/or ``Extra Columns`` line when
        the two column sets differ. Column names are listed in sorted order so
        the report is identical from run to run.

        Returns:
            True when the CSV's columns and the schema's keys are the same
            set, False otherwise.

        Raises:
            Any exception raised while reading the CSV or writing the status
            file propagates unchanged.
        """
        # Imported locally so the method does not rely on some other cell
        # having already bound ``pd`` in the kernel.
        import pandas as pd

        data = pd.read_csv(self.config.ingestion_file)

        actual_cols = set(data.columns)
        expected_cols = set(self.config.all_schema.keys())

        # key=str keeps sorting safe even if a schema key is not a string.
        missing_cols = sorted(expected_cols - actual_cols, key=str)
        extra_cols = sorted(actual_cols - expected_cols, key=str)

        # Valid only when nothing is missing and nothing is unexpected.
        validation_status = not (missing_cols or extra_cols)

        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}\n")
            if missing_cols:
                f.write(f"Missing Columns: {missing_cols}\n")
            if extra_cols:
                f.write(f"Extra Columns: {extra_cols}\n")

        return validation_status


In [14]:
# Run the column check with the settings built above; the boolean result is
# shown as the cell output and a report is written to the configured STATUS_FILE.
dv = DataValiadtion(data_validation_config)
dv.validate_all_columns()

True

In [2]:
from pathlib import Path
from pydantic import BaseModel

class DataTransformationConfig(BaseModel):
    """Validated settings for the data-transformation (train/test split) stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``.
    """

    ## values taken from the config file's data_transformation section
    root_dir: Path  # directory that receives train.csv and test.csv
    data_path: Path  # CSV file that is read and split
    ## values taken from the params file's data_transformation section
    test_size: float  # forwarded to train_test_split(test_size=...)
    random_state: int  # forwarded to train_test_split(random_state=...)


In [18]:
from src.NetworkSecurity.constants import *
from src.NetworkSecurity.utils.common import read_yaml,create_directories

class ConfigurationManager:
    """Load the project's YAML files and hand out the transformation-stage config.

    NOTE: this definition replaces any ``ConfigurationManager`` defined by an
    earlier cell in the same kernel; only the getter below is available on
    instances created afterwards.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Every stage writes below this directory, so create it once up front.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the validated settings object for the transformation stage.

        Also creates the stage's ``root_dir`` directory.

        Returns:
            A ``DataTransformationConfig`` whose paths come from the config
            file and whose split parameters come from the params file.
        """
        stage_paths = self.config.data_transformation
        split_params = self.params.data_transformation

        # train.csv / test.csv are written into this directory later on.
        create_directories([stage_paths.root_dir])

        return DataTransformationConfig(
            root_dir=stage_paths.root_dir,
            data_path=stage_paths.data_path,
            test_size=split_params.test_size,
            random_state=split_params.random_state,
        )

In [19]:
# Re-create the manager from the class defined in the previous cell (this
# reloads the YAML files) and derive the transformation-stage settings.
cm = ConfigurationManager()
data_transformation_config = cm.get_data_transformation_config()

[2025-12-09 20:27:39,744: INFO: common : Yaml File: config/config.yaml loaded successfully]
[2025-12-09 20:27:39,745: INFO: common : Yaml File: params.yaml loaded successfully]
[2025-12-09 20:27:39,750: INFO: common : Yaml File: schema.yaml loaded successfully]
[2025-12-09 20:27:39,751: INFO: common : created directory at: artifacts]
[2025-12-09 20:27:39,752: INFO: common : created directory at: artifacts/data_transformation]


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.NetworkSecurity.logging.logger import logger

# Data transformation stage: currently only the train/test split; any EDA-driven preprocessing belongs here
class DataTransformation:
    """Split the configured dataset into train and test CSV files."""

    def __init__(self, config: DataTransformationConfig):
        """Keep a reference to the stage settings.

        Args:
            config: Supplies ``data_path``, ``root_dir``, ``test_size`` and
                ``random_state``.
        """
        self.config = config

    def train_test_splitting(self):
        """Read the dataset, split it, and persist both partitions.

        Writes ``train.csv`` and ``test.csv`` (without the index column) into
        ``config.root_dir``, then logs and prints the shape of each partition.
        Returns nothing.
        """
        settings = self.config
        full_frame = pd.read_csv(settings.data_path)

        train_part, test_part = train_test_split(
            full_frame,
            test_size=settings.test_size,
            random_state=settings.random_state,
        )

        # Persist train first, then test, each under its fixed file name.
        for file_name, part in (("train.csv", train_part), ("test.csv", test_part)):
            part.to_csv(os.path.join(settings.root_dir, file_name), index=False)

        logger.info("Splitted data into training and test sets")
        # Shapes go to the log first, then to stdout, train before test.
        for part in (train_part, test_part):
            logger.info(part.shape)
        for part in (train_part, test_part):
            print(part.shape)


In [21]:
# Run the split with the settings built above; train.csv and test.csv are
# written to the stage directory and the partition shapes are logged/printed.
dt = DataTransformation(data_transformation_config)
dt.train_test_splitting()

[2025-12-09 20:28:41,042: INFO: 2371550360 : Splitted data into training and test sets]
[2025-12-09 20:28:41,043: INFO: 2371550360 : (8000, 49)]
[2025-12-09 20:28:41,043: INFO: 2371550360 : (2000, 49)]
(8000, 49)
(2000, 49)
