In [14]:
import pandas as pd
import os 
import sys

In [15]:
os.getcwd()

'd:\\Model_Deployement\\End_to_End_Phishing_mail_detection_ML_project'

In [4]:
os.chdir('../')

In [16]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class Data_preprocessing_config:
    root_dir: Path
    unzip_data_dir: Path
    vector_embed_model: Path
    cleaned_data: Path
    vectorized_data: Path


In [17]:
from Phising_mail_detection.constants import *
from Phising_mail_detection.utils.common import read_yaml,create_directories

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_preprocessing_config(self) -> Data_preprocessing_config:
        config = self.config.data_preprocessing
        # schema = self.schema.COLUMNS

        create_directories([config.root_dir])

        data_validation_config = Data_preprocessing_config(
            root_dir=config.root_dir,
            unzip_data_dir = config.unzip_data_dir,
            vector_embed_model= config.vector_embed_model,
            cleaned_data = config.cleaned_data,
            vectorized_data = config.vectorized_data
            
        )

        return data_validation_config


In [18]:
from Phising_mail_detection import logger
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from Phising_mail_detection.utils.common import load_model,save_model
import pandas as pd
from gensim.models import Word2Vec

ps=PorterStemmer()

class Data_preprocessing_Validation:
    def __init__(self, config: Data_preprocessing_config):
        self.config = config
    

    def clean(self,data):
        data = re.sub(r'[^a-zA-Z\s]', '', data)
        data = data.lower()
        stop_words = set(stopwords.words('english'))
        words = data.split()
        words = [word for word in words if word not in stop_words]
        words = [ps.stem(word) for word in words]
        return words  # returns a list of tokens
    
    # Function to get average vector for an email
    def get_email_vector(self,tokens, model):
        vectors = [model.wv[word] for word in tokens if word in model.wv]
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.zeros(model.vector_size)

    def clean_and_Vector_embed(self):
        data=pd.read_csv(self.config.unzip_data_dir)
        data['Email Text'].fillna('',inplace=True)
        # print(data.isna().sum())
        # print("First row",data['Email Text'][0])
        # sample=data['Email Text'][0]
        # out=self.clean(sample)
        # Apply the clean function on the email text feature 
        # if os.path.exists(self.config.cleaned_data):
        #     logger.info(f"Cleaned data already exists at {self.config.cleaned_data}")
        #     data = pd.read_csv(self.config.cleaned_data)
        # else:
        #     logger.info("Cleaned data file not exists")
        data['Email Type'].replace({'Safe Email':0,'Phishing Email':1},inplace=True)
        data['tokens'] = data['Email Text'].apply(self.clean)
        # data.to_csv(self.config.cleaned_data, index=False)
        print(data['tokens'].iloc[0])  # Show the first cleaned email text
        # apply the word2vec for embedding purpose
        # check the word_to_vec model exists other wise train the word2vec model and use it
       

        if os.path.exists(self.config.vector_embed_model):
            logger.info(f"Vector embedding model exists at {self.config.vector_embed_model}")
            model=load_model(Path(self.config.vector_embed_model))

        else:
            logger.info("Model not exists - training the word2vec model on the email dataset")
            w2v_model = Word2Vec(sentences=data['tokens'], vector_size=100, window=5, min_count=1, workers=4)
            save_model(w2v_model,self.config.vector_embed_model)
            model=load_model(Path(self.config.vector_embed_model))
        
        print(data['tokens'][0])
        example=data['tokens'][0]
        vec=self.get_email_vector(example,model)
        print("Vector is ",vec)

        # Apply the vectorization to each email
        data['Email_vector'] = data['tokens'].apply(lambda x: self.get_email_vector(x, model))
        data.to_csv(self.config.vectorized_data)
        # Show the first email's vector
        print(data['Email_vector'].iloc[0])
        


    


In [19]:
try:
    config= ConfigurationManager()
    config_path_data= config.get_data_preprocessing_config()
    Data_preprocessing_main=Data_preprocessing_Validation(config=config_path_data)
    Data_preprocessing_main.clean_and_Vector_embed()
except Exception as e:
    raise e

[2025-08-04 13:11:30,375:'INFO':common:yaml file: config\config.yaml loaded successfully]
[2025-08-04 13:11:30,382:'INFO':common:yaml file: params.yaml loaded successfully]
[2025-08-04 13:11:30,382:'INFO':common:yaml file: schema.yaml loaded successfully]
[2025-08-04 13:11:30,382:'INFO':common:created directory at: artifacts]
[2025-08-04 13:11:30,394:'INFO':common:created directory at: artifacts/data_preprocessed]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Email Text'].fillna('',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Email Type'].replace({'Safe Email':0,'Phishing Email':1},inplace=True)
  data['Email Type'].replace({'Safe Email':0,'Phishing Email':1},inplace=True)


['disc', 'uniformitarian', 'sex', 'lang', 'dick', 'hudson', 'observ', 'us', 'use', 'aughter', 'voc', 'thoughtprovok', 'sure', 'fair', 'attribut', 'son', 'treat', 'like', 'senior', 'rel', 'one', 'thing', 'nt', 'normal', 'use', 'brother', 'way', 'aughter', 'hard', 'imagin', 'natur', 'class', 'compris', 'senior', 'rel', 'exclud', 'brother', 'anoth', 'seem', 'differ', 'imagin', 'distinct', 'seem', 'senior', 'rel', 'term', 'use', 'wider', 'varieti', 'context', 'e', 'g', 'call', 'distanc', 'get', 'someon', 'attent', 'henc', 'begin', 'utter', 'wherea', 'seem', 'natur', 'utter', 'like', 'ye', 'son', 'hand', 'son', 'one', 'like', 'son', 'son', 'help', 'although', 'perhap', 'latter', 'one', 'complet', 'imposs', 'alexi', 'mr']
[2025-08-04 13:13:11,666:'INFO':3959936807:Vector embedding model exists at artifacts/data_preprocessed/word_vec_model.pkl]
['disc', 'uniformitarian', 'sex', 'lang', 'dick', 'hudson', 'observ', 'us', 'use', 'aughter', 'voc', 'thoughtprovok', 'sure', 'fair', 'attribut', 'son