In [17]:
import logging
import os
import pickle
import re
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from typing_extensions import Annotated


In [30]:
class PreProcessStrategy:
    """Text cleaning and TF-IDF feature extraction for the e-mail spam data.

    The methods read a ``text`` column (the e-mail body) and, for
    vectorization, a ``spam`` column (the label) from the given DataFrame.
    """

    # Where the fitted vectorizer is pickled unless the caller overrides it.
    DEFAULT_VECTORIZER_PATH='/opt/airflow/config/tfidvectorizer.pkl'

    # Matches a "subject:" prefix (and the whitespace after it) at the very
    # start of the text only. The remainder of that line is kept.
    _SUBJECT_PREFIX=re.compile(r'(?i)^subject:\s*')

    @staticmethod
    def _clean_text(text:str)->str:
        """Lower-case one e-mail and strip its leading ``subject:`` prefix."""
        return PreProcessStrategy._SUBJECT_PREFIX.sub('',text.lower())

    def clean_data(self,df:pd.DataFrame)->pd.DataFrame:
        """Return a copy of ``df`` whose ``text`` column has been normalised.

        Each text is lower-cased and a leading ``subject:`` prefix is removed.
        Missing values become empty strings and non-string values are
        converted with ``str`` so a single bad row cannot abort the run.

        Args:
            df: DataFrame containing a ``text`` column.

        Returns:
            A new DataFrame; the caller's ``df`` is left unmodified, so
            re-running a notebook cell always starts from the raw data.

        Raises:
            Exception: any error (e.g. ``KeyError`` when ``text`` is absent)
                is logged and re-raised unchanged.
        """
        try:
            cleaned_text=df['text'].fillna('').astype(str).map(self._clean_text)
            # assign() builds a new frame instead of writing into the input.
            return df.assign(text=cleaned_text)
        except Exception as e:
            logging.error(f'Errinng in cleaning the data from DataFrame {e}')
            raise

    # NOTE(review): the X_* values are what TfidfVectorizer returns; the
    # notebook output shows sparse matrices rather than np.ndarray. The
    # annotation is left as-is because scipy is not imported here.
    def vectorize_train_and_split(self,dataframe:pd.DataFrame,vectorizer_path:str=DEFAULT_VECTORIZER_PATH)->Tuple[
        Annotated[np.ndarray,"X_Train"],
        Annotated[np.ndarray,"X_Test"],
        Annotated[pd.Series,"Y_Train"],
        Annotated[pd.Series,"Y_Test"]
        ]:
        """Split into train/test sets, TF-IDF vectorize the text, and pickle the vectorizer.

        Args:
            dataframe: DataFrame with a ``text`` column (features) and a
                ``spam`` column (labels).
            vectorizer_path: File the fitted ``TfidfVectorizer`` is pickled
                to. Defaults to ``DEFAULT_VECTORIZER_PATH``.

        Returns:
            ``(X_Train, X_Test, Y_Train, Y_Test)`` - the two TF-IDF feature
            matrices followed by the matching label Series.

        Raises:
            Exception: any error (missing column, unwritable path, ...) is
                logged and re-raised unchanged.
        """
        try:
            X=dataframe['text']
            Y=dataframe['spam']
            # 80/20 split with a fixed seed so the split is reproducible.
            X_Train_text,X_Test_text,Y_Train,Y_Test=train_test_split(X,Y,test_size=0.2,random_state=2)
            # Unigrams and bigrams, capped at 5000 features. The vectorizer is
            # fitted on the training split only and merely applied to the test split.
            tfid=TfidfVectorizer(max_features=5000,ngram_range=(1,2))
            X_Train=tfid.fit_transform(X_Train_text)
            X_Test=tfid.transform(X_Test_text)
            with open(vectorizer_path,'wb') as f:
                pickle.dump(tfid,f)
            print('dumped the data successfully in config folder')
            return X_Train,X_Test,Y_Train,Y_Test
        except Exception as e:
            logging.error(f'Error in vectorizing and splitting the data from DataFrame {e}')
            raise

    def clense_production_data(self,text:str):
        """Placeholder for cleaning a single production text; not implemented.

        Currently does nothing and returns ``None``.
        TODO(review): presumably this should apply the same cleaning as
        ``clean_data`` to one string - confirm the intended behaviour.
        """
        pass

In [26]:
# Location of the raw e-mail dataset. The default is the original author's
# local path; set the EMAILS_CSV_PATH environment variable to run the
# notebook on another machine without editing this cell.
EMAILS_CSV_PATH=os.environ.get(
    'EMAILS_CSV_PATH',
    '/Users/rishilboddula/Desktop/MLOPS/E-mail-Spam-MLOPS-/dataset/emails.csv',
)
df=pd.read_csv(EMAILS_CSV_PATH)
pre_process_strategy=PreProcessStrategy()
# Normalise the `text` column (lower-case, strip the leading "subject:" prefix).
cleaning_data=pre_process_strategy.clean_data(df)

In [27]:
# Split the cleaned data into train/test sets and TF-IDF vectorize the text.
# This call also pickles the fitted vectorizer to disk as a side effect.
vector=pre_process_strategy.vectorize_train_and_split(cleaning_data)

In [28]:
# Display the returned tuple: (X_Train, X_Test, Y_Train, Y_Test).
vector

(<4582x5000 sparse matrix of type '<class 'numpy.float64'>'
 	with 669997 stored elements in Compressed Sparse Row format>,
 <1146x5000 sparse matrix of type '<class 'numpy.float64'>'
 	with 169542 stored elements in Compressed Sparse Row format>,
 5274    0
 4022    0
 2223    0
 1205    1
 1667    0
        ..
 1099    1
 2514    0
 3606    0
 5704    0
 2575    0
 Name: spam, Length: 4582, dtype: int64,
 2246    0
 1252    1
 1177    1
 3772    0
 938     1
        ..
 974     1
 5711    0
 2522    0
 1939    0
 2213    0
 Name: spam, Length: 1146, dtype: int64)