# In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
import joblib
import os

def preprocess_data():
    """Prepare the SMS spam dataset for training and persist the artifacts.

    Reads ``SMSSpamCollection.csv`` from the current working directory as a
    tab-separated file without a header row (columns are named ``label`` and
    ``message``), strips punctuation from the messages, integer-encodes the
    labels, splits the rows 80/20 into train and test sets, converts the
    messages to TF-IDF features and oversamples the training set with SMOTE.

    Files written (directories are created if missing):
        * ``data/X_train.csv``, ``data/X_test.csv`` -- dense TF-IDF features.
        * ``data/y_train.csv``, ``data/y_test.csv`` -- encoded labels.
        * ``models/tfidf_vectorizer.pkl`` -- the fitted ``TfidfVectorizer``.
        * ``models/label_encoder.pkl`` -- the fitted ``LabelEncoder``.

    Returns:
        None. All results are written to disk.
    """
    # Load data
    df = pd.read_csv('SMSSpamCollection.csv', sep='\t', names=['label', 'message'])

    # Remove punctuation. ``regex=True`` must be explicit: pandas >= 2.0
    # treats the pattern as a literal string by default, which would leave
    # the messages untouched. The raw string avoids invalid-escape warnings.
    df['message'] = df['message'].str.replace(r'[^\w\s]', '', regex=True)

    # Encode target labels
    le = LabelEncoder()
    y = le.fit_transform(df['label'])

    # Split the raw text *before* vectorizing so that the TF-IDF vocabulary
    # and IDF weights are learned from the training rows only (fitting on the
    # full dataset would leak test-set statistics into the features).
    messages_train, messages_test, y_train, y_test = train_test_split(
        df['message'], y, test_size=0.2, random_state=42
    )

    # Fit the vectorizer on the training messages, then reuse it for the test
    # messages. Dense arrays are needed for the CSV export below.
    tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
    X_train = tfidf.fit_transform(messages_train).toarray()
    X_test = tfidf.transform(messages_test).toarray()

    # Oversample the minority class in the training set only. The fixed seed
    # keeps the synthetic samples reproducible, matching the seeded split.
    oversampler = SMOTE(random_state=42)
    X_train, y_train = oversampler.fit_resample(X_train, y_train)

    # Ensure data folder exists
    os.makedirs('data', exist_ok=True)

    # Save preprocessed data
    pd.DataFrame(X_train).to_csv('data/X_train.csv', index=False)
    pd.DataFrame(X_test).to_csv('data/X_test.csv', index=False)
    pd.DataFrame(y_train).to_csv('data/y_train.csv', index=False)
    pd.DataFrame(y_test).to_csv('data/y_test.csv', index=False)

    # Ensure models folder exists
    os.makedirs('models', exist_ok=True)

    # Save the tfidf vectorizer and label encoder
    joblib.dump(tfidf, 'models/tfidf_vectorizer.pkl')
    joblib.dump(le, 'models/label_encoder.pkl')

# Run the preprocessing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    preprocess_data()
