# In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

# Train a spam/ham text classifier (TF-IDF features + logistic regression)
# on mail_data.csv and persist the fitted model and vectorizer with joblib.
df = pd.read_csv('mail_data.csv')

# Replace every missing cell with an empty string so the vectorizer never
# receives NaN in place of a message.
data = df.where(pd.notnull(df), '')

# Encode the labels: ham -> 0, spam -> 1.  Series.map() yields NaN for any
# value that is not a key of the mapping (including the '' that now stands in
# for a missing label), so validate here and fail with an actionable message
# instead of letting the integer cast fail later with an opaque NaN error.
label_map = {'ham': 0, 'spam': 1}
encoded_labels = data['Category'].map(label_map)
unmapped = encoded_labels.isna()
if unmapped.any():
    bad_values = sorted({repr(value) for value in data.loc[unmapped, 'Category']})
    raise ValueError(
        f"{int(unmapped.sum())} row(s) in 'Category' have a label other than "
        f"{sorted(label_map)}: {', '.join(bad_values)}"
    )
# Every label is known at this point, so the cast cannot hit NaN; doing it
# once here means the train/test label splits are already integer typed.
data['Category'] = encoded_labels.astype('int')

X = data['Message']
Y = data['Category']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# (and therefore the reported accuracies) reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=3
)

# Fit the TF-IDF vocabulary on the training messages only, then reuse that
# fitted vocabulary to transform the held-out messages.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

model = LogisticRegression()
model.fit(X_train_features, Y_train)

# Accuracy on the data the model was fitted on.
train_predictions = model.predict(X_train_features)
train_accuracy = accuracy_score(Y_train, train_predictions)
print("Training Accuracy:", train_accuracy)

# Accuracy on the held-out rows.
test_predictions = model.predict(X_test_features)
test_accuracy = accuracy_score(Y_test, test_predictions)
print("Testing Accuracy:", test_accuracy)

# Persist both artifacts: predicting on new text needs the fitted vectorizer
# as well as the classifier.
joblib.dump(model, 'model.joblib')
joblib.dump(feature_extraction, 'vectorizer.joblib')

print("Model and Vectorizer saved successfully!")

# Output:
# Training Accuracy: 0.9676912721561588
# Testing Accuracy: 0.9668161434977578
# Model and Vectorizer saved successfully!
