In [3]:
import pandas as pd

# Load the training data
train_df = pd.read_csv('train.csv')

# Display the first few rows of the training dataset and summary information
train_df_info = train_df.info()
train_df_head = train_df.head()

train_df_info, train_df_head


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   text          4009 non-null   object
 1   spam          4009 non-null   int64 
 2   cleaned_text  4009 non-null   object
dtypes: int64(1), object(2)
memory usage: 94.1+ KB


(None,
                                                 text  spam  \
 0  Subject: volatility curves - linked from reute...     0   
 1  Subject: organizational announcement  fyi .  -...     0   
 2  Subject: lng meeting  hello all :  the lng mee...     0   
 3  Subject: fwd : re : optical network engineerin...     0   
 4  Subject: re : argentina modelling  michael ,  ...     0   
 
                                         cleaned_text  
 0  subject volatility curves  linked from reuters...  
 1  subject organizational announcement  fyi      ...  
 2  subject lng meeting  hello all   the lng meeti...  
 3  subject fwd  re  optical network engineering  ...  
 4  subject re  argentina modelling  michael   wha...  )

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the data
train_df = pd.read_csv('train.csv')
validation_df = pd.read_csv('validation.csv')
test_df = pd.read_csv('test.csv')

# Vectorize the text data
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_df['cleaned_text'])
y_train = train_df['spam']
X_validation = tfidf_vectorizer.transform(validation_df['cleaned_text'])
y_validation = validation_df['spam']
X_test = tfidf_vectorizer.transform(test_df['cleaned_text'])
y_test = test_df['spam']

# Fit the initial model (Logistic Regression) on the training data
model = LogisticRegression()
model.fit(X_train, y_train)

# Score and evaluate the model on training and validation data
for dataset_name, X, y in [("Training", X_train, y_train), ("Validation", X_validation, y_validation)]:
    predictions = model.predict(X)
    accuracy = accuracy_score(y, predictions)
    report = classification_report(y, predictions)
    print(f"{dataset_name} Data - Accuracy: {accuracy}\n{report}\n")

# Fine-tuning could be done here based on validation performance

# Benchmark models
models = {
    "Naive Bayes": MultinomialNB(),
    "SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier()
}

# Score benchmark models on test data
best_model_name = None
best_accuracy = 0

for model_name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_name} - Test Data Accuracy: {accuracy}")
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_name = model_name

print(f"\nBest Model: {best_model_name} with accuracy {best_accuracy}")


Training Data - Accuracy: 0.9957595410326765
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3082
           1       1.00      0.98      0.99       927

    accuracy                           1.00      4009
   macro avg       1.00      0.99      0.99      4009
weighted avg       1.00      1.00      1.00      4009


Validation Data - Accuracy: 0.9674039580908033
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       645
           1       0.99      0.87      0.93       214

    accuracy                           0.97       859
   macro avg       0.98      0.94      0.95       859
weighted avg       0.97      0.97      0.97       859


Naive Bayes - Test Data Accuracy: 0.8511627906976744
SVM - Test Data Accuracy: 0.9906976744186047
Random Forest - Test Data Accuracy: 0.9476744186046512

Best Model: SVM with accuracy 0.9906976744186047


In [5]:
import joblib
joblib.dump(models, 'best_model.pkl')

['best_model.pkl']