In [1]:
import os
import sys
import nltk 


try:
    notebook_path = os.path.abspath('') 
except NameError:
    notebook_path = os.getcwd() 

project_root = os.path.abspath(os.path.join(notebook_path, '..'))

if project_root not in sys.path:
    sys.path.insert(0, project_root)

print(f"Added to sys.path: {project_root}")
print("Current sys.path:")
for p in sys.path:
    print(f"  {p}")

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

from src.preprocess import clean_text

if 'clean_text' in locals() and callable(clean_text):
    print("clean_text is callable and loaded successfully.")
else:
    print("Error: clean_text is not defined or not callable after import.")

Added to sys.path: c:\Users\asus\OneDrive\Desktop\projects\spam_classifier
Current sys.path:
  c:\Users\asus\OneDrive\Desktop\projects\spam_classifier
  c:\Users\asus\anaconda3\python313.zip
  c:\Users\asus\anaconda3\DLLs
  c:\Users\asus\anaconda3\Lib
  c:\Users\asus\anaconda3
  
  c:\Users\asus\anaconda3\Lib\site-packages
  c:\Users\asus\anaconda3\Lib\site-packages\win32
  c:\Users\asus\anaconda3\Lib\site-packages\win32\lib
  c:\Users\asus\anaconda3\Lib\site-packages\Pythonwin
clean_text is callable and loaded successfully.


In [None]:


models_dir_path = os.path.join(project_root, 'models')

print(f"Attempting to save model to: {models_dir_path}")

if os.path.exists(models_dir_path):
    print(f"Directory '{models_dir_path}' exists.")
    if os.path.isdir(models_dir_path):
        print(f"'{models_dir_path}' is a directory.")
        # Try creating a dummy file to test write permissions
        try:
            test_file_path = os.path.join(models_dir_path, 'test_write.txt')
            with open(test_file_path, 'w') as f:
                f.write("Test.")
            print(f"Successfully wrote to '{test_file_path}'. Write permissions are OK.")
            os.remove(test_file_path) # Clean up
        except Exception as e:
            print(f"Error writing to directory: {e}. Permissions might be an issue.")
    else:
        print(f"ERROR: '{models_dir_path}' exists but is NOT a directory.")
else:
    print(f"ERROR: Directory '{models_dir_path}' does NOT exist.")

Attempting to save model to: c:\Users\asus\OneDrive\Desktop\projects\spam_classifier\models
Directory 'c:\Users\asus\OneDrive\Desktop\projects\spam_classifier\models' exists.
'c:\Users\asus\OneDrive\Desktop\projects\spam_classifier\models' is a directory.
Successfully wrote to 'c:\Users\asus\OneDrive\Desktop\projects\spam_classifier\models\test_write.txt'. Write permissions are OK.


In [3]:
import pandas as pd
import nltk
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from src.preprocess import clean_text
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline 

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
df = pd.read_csv('../data/spam.csv', sep='\t', header=None, names=['label', 'message'])

In [6]:
df['cleaned_message'] = df['message'].apply(clean_text)

In [7]:
#labels to numeric format
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_message'], df['label'], test_size=0.2, random_state=42)

print("Original message example:", df['message'][2])
print("Cleaned message example:", df['cleaned_message'][2])

Original message example: Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
Cleaned message example: free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri questionstd txt ratetc appli 08452810075over18


In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [10]:
count_vectorizer = CountVectorizer()
X_train_cv = count_vectorizer.fit_transform(X_train)
X_test_cv = count_vectorizer.transform(X_test)


In [11]:
from imblearn.over_sampling import SMOTE
sm_cv = SMOTE(random_state=42)
X_train_cv_res, y_train_cv_res = sm_cv.fit_resample(X_train_cv, y_train)
print(f"CV Resampled distribution: {pd.Series(y_train_cv_res).value_counts()}")


CV Resampled distribution: label
1    3859
0    3859
Name: count, dtype: int64


In [12]:
#tfidfvecorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [13]:
sm_tfidf = SMOTE(random_state=42)
X_train_tfidf_res, y_train_tfidf_res = sm_tfidf.fit_resample(X_train_tfidf, y_train)
print(f"TFIDF Resampled distribution: {pd.Series(y_train_tfidf_res).value_counts()}")


TFIDF Resampled distribution: label
1    3859
0    3859
Name: count, dtype: int64


In [14]:
print("Shape of X_train_tfidf:", X_train_tfidf.shape)

Shape of X_train_tfidf: (4457, 7142)


In [15]:
from src.train import train_model, evaluate_model, save_model
import os
import joblib

In [16]:
import matplotlib.pyplot as plt

In [17]:
#naive bayes with countvectorizer
nb_cv_model = train_model(X_train_cv_res, y_train_cv_res, model_name="MultinomialNB")
evaluate_model(nb_cv_model, X_test_cv, y_test, model_name="Naive Bayes (CountVectorizer)")
save_model(nb_cv_model, os.path.join(project_root, 'models', 'nb_cv_model.pkl'))


--- Naive Bayes (CountVectorizer) Evaluation ---
Accuracy: 0.9695
Precision: 0.8662
Recall: 0.9128
F1 Score: 0.8889
Confusion Matrix:
 [[945  21]
 [ 13 136]]
Model saved to c:\Users\asus\OneDrive\Desktop\projects\spam_classifier\models\nb_cv_model.pkl


In [18]:
#logistic regression with count vectotizer
lr_cv_model = train_model(X_train_cv_res, y_train_cv_res, model_name="LogisticRegression")
evaluate_model(lr_cv_model, X_test_cv, y_test, model_name="Logistic Regression (CountVectorizer)")
save_model(lr_cv_model, os.path.join(project_root, 'models', 'lr_cv_model.pkl'))



--- Logistic Regression (CountVectorizer) Evaluation ---
Accuracy: 0.9211
Precision: 0.6393
Recall: 0.9396
F1 Score: 0.7609
Confusion Matrix:
 [[887  79]
 [  9 140]]
Model saved to c:\Users\asus\OneDrive\Desktop\projects\spam_classifier\models\lr_cv_model.pkl


In [19]:
#naive bayes with tfidf
nb_tfidf_model = train_model(X_train_tfidf_res, y_train_tfidf_res, model_name="MultinomialNB")
evaluate_model(nb_tfidf_model, X_test_tfidf, y_test, model_name="Naive Bayes (TfidfVectorizer)")
save_model(nb_tfidf_model, os.path.join(project_root, 'models', 'nb_tfidf_model.pkl'))



--- Naive Bayes (TfidfVectorizer) Evaluation ---
Accuracy: 0.9740
Precision: 0.8704
Recall: 0.9463
F1 Score: 0.9068
Confusion Matrix:
 [[945  21]
 [  8 141]]
Model saved to c:\Users\asus\OneDrive\Desktop\projects\spam_classifier\models\nb_tfidf_model.pkl


In [20]:
#logistic regression with tfidf
lr_tfidf_model = train_model(X_train_tfidf_res, y_train_tfidf_res, model_name="LogisticRegression")
evaluate_model(lr_tfidf_model, X_test_tfidf, y_test, model_name="Logistic Regression (TfidfVectorizer)")
save_model(lr_tfidf_model, os.path.join(project_root, 'models', 'lr_tfidf_model.pkl'))



--- Logistic Regression (TfidfVectorizer) Evaluation ---
Accuracy: 0.9803
Precision: 0.9320
Recall: 0.9195
F1 Score: 0.9257
Confusion Matrix:
 [[956  10]
 [ 12 137]]
Model saved to c:\Users\asus\OneDrive\Desktop\projects\spam_classifier\models\lr_tfidf_model.pkl


In [21]:
#from sklearn.pipeline import Pipeline
#from sklearn.model_selection import GridSearchCV
#from sklearn.feature_extraction.text import TfidfVectorizer # Included for clarity of pipeline definition
#from sklearn.linear_model import LogisticRegression
#import joblib
#import os

In [22]:
# print("Starting Hyperparameter Tuning with GridSearchCV (using imblearn.Pipeline and SMOTE)...")

# # --- Define the Pipeline (using imblearn.pipeline.Pipeline) ---
# pipeline = Pipeline([
#     ('tfidf', TfidfVectorizer()), # Step 1: Text to numerical features
#     ('smote', SMOTE(random_state=42)), # Step 2: Handle imbalance (applied on numerical data)
#     ('lr', LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)) # Step 3: Classification model
# ])

In [23]:
# param_grid = {
#     'tfidf__max_features': [5000, 10000, None],
#     'tfidf__ngram_range': [(1, 1), (1, 2)],
#     'tfidf__min_df': [1, 5, 10],
#     'tfidf__max_df': [1.0, 0.9, 0.7],

#     'smote__sampling_strategy': ['minority', 0.5, 1.0],
#     'smote__k_neighbors': [3, 5, 7],

#     'lr__C': [0.1, 1, 10, 100],
#     'lr__solver': ['liblinear', 'saga'],
# }


In [24]:
# grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=2, n_jobs=-1, scoring='f1')
# grid_search.fit(X_train, y_train)

In [25]:
# print("\nBest parameters found:")
# print(grid_search.best_params_)


In [26]:
# print("\nBest F1-Score (from cross-validation):")
# print(grid_search.best_score_)


In [27]:
# best_model_pipeline = grid_search.best_estimator_


In [28]:
from sklearn.metrics import classification_report, confusion_matrix



In [29]:
# y_pred = best_model.predict(X_test)
# print("\nClassification Report on Test Set (Best Model):")
# print(classification_report(y_test, y_pred))

In [31]:
# print("\nConfusion Matrix on Test Set (Best Model):")
# print(confusion_matrix(y_test, y_pred))


In [32]:
joblib.dump(count_vectorizer, os.path.join(project_root, 'models', 'count_vectorizer.pkl'))
joblib.dump(tfidf_vectorizer, os.path.join(project_root, 'models', 'tfidf_vectorizer.pkl')) 

['c:\\Users\\asus\\OneDrive\\Desktop\\projects\\spam_classifier\\models\\tfidf_vectorizer.pkl']

In [33]:
# joblib.dump(best_model_pipeline, os.path.join(project_root, 'models', 'best_lr_tfidf_smote_pipeline.pkl'))
# print(f"Best model pipeline saved to: {os.path.join(project_root, 'models', 'best_lr_tfidf_smote_pipeline.pkl')}")


In [34]:
import pandas as pd
def show_top_words(model,vectorizer,top_n=20):
    if not hasattr(model,'coef_'):
        print("model does not have 'coef_' attribute.this is for logistic regression")
        return
    feature_names = vectorizer.get_feature_names_out()
    coefs = model.coef_[0]
    
    word_coef_df = pd.DataFrame({'word': feature_names, 'coefficient': coefs})

    # Sort for spam
    top_spam_words = word_coef_df.sort_values(by='coefficient', ascending=False).head(top_n)
    print(f"\nTop {top_n} words contributing to SPAM:")
    print(top_spam_words)
    
    #sort for ham
    top_ham_words = word_coef_df.sort_values(by='coefficient', ascending=True).head(top_n)
    print(f"\nTop {top_n} words contributing to HAM:")
    print(top_ham_words)
    
# with Logistic Regression (CountVectorizer)
print("\n--- Top words for Logistic Regression (CountVectorizer) ---")
show_top_words(lr_cv_model, count_vectorizer)

# with Logistic Regression (TfidfVectorizer)
#print("\n--- Top words for Logistic Regression (TfidfVectorizer) ---")
#show_top_words(lr_tfidf_model, tfidf_vectorizer)


--- Top words for Logistic Regression (CountVectorizer) ---

Top 20 words contributing to SPAM:
         word  coefficient
1775    claim     2.202755
5535   servic     1.916296
341        18     1.764362
6465      txt     1.762666
4225    mobil     1.682302
6349     tone     1.526371
5939     stop     1.524494
5034    prize     1.520516
5267    repli     1.484065
5324  rington     1.452480
2748  freemsg     1.425752
6005      sue     1.412743
1896  contact     1.365754
1642     cash     1.321188
4406      new     1.308708
6182     text     1.240417
6490       uk     1.224227
1571     call     1.223478
1926     cost     1.194051
302       150     1.163459

Top 20 words contributing to HAM:
       word  coefficient
4587     ok    -2.965973
3953   ltgt    -2.954375
5663    sir    -2.606108
3337    ill    -2.413801
3763  later    -2.327995
2019     da    -2.317441
3340     im    -2.272857
4305   much    -2.215519
6741    wat    -2.185886
1839   come    -2.158885
5929  still    -2.122354
3