In [9]:
# Cell 1: Import Libraries
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report



In [2]:
import pandas as pd

df = pd.read_csv('cleaned_data.csv', encoding='latin1')
df = df.drop_duplicates()

In [3]:
df.to_csv('cleaned_data.csv', index=False)


In [6]:
# Cell 2: Prepare Data
import pandas as pd

# Load dataset
data = pd.read_csv('cleaned_data.csv')

# Check class distribution
print(data['Message'].value_counts())


Message
Rofl. Its true to its name                                                                                                                                          1
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                                     1
Ok lar... Joking wif u oni...                                                                                                                                       1
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's         1
U dun say so early hor... U c already then say...                                                                                                                   1
                                                                                                                                                                  

In [7]:
# Cell 3
print(data.head())

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [8]:
# Cell 4
print(data.columns)

Index(['Category', 'Message'], dtype='object')


In [9]:
columns_to_drop = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data.drop(columns=[col for col in columns_to_drop if col in data.columns], inplace=True, errors='ignore')


In [10]:
import pickle

# Load the vectorizer
cv = pickle.load(open('vector123.pkl', 'rb'))

# Check the vocabulary (list of features/words)
print(cv.get_feature_names_out()[:20])  # Display first 20 features


['00' '000' '000pes' '008704050406' '0089' '0121' '01223585236'
 '01223585334' '0125698789' '02' '0207' '02072069400' '02073162414'
 '02085076972' '021' '03' '04' '0430' '05' '050703']


In [11]:
import re

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation, numbers, etc.
    text = re.sub(r'\W', ' ', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Example text
user_input = "Hi friend, how are you? I hope you are doing good."
processed_text = preprocess_text(user_input)



In [16]:
# Cell 7
# Rename columns for clarity
data.rename(columns={'v1': 'Label', 'v2': 'Text'}, inplace=True)


In [12]:
# Map 'ham' to 0 and 'spam' to 1
data['Message'] = data['Message'].map({'ham': 0, 'spam': 1})


In [13]:
# Display dataset information
print(data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5157 entries, 0 to 5156
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Category  5157 non-null   object 
 1   Message   0 non-null      float64
dtypes: float64(1), object(1)
memory usage: 80.7+ KB
None


In [14]:
# Check for null values
print(data.isnull().sum())


Category       0
Message     5157
dtype: int64


In [15]:
X = data['Category']  # Feature: text messages
y = data['Message']  # Label: spam (1) or ham (0)



In [16]:
print(data['Message'].value_counts())


Series([], Name: count, dtype: int64)


In [18]:
from sklearn.feature_extraction.text import CountVectorizer  # Import CountVectorizer

# Convert text data to feature vectors using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)  # X is your text data


In [19]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [20]:
import numpy as np

# Remove rows with NaN in y_train
X_train = X_train[~y_train.isnull()]
y_train = y_train.dropna()


In [21]:
y_train = y_train.fillna(0)  # Fill NaN with 0 (or choose another value)


In [22]:
import numpy as np

# Convert sparse matrix to dense
X_train_dense = X_train.toarray()

# Check for NaNs
print(np.isnan(X_train_dense).sum())  # Number of NaNs in X_train


0


In [23]:
print(y_train.isnull().sum())  # Check for NaNs in y_train


0


In [24]:
# Check for NaN in y_train (whether it's a pandas Series or ndarray)
if hasattr(y_train, 'isnull'):
    print(y_train.isnull().sum())  # For pandas Series
else:
    print(np.isnan(y_train).sum())  # For ndarray (or numpy array)


0


In [25]:
# Assuming y_train is a pandas Series
y_train = y_train.dropna()

# Ensure X_train is also filtered in the same way
X_train = X_train[y_train.index]


In [26]:
y_train = y_train.fillna(0)  # or 'non-spam' depending on your use case


In [27]:
X_train = X_train[y_train.index]  # Ensures X_train and y_train match after handling NaNs


In [28]:
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (0, 2)
X_test shape: (1032, 2)
y_train shape: (0,)
y_test shape: (1032,)


In [29]:
print(f"Dataset shape: {X.shape}, {y.shape}")

Dataset shape: (5157, 2), (5157,)


In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
print(X[:5])  # Show first 5 samples
print(y[:5])  # Show first 5 labels

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 5 stored elements and shape (5, 2)>
  Coords	Values
  (0, 0)	1
  (1, 0)	1
  (2, 1)	1
  (3, 0)	1
  (4, 0)	1
0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: Message, dtype: float64


In [36]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pickle

# Load the dataset
df = pd.read_csv('cleaned_data.csv')
print(df.head())  # Debugging: Check the first few rows

# Ensure no missing values
df.dropna(inplace=True)

# Preprocessing
X = df['Message']  # The text data (features)
y = df['Category']  # The labels (target variable)

# Vectorization
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(X)  # Apply vectorization to the message text
print(X.shape)  # Debugging: Ensure non-zero rows and columns

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)  # Debugging: Ensure valid training data

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)  # Should now work without errors

# Evaluate the Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Save the Model and Vectorizer
pickle.dump(model, open('spam_model.pkl', 'wb'))
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))


  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
(5157, 8476)
(4125, 8476) (4125,)
Model Accuracy: 97.29%


In [37]:
# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

In [38]:
# Make predictions
y_pred = model.predict(X_test)


In [39]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9728682170542635

Classification Report:
               precision    recall  f1-score   support

         ham       0.99      0.98      0.98       896
        spam       0.86      0.96      0.90       136

    accuracy                           0.97      1032
   macro avg       0.92      0.97      0.94      1032
weighted avg       0.98      0.97      0.97      1032



In [41]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Load the dataset with proper encoding to avoid character issues
df = pd.read_csv('cleaned_data.csv', encoding='utf-8')

# Display the first few rows for verification
print(df.head())

# Ensure no missing values
df.dropna(inplace=True)

# Preprocess the text by converting to lowercase and removing non-alphabetic characters
df['Message'] = df['Message'].str.lower().str.replace(r'[^a-z\s]', '', regex=True)

# Map 'ham' to 0 and 'spam' to 1
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})

# Separate features (X) and labels (y)
X = df['Message']
y = df['Category']

# Vectorization (Converting text to numerical features)
vectorizer = CountVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2))
X = vectorizer.fit_transform(X)

# Display the shape of the vectorized data
print(f"Feature matrix shape: {X.shape}")

# Train-Test Split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training data shape: {X_train.shape}, Test data shape: {X_test.shape}")

# Train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Print classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Save the trained model and vectorizer
with open('spam_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('vectorizer.pkl', 'wb') as vector_file:
    pickle.dump(vectorizer, vector_file)

print("Model and vectorizer saved successfully!")


  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
Feature matrix shape: (5157, 5000)
Training data shape: (4125, 5000), Test data shape: (1032, 5000)
Model Accuracy: 97.97%

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       896
           1       0.91      0.94      0.92       136

    accuracy                           0.98      1032
   macro avg       0.95      0.96      0.96      1032
weighted avg       0.98      0.98      0.98      1032

Model and vectorizer saved successfully!


In [42]:
# Save the trained model and vectorizer for future use
import pickle
with open('spam_detector.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)


In [43]:
with open('vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)



In [44]:
from sklearn.metrics import classification_report
print(classification_report(y_test, model.predict(X_test)))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       896
           1       0.91      0.94      0.92       136

    accuracy                           0.98      1032
   macro avg       0.95      0.96      0.96      1032
weighted avg       0.98      0.98      0.98      1032



In [45]:
# Import necessary libraries
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Step 1: Create an imbalanced dataset
X, y = make_classification(
    n_classes=2,
    class_sep=2,
    weights=[0.9, 0.1],
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=5,
    n_clusters_per_class=1,
    n_samples=1000,
    random_state=42
)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Step 2: Handle Imbalanced Data with SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Step 3: Train the classifier (with and without balanced class weights)
clf_no_weight = RandomForestClassifier(random_state=42)
clf_with_weight = RandomForestClassifier(class_weight='balanced', random_state=42)

# Train models
clf_no_weight.fit(X_train, y_train)
clf_with_weight.fit(X_train_balanced, y_train_balanced)

# Step 4: Predict and evaluate the model
y_pred_no_weight = clf_no_weight.predict(X_test)
y_pred_with_weight = clf_with_weight.predict(X_test)

# Step 5: Evaluate metrics (with zero_division)
print("Without Handling Class Weights:")
print(classification_report(y_test, y_pred_no_weight, zero_division=0))

print("\nWith Balanced Class Weights and SMOTE:")
print(classification_report(y_test, y_pred_with_weight, zero_division=0))


Without Handling Class Weights:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       270
           1       0.97      1.00      0.98        30

    accuracy                           1.00       300
   macro avg       0.98      1.00      0.99       300
weighted avg       1.00      1.00      1.00       300


With Balanced Class Weights and SMOTE:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       270
           1       0.97      1.00      0.98        30

    accuracy                           1.00       300
   macro avg       0.98      1.00      0.99       300
weighted avg       1.00      1.00      1.00       300



In [79]:
print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!
