In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the dataset
df = pd.read_csv('/content/spam mail.csv')

In [4]:
df.head()

Unnamed: 0,Category,Masseges
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.describe()

Unnamed: 0,Category,Masseges
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
#  Check for missing values
print("\nMissing values in the dataset:")
print(df.isnull().sum())

# Dropping rows with missing values (if any)
df.dropna(inplace=True)


Missing values in the dataset:
Category    0
Masseges    0
dtype: int64


In [7]:
# Check again for missing values to confirm removal
print("\nMissing values after dropping:")
print(df.isnull().sum())


Missing values after dropping:
Category    0
Masseges    0
dtype: int64


In [8]:
# Split the dataset into features (X) and labels (y)
X = df['Masseges']  # Update with your text column name
y = df['Category']    # Update with your label column name

# Check for NaN values in labels
print("\nChecking for NaN values in labels:")
print(y.isnull().sum())


Checking for NaN values in labels:
0


In [9]:
# Check for NaN values after mapping
print("\nNaN values in labels after mapping:")
print(y.isnull().sum())

# Drop any rows where y is NaN
df = df[~y.isnull()]


NaN values in labels after mapping:
0


In [10]:
df.columns

Index(['Category', 'Masseges'], dtype='object')

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Convert text to numerical data
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

In [13]:
# Convert labels to binary values (0 for ham, 1 for spam)
y = y.map({'ham': 0, 'spam': 1})  # Adjust mapping according to your dataset

In [14]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

In [15]:
# Create a logistic regression model
model = LogisticRegression(max_iter=1000)

In [16]:
# Train the model
model.fit(X_train, y_train)

In [17]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [18]:
# Calculate accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Confusion matrix
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(confusion)

# Classification report
report = classification_report(y_test, y_pred)
print('Classification Report:')
print(report)


Accuracy: 0.98
Confusion Matrix:
[[964   1]
 [ 24 126]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [19]:
## Apply HyperParamter

In [20]:
# Define hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'saga'],  # Solvers to use
    'penalty': ['l1', 'l2', 'elasticnet', 'none']  # Regularization type
}


In [21]:
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5)

In [22]:
# Fit the model
grid_search.fit(X_train, y_train)


In [23]:
# Print best hyperparameters
print(f"Best Hyperparameters: {grid_search.best_params_}")


Best Hyperparameters: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}


In [24]:
# Make predictions with the best estimator
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Confusion matrix
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(confusion)

# Classification report
report = classification_report(y_test, y_pred)
print('Classification Report:')
print(report)

Accuracy: 0.98
Confusion Matrix:
[[963   2]
 [ 21 129]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.86      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [25]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the model
dt_model = DecisionTreeClassifier()

# Fit the model
dt_model.fit(X_train, y_train)

# Make predictions
dt_y_pred = dt_model.predict(X_test)

# Evaluate the model
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print(f'Decision Tree Accuracy: {dt_accuracy:.2f}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, dt_y_pred))
print('Classification Report:')
print(classification_report(y_test, dt_y_pred))

Decision Tree Accuracy: 0.97
Confusion Matrix:
[[952  13]
 [ 16 134]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       965
           1       0.91      0.89      0.90       150

    accuracy                           0.97      1115
   macro avg       0.95      0.94      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [26]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
rf_model = RandomForestClassifier()

# Fit the model
rf_model.fit(X_train, y_train)

# Make predictions
rf_y_pred = rf_model.predict(X_test)

# Evaluate the model
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print(f'Random Forest Accuracy: {rf_accuracy:.2f}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, rf_y_pred))
print('Classification Report:')
print(classification_report(y_test, rf_y_pred))


Random Forest Accuracy: 0.98
Confusion Matrix:
[[965   0]
 [ 26 124]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [27]:
from sklearn.svm import SVC

# Initialize the model
svm_model = SVC()

# Fit the model
svm_model.fit(X_train, y_train)

# Make predictions
svm_y_pred = svm_model.predict(X_test)

# Evaluate the model
svm_accuracy = accuracy_score(y_test, svm_y_pred)
print(f'SVM Accuracy: {svm_accuracy:.2f}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, svm_y_pred))
print('Classification Report:')
print(classification_report(y_test, svm_y_pred))


SVM Accuracy: 0.98
Confusion Matrix:
[[965   0]
 [ 24 126]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [28]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the model
nb_model = MultinomialNB()

# Fit the model
nb_model.fit(X_train, y_train)

# Make predictions
nb_y_pred = nb_model.predict(X_test)

# Evaluate the model
nb_accuracy = accuracy_score(y_test, nb_y_pred)
print(f'Naive Bayes Accuracy: {nb_accuracy:.2f}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, nb_y_pred))
print('Classification Report:')
print(classification_report(y_test, nb_y_pred))

Naive Bayes Accuracy: 0.98
Confusion Matrix:
[[952  13]
 [ 11 139]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.91      0.93      0.92       150

    accuracy                           0.98      1115
   macro avg       0.95      0.96      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [37]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN model
knn_model = KNeighborsClassifier(n_neighbors=3)  # You can adjust n_neighbors as needed

# Fit the model
knn_model.fit(X_train, y_train)

# Make predictions
knn_y_pred = knn_model.predict(X_test)

# Evaluate the model
knn_accuracy = accuracy_score(y_test, knn_y_pred)
print(f'KNN Accuracy: {knn_accuracy:.2f}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, knn_y_pred))
print('Classification Report:')
print(classification_report(y_test, knn_y_pred))


KNN Accuracy: 0.93
Confusion Matrix:
[[965   0]
 [ 79  71]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       965
           1       1.00      0.47      0.64       150

    accuracy                           0.93      1115
   macro avg       0.96      0.74      0.80      1115
weighted avg       0.93      0.93      0.92      1115



In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
from sklearn.svm import SVC

# Use TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_vectorized = tfidf_vectorizer.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for SVM
svm_params = {
    'C': [0.1, 1, 10, 100],         # Regularization parameter
    'kernel': ['linear', 'rbf'],    # Kernel type
    'gamma': ['scale', 'auto']      # Kernel coefficient
}

# Create a GridSearchCV object
svm_grid = GridSearchCV(SVC(), svm_params, cv=5)
svm_grid.fit(X_train, y_train)

# Get the best model
svm_best_model = svm_grid.best_estimator_

# Make predictions
svm_y_pred = svm_best_model.predict(X_test)

# Evaluate SVM
print("SVM with Hyperparameter Tuning:")
print(f'Accuracy: {accuracy_score(y_test, svm_y_pred):.2f}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, svm_y_pred))
print('Classification Report:')
print(classification_report(y_test, svm_y_pred))


SVM with Hyperparameter Tuning:
Accuracy: 0.98
Confusion Matrix:
[[961   4]
 [ 23 127]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.85      0.90       150

    accuracy                           0.98      1115
   macro avg       0.97      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

