In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Ignore warnings
# NOTE(review): this filter silences every warning for the rest of the session,
# so any convergence or failed-fit warnings raised while models are trained
# below would be hidden too — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the labelled messages from the CSV file (path is relative to the
# current working directory)
DATA_PATH = 'spam mail.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to see what the columns contain
df.head()

Unnamed: 0,Category,Masseges
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Per-column summary (count / unique / top / freq for these text columns)
df.describe()

Unnamed: 0,Category,Masseges
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
# Report how many entries are missing in each column
missing_per_column = df.isnull().sum()
print("\nMissing values in the dataset:")
print(missing_per_column)

# Discard incomplete rows (if any); rebinding keeps the cell free of in-place edits
df = df.dropna()


Missing values in the dataset:
Category    0
Masseges    0
dtype: int64


In [6]:
# Confirm that no missing entries remain once incomplete rows were dropped
remaining_missing = df.isnull().sum()
print("\nMissing values after dropping:")
print(remaining_missing)


Missing values after dropping:
Category    0
Masseges    0
dtype: int64


In [7]:
# Separate the message text (features, X) from the ham/spam label (target, y).
# Update the column names here if your dataset uses different ones.
X, y = df['Masseges'], df['Category']

# Count how many labels are missing
missing_label_count = y.isnull().sum()
print("\nChecking for NaN values in labels:")
print(missing_label_count)


Checking for NaN values in labels:
0


In [8]:
# Re-check the raw labels for NaN values. At this point y still holds the
# original 'ham'/'spam' strings; the 0/1 mapping is applied in a later cell,
# so the message says "before mapping".
print("\nNaN values in labels before mapping:")
print(y.isnull().sum())

# Drop any rows where y is NaN. The same mask is applied to df, X and y so the
# three stay aligned row-for-row (filtering df alone would not touch the
# X and y Series that were already extracted from it).
labelled_rows = y.notna()
df = df[labelled_rows]
X = X[labelled_rows]
y = y[labelled_rows]


NaN values in labels after mapping:
0


In [9]:
# List the column names present in the DataFrame
df.columns

Index(['Category', 'Masseges'], dtype='object')

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Convert text to numerical data: learn a vocabulary from the messages in X
# and encode each message as a vector of token counts.
# NOTE(review): the vocabulary is fit on every row of X, including rows that a
# later train/test split may hold out for testing — confirm this is intended.
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

In [12]:
# Convert labels to binary values (0 for ham, 1 for spam).
# The mapping reads from the untouched df column instead of from y itself so
# that re-running this cell is safe: mapping an already-encoded y would turn
# every label into NaN.
label_codes = {'ham': 0, 'spam': 1}  # Adjust mapping according to your dataset
y = df['Category'].map(label_codes)

# Series.map yields NaN for any label that is not a key of label_codes;
# fail loudly here rather than letting NaN targets reach the model.
unmapped_labels = df.loc[y.isnull(), 'Category'].unique().tolist()
if unmapped_labels:
    raise ValueError(f"Labels without a 0/1 code in label_codes: {unmapped_labels}")

In [13]:
# Split the dataset into training and testing sets.
# The raw messages are split first so the held-out rows never influence
# preprocessing; test_size and random_state are unchanged, so the same rows
# land in each split as before.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the vocabulary from the training messages only, then encode both
# splits with it. `vectorizer` is rebound so that it always matches the
# feature space the models below are trained on.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

In [14]:
# Create a logistic regression model; max_iter sets the solver's iteration
# limit to 1000
model = LogisticRegression(max_iter=1000)  

In [15]:
# Train the model on the training features and their 0/1 labels
model.fit(X_train, y_train)

In [16]:
# Make predictions on the test set; y_pred is compared against y_test below
y_pred = model.predict(X_test)

In [17]:
from sklearn.metrics import accuracy_score

# Compute all three test-set metrics for the baseline model up front
accuracy = accuracy_score(y_test, y_pred)          # share of correct predictions
confusion = confusion_matrix(y_test, y_pred)       # rows: true class, columns: predicted class
report = classification_report(y_test, y_pred)     # per-class precision / recall / F1

# Print them in the order: accuracy, confusion matrix, classification report
print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:', confusion, sep='\n')
print('Classification Report:', report, sep='\n')


Accuracy: 0.98
Confusion Matrix:
[[965   0]
 [ 24 126]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [18]:
## Apply hyperparameter tuning

In [19]:
# Define hyperparameter grid.
# A single dict would be expanded into the full cross-product, which includes
# solver/penalty pairs LogisticRegression does not support (liblinear with
# elasticnet or with no penalty, elasticnet without l1_ratio). The grid is
# therefore a list of sub-grids, each containing only valid combinations.
C_VALUES = [0.01, 0.1, 1, 10, 100]  # Inverse of regularization strength (smaller = stronger)

param_grid = [
    # Plain L1 / L2 penalties are supported by both solvers
    {'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2'], 'C': C_VALUES},
    # Elastic net is saga-only and needs an explicit L1/L2 mixing ratio;
    # add more l1_ratio values here to widen the search
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5], 'C': C_VALUES},
    # No regularization: saga only, and C is not searched because it has no
    # effect. None replaces the string 'none', which newer scikit-learn
    # releases no longer accept.
    {'solver': ['saga'], 'penalty': [None]},
]


In [None]:
# Set up an exhaustive search over param_grid for the logistic regression
# model, scored by accuracy with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)

In [None]:
# Run the cross-validated search on the training split
grid_search.fit(X_train, y_train)

# Show the parameter combination the search selected
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Predict the test split with the best estimator found by the search
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Compute the same three metrics as for the baseline model
accuracy = accuracy_score(y_test, y_pred)          # share of correct predictions
confusion = confusion_matrix(y_test, y_pred)       # rows: true class, columns: predicted class
report = classification_report(y_test, y_pred)     # per-class precision / recall / F1

# Print them in the order: accuracy, confusion matrix, classification report
print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:', confusion, sep='\n')
print('Classification Report:', report, sep='\n')