In [1]:
import pandas as pd

# Load the dataset
url = ("spam.csv")  # Replace with the actual path to the CSV file
df = pd.read_csv(url,encoding = "latin-1", header=None, names=['label', 'message', 'Unnamed_2', 'Unnamed_3', 'Unnamed_4'])

# Drop unnecessary columns
df = df[['label', 'message']]

# Display the first few rows to understand its structure
print(df.head())


  label                                            message
0    v1                                                 v2
1   ham  Go until jurong point, crazy.. Available only ...
2   ham                      Ok lar... Joking wif u oni...
3  spam  Free entry in 2 a wkly comp to win FA Cup fina...
4   ham  U dun say so early hor... U c already then say...


In [2]:
df

Unnamed: 0,label,message
0,v1,v2
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
...,...,...
5568,spam,This is the 2nd time we have tried 2 contact u...
5569,ham,Will Ì_ b going to esplanade fr home?
5570,ham,"Pity, * was in mood for that. So...any other s..."
5571,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Check for missing values
print(df.isnull().sum())

# Check the distribution of labels
print(df['label'].value_counts())


label      0
message    0
dtype: int64
label
ham     4825
spam     747
v1         1
Name: count, dtype: int64


In [4]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import nltk

# Download stopwords from NLTK if not already downloaded
nltk.download('stopwords')

# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(f'[{string.punctuation}]', ' ', text)  # Remove punctuation
    text = text.split()  # Tokenize
    text = [word for word in text if word not in stop_words]  # Remove stopwords
    text = [stemmer.stem(word) for word in text]  # Stemming
    return ' '.join(text)

# Apply preprocessing to the messages
df['clean_message'] = df['message'].apply(preprocess_text)

# Check the cleaned text
print(df['clean_message'].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\FC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0                                                    v
1    go jurong point crazi avail bugi n great world...
2                                ok lar joke wif u oni
3    free entri wkli comp win fa cup final tkt st m...
4                  u dun say earli hor u c alreadi say
Name: clean_message, dtype: object


In [5]:
df['clean_message']

0                                                       v
1       go jurong point crazi avail bugi n great world...
2                                   ok lar joke wif u oni
3       free entri wkli comp win fa cup final tkt st m...
4                     u dun say earli hor u c alreadi say
                              ...                        
5568    nd time tri contact u u å£ pound prize claim e...
5569                              ì b go esplanad fr home
5570                                    piti mood suggest
5571    guy bitch act like interest buy someth els nex...
5572                                       rofl true name
Name: clean_message, Length: 5573, dtype: object

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Transform the messages to TF-IDF features
X = vectorizer.fit_transform(df['clean_message']).toarray()

# Convert labels to binary (0 for ham, 1 for spam)
y = df['label'].apply(lambda x: 1 if x == 'spam' else 0).values

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
print(classification_report(y_test, y_pred, target_names=['ham', 'spam']))


Accuracy: 0.97
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       964
        spam       0.98      0.82      0.90       151

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [8]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}


In [9]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define parameter distributions for RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(RandomForestClassifier(random_state=10), param_distributions=param_dist, n_iter=5, cv=5, scoring='accuracy', random_state=10)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Print best parameters and score
print(f'Best parameters: {random_search.best_params_}')
print(f'Best cross-validation score: {random_search.best_score_:.2f}')


Best parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}
Best cross-validation score: 0.98


In [10]:
best_model = random_search.best_estimator_


In [11]:
y_pred_best = best_model.predict(X_test)


In [12]:
from sklearn.metrics import classification_report, accuracy_score

final_accuracy = accuracy_score(y_test, y_pred_best)
final_report = classification_report(y_test, y_pred_best, target_names=['ham', 'spam'])

print(f'Final Test Accuracy: {final_accuracy:.2f}')
print(final_report)


Final Test Accuracy: 0.98
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       964
        spam       0.98      0.86      0.92       151

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

