In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score , classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
df = pd.read_csv("SMSSpamCollection", sep='\t', names = ['label', 'text'])
!pip install nltk

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
swords = stopwords.words('english')
ps = PorterStemmer()

In [None]:
sent = 'Hello friends! How are you? We will be learning Python today. is the'
def clean_text(sent):
    tokens = word_tokenize(sent)
    clean = [word for word in tokens
             if word.isdigit() or word.isalpha()]
    clean = [ps.stem(word) for word in clean if word not in swords]
    return (clean)
clean_text(sent)

In [None]:
tfidf = TfidfVectorizer(analyzer = clean_text)
x = df['text']
y = df['label']
x_new = tfidf.fit_transform(x)
tfidf.get_feature_names()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_new, y, random_state = 0, test_size=0.25)

In [None]:
nb = GaussianNB()
nb.fit(x_train.toarray(), y_train)
y_pred = nb.predict(x_test.toarray())
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
print(classification_report(y_test, y_pred))
accuracy_score(y_test, y_pred)

In [None]:
rf = RandomForestClassifier(random_state = 0)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
print(classification_report(y_test, y_pred))

In [78]:
log = LogisticRegression()
log.fit(x_train, y_train)
y_pred = log.predict(x_test)
accuracy_score(y_test, y_pred)

0.9641062455132807

In [77]:
params = {
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
    'random_state': [0, 1, 2, 3, 4],
    'class_weight': ['balanced', 'balanced_subsample']
}

In [76]:
grid = GridSearchCV(rf, param_grid = params, cv= 5, scoring = 'accuracy')
y_pred = rf.predict(x_test)
accuracy_score(y_test, y_pred)

0.9834888729361091