In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the SMS spam CSV, keeping only the first two columns (label, text).
# Selecting them by position is explicit; the previous approach dropped every
# column containing a NaN, which would also discard a real column if it ever
# held a missing value and then break the two-name assignment below.
# NOTE(review): assumes the label is column 0 and the message text column 1 —
# confirm against the file header.
df = pd.read_csv(
    '/kaggle/input/sms-spam-collection-dataset/spam.csv',
    encoding='latin-1',
    usecols=[0, 1],
)
df.columns = ['label', 'message']
df.head()

In [None]:
# Summary statistics of the remaining columns, computed separately per label.
df.groupby(by='label').describe()

In [None]:
# Class balance: number of messages per label.
# The column is passed by keyword because recent seaborn releases no longer
# accept the plotted variable as the first positional argument.
sns.countplot(x='label', data=df)

In [None]:
# Encode the target as integers: ham -> 0, spam -> 1.
# The dtype guard keeps this cell idempotent: running the mapping a second time
# on the already-numeric column would find no matching keys and turn every
# label into NaN.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df

In [None]:
# Store the character length of each raw message in its own column.
df['message_len'] = df['message'].map(len)
df

In [None]:
import string
from nltk.corpus import stopwords

In [None]:
# Inspect the NLTK English stop-word list that the cleaning step filters against.
stopwords.words('english')

In [None]:
# Extra informal tokens filtered out in addition to the NLTK English stop words.
_EXTRA_STOP_WORDS = ('u', 'ur', 'im', 'dont', 'doin', 'ure')

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily built default stop-word set; filled on the first call that needs it so
# the NLTK corpus is read once instead of once per message.
_DEFAULT_STOP_WORDS = None


def text_preprocessing(message, stop_words=None):
    """Strip punctuation from ``message`` and drop stop words.

    Punctuation characters (``string.punctuation``) are removed first, then the
    text is split on whitespace and every word whose lowercase form is a stop
    word is discarded. The surviving words keep their original casing and are
    re-joined with single spaces.

    Because punctuation is stripped before filtering, a stop-word entry that
    itself contains punctuation can never match.

    Parameters
    ----------
    message : str
        Raw text to clean.
    stop_words : container of str, optional
        Lowercase words to remove. When omitted, the NLTK English stop words
        plus a few informal extras ('u', 'ur', 'im', 'dont', 'doin', 'ure')
        are used.

    Returns
    -------
    str
        The cleaned text; empty if nothing survives.
    """
    global _DEFAULT_STOP_WORDS

    if stop_words is None:
        if _DEFAULT_STOP_WORDS is None:
            # A frozenset gives constant-time membership tests, versus scanning
            # a list for every word.
            _DEFAULT_STOP_WORDS = frozenset(stopwords.words('english')) | frozenset(_EXTRA_STOP_WORDS)
        stop_words = _DEFAULT_STOP_WORDS

    without_punctuation = message.translate(_PUNCTUATION_TABLE)

    return ' '.join(
        word for word in without_punctuation.split()
        if word.lower() not in stop_words
    )

In [None]:
# Run the cleaning function over every raw message and keep the result as a new column.
df['message_clean_new'] = df['message'].map(text_preprocessing)

In [None]:
# Display the frame to check the newly added 'message_clean_new' column.
df

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [None]:
# Model input is the cleaned text; the target is the label column.
X = df['message_clean_new']
y = df['label']

# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Two-step pipeline: TF-IDF vectorisation ('prep') followed by a
# logistic-regression classifier ('algo').
pipeline = Pipeline(
    steps=[
        ('prep', TfidfVectorizer()),
        (
            'algo',
            LogisticRegression(
                solver='lbfgs',
                n_jobs=-1,
                random_state=42,
            ),
        ),
    ]
)

In [None]:
# Hyper-parameter search space. Each key is '<pipeline step>__<parameter>', so
# these entries target the 'algo' step; every value is the list of candidates.
parameter = dict(
    algo__C=[3.9],
    algo__fit_intercept=[True],
)

In [None]:
# Randomized hyper-parameter search with 3-fold cross-validation.
# n_iter is capped at the number of distinct candidates so the search never asks
# for more draws than the space contains (which only produces a warning), and
# random_state makes the sampled candidates reproducible.
# NOTE: the candidate count assumes every entry in `parameter` is a finite list;
# revisit this if a continuous distribution is added to the search space.
n_candidates = int(np.prod([len(values) for values in parameter.values()]))
model = RandomizedSearchCV(
    pipeline,
    parameter,
    cv=3,
    n_iter=min(50, n_candidates),
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Report the fitted search's score on the training split, then on the held-out
# test split. Two separate statements are used so the cell does not end in a
# tuple expression, which the notebook would echo as `(None, None)`.
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))