In [None]:
!pip install ktrain

In [None]:
import numpy as np 
import pandas as pd 
import re
import ktrain
from ktrain import text
from nltk.corpus import stopwords

In [None]:
# English stopwords from NLTK, held in a frozenset so that each per-word
# membership test is a hash lookup instead of a scan over the whole list.
STOPWORDS = frozenset(stopwords.words('english'))
# Fraction used when dividing the rows into train and test subsets.
TESTSIZE = 0.3

## Load data

In [None]:
# Directory of the Kaggle dataset and the CSV file inside it.
PATH = '/kaggle/input/spam-text-message-classification/'
TRAIN = 'SPAM text message 20170820 - Data.csv'

# Read the raw messages, report (rows, columns), then preview the first rows.
data = pd.read_csv("{}{}".format(PATH, TRAIN))
print(data.shape)
data.head()

## Preprocessing

In [None]:
def cleaner(sentence, stop_words=None):
    """Normalise one message for modelling.

    Steps, in order: lowercase, drop URLs, delete every character that is not
    an ASCII letter or a space, then remove stopwords.

    Args:
        sentence: the raw message; any value is accepted and passed through
            ``str()`` first.
        stop_words: optional container of words to drop. Defaults to the
            module-level ``STOPWORDS``.

    Returns:
        The remaining words joined by single spaces, with no leading or
        trailing whitespace. An empty string if nothing remains.
    """
    if stop_words is None:
        stop_words = STOPWORDS

    lowered = str(sentence).lower()
    # URLs have to be removed before punctuation is stripped: afterwards there
    # is no "://" or "." left to recognise them by. Only links that start with
    # a scheme or "www." are matched, so ordinary words followed by ".." or
    # glued together with a period are left alone.
    without_urls = re.sub(r'\b(?:(?:https?|ftp)://|www\.)\S+', ' ', lowered)
    letters_only = re.sub(r"[^a-zA-Z ]+", '', without_urls)
    # join() avoids rebuilding the string for every word and does not leave a
    # stray space in front of the first word.
    return " ".join(word for word in letters_only.split() if word not in stop_words)

def df_cleaner(df, column):
    """Return a cleaned copy of ``df`` with ``column`` passed through ``cleaner``.

    Missing values in every column are replaced by empty strings, then each
    entry of ``column`` is normalised with ``cleaner``. The input frame is
    left unmodified.

    Args:
        df: pandas DataFrame that contains ``column``.
        column: name of the text column to clean.

    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    # fillna returns a new frame, so the caller's DataFrame is not touched.
    cleaned = df.fillna("")
    cleaned[column] = cleaned[column].apply(cleaner)
    return cleaned

In [None]:
# Clean the "Message" column and rebind `data` to the frame that is returned.
data = df_cleaner(data, "Message")
# Preview the first rows after cleaning.
data.head()

In [None]:
# Expand Category into one indicator column per distinct label value.
category_dummies = data['Category'].astype('str').str.get_dummies()
# Append the indicator columns, then keep only the text and the two label columns.
data = pd.concat([data, category_dummies], axis=1, sort=False)[['Message', 'ham', 'spam']]
data.head()

In [None]:
%%time
n = data.shape[0]*TESTSIZE
train = data.loc[:n]
test  = data.loc[n:]

## Training

In [None]:
# Preprocess the training frame for BERT. val_pct=0.1 asks ktrain to set aside
# 10% of `train` for validation; that part comes back as (x_test, y_test).
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(
    train,
    'Message',                      # column holding the message text
    label_columns=['ham', 'spam'],  # indicator columns used as labels
    maxlen=75,
    max_features=100000,
    preprocess_mode='bert',
    val_pct=0.1,
)

In [None]:
# Build a BERT text classifier from the preprocessed training data.
model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)

# Wrap model and data in a ktrain learner. (x_test, y_test) is the validation
# split returned by texts_from_df, not the separate `test` frame.
learner = ktrain.get_learner(
    model,
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
    batch_size=32,
)

In [None]:
# Train with ktrain's one-cycle policy: 2e-5 is the learning rate, 2 the number of epochs.
learner.fit_onecycle(2e-5, 2)

## Make predictions

In [None]:
# Bundle the trained model with its preprocessor so raw text can be passed to predict().
p = ktrain.get_predictor(learner.model, preproc)

In [None]:
# Predict all held-out messages in a single call so ktrain can batch them,
# instead of invoking the model once per row. assign() builds a new frame,
# which avoids writing into a slice of `data` (SettingWithCopyWarning).
test = test.assign(predicted_value=p.predict(test['Message'].tolist()))

In [None]:
# Display the held-out rows together with their predicted labels.
test