In [98]:
import pandas as pd 
import numpy as np 
import spacy 
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


In [99]:
df = pd.read_csv('../Data/spam.csv',encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [100]:
df = df.iloc[:,0:2]

In [101]:
df.columns = ['label','text']
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [102]:
nlp = spacy.load('en_core_web_sm')
tqdm.pandas(desc='Processing with spaCy')
spacy_results = df['text'].progress_map(nlp)

Processing with spaCy: 100%|██████████| 5572/5572 [00:29<00:00, 186.72it/s]


In [103]:
# Encode with Sentence Transformers

sentence_bert = SentenceTransformer('paraphrase-distilroberta-base-v1')
# tqdm.pandas(desc='Applying sentence-bert')
# vectors = df['text'].progress_map(model.encode)


In [104]:
import swifter

%time vectors_swifter = df['text'].swifter.apply(sentence_bert.encode)

Pandas Apply:   0%|          | 0/5572 [00:00<?, ?it/s]

CPU times: user 12min 25s, sys: 6.08 s, total: 12min 31s
Wall time: 2min 36s


In [105]:
df['raw_spacy'] = spacy_results
df['raw_pos'] = df['raw_spacy'].swifter.apply(lambda x: ' '.join([t.pos_ for t in x]))

Pandas Apply:   0%|          | 0/5572 [00:00<?, ?it/s]

In [106]:
df['sentence-bert'] = vectors_swifter
df.head()

Unnamed: 0,label,text,raw_spacy,raw_pos,sentence-bert
0,ham,"Go until jurong point, crazy.. Available only ...","(Go, until, jurong, point, ,, crazy, .., Avail...",VERB ADP ADJ NOUN PUNCT ADJ PUNCT ADJ ADV ADP ...,"[0.076579936, -0.3930265, 0.27844715, 0.371942..."
1,ham,Ok lar... Joking wif u oni...,"(Ok, lar, ..., Joking, wif, u, oni, ...)",INTJ ADJ PUNCT NOUN VERB NOUN ADV PUNCT,"[0.022812596, 0.17678502, 0.12619068, -0.65074..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"(Free, entry, in, 2, a, wkly, comp, to, win, F...",ADJ NOUN ADP NUM DET ADJ NOUN PART VERB PROPN ...,"[0.15409197, 0.06857502, -0.13811308, -0.40663..."
3,ham,U dun say so early hor... U c already then say...,"(U, dun, say, so, early, hor, ..., U, c, alrea...",NOUN NOUN VERB ADV ADJ NOUN PUNCT NOUN AUX ADV...,"[0.09308915, -0.12710004, -0.033977684, -0.630..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","(Nah, I, do, n't, think, he, goes, to, usf, ,,...",PROPN PRON AUX PART VERB PRON VERB ADP NOUN PU...,"[-0.036661543, 0.19233567, -0.27760535, 0.3999..."


In [107]:
df['label'] = df.label.swifter.apply(lambda x : 1 if x =='spam' else 0)
df.head()

Pandas Apply:   0%|          | 0/5572 [00:00<?, ?it/s]

Unnamed: 0,label,text,raw_spacy,raw_pos,sentence-bert
0,0,"Go until jurong point, crazy.. Available only ...","(Go, until, jurong, point, ,, crazy, .., Avail...",VERB ADP ADJ NOUN PUNCT ADJ PUNCT ADJ ADV ADP ...,"[0.076579936, -0.3930265, 0.27844715, 0.371942..."
1,0,Ok lar... Joking wif u oni...,"(Ok, lar, ..., Joking, wif, u, oni, ...)",INTJ ADJ PUNCT NOUN VERB NOUN ADV PUNCT,"[0.022812596, 0.17678502, 0.12619068, -0.65074..."
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"(Free, entry, in, 2, a, wkly, comp, to, win, F...",ADJ NOUN ADP NUM DET ADJ NOUN PART VERB PROPN ...,"[0.15409197, 0.06857502, -0.13811308, -0.40663..."
3,0,U dun say so early hor... U c already then say...,"(U, dun, say, so, early, hor, ..., U, c, alrea...",NOUN NOUN VERB ADV ADJ NOUN PUNCT NOUN AUX ADV...,"[0.09308915, -0.12710004, -0.033977684, -0.630..."
4,0,"Nah I don't think he goes to usf, he lives aro...","(Nah, I, do, n't, think, he, goes, to, usf, ,,...",PROPN PRON AUX PART VERB PRON VERB ADP NOUN PU...,"[-0.036661543, 0.19233567, -0.27760535, 0.3999..."


In [108]:
df.drop(columns=['raw_spacy'],inplace=True)

In [171]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier



In [289]:
train_df = df.copy()

In [290]:
def stack_embeddings(embeddings):
    import numpy as np
    return np.vstack(embeddings.values)

ct = ColumnTransformer([
    ('bag of ngrams', TfidfVectorizer(ngram_range=(1, 2), max_features=3000), 'text'),
    ('bag of POS', CountVectorizer(ngram_range=(1, 2)), 'raw_pos'),
    # Lambda functions cannot be pickled
    ('sentence bert', FunctionTransformer(stack_embeddings), 'sentence-bert'),
    # ('bag of NER types', CountVectorizer(ngram_range=(1, 2)), 'raw_ner'),
    # ('ngrams before', TfidfVectorizer(ngram_range=(1, 2), max_features=3000), 'raw_before'),
    # ('ngrams after', TfidfVectorizer(ngram_range=(1, 2), max_features=3000), 'raw_after')    
],remainder='passthrough')

# lm = LogisticRegression()
xgb = XGBClassifier(random_state=0)

# pipeline = Pipeline([('transformer', ct), ('classifier', lm)])
pipeline = Pipeline([('transformer', ct), ('classifier', xgb)])


y,X = train_df.pop('label'),train_df

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42,stratify=y)

In [291]:


%time model = pipeline.fit(X_train, y_train)

CPU times: user 53.3 s, sys: 354 ms, total: 53.7 s
Wall time: 5.52 s


In [292]:
y_pred = model.predict(X_test)

In [293]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1448
           1       0.99      0.91      0.95       224

    accuracy                           0.99      1672
   macro avg       0.99      0.95      0.97      1672
weighted avg       0.99      0.99      0.99      1672



In [294]:
# import joblib
# filename = 'model.sav'
# joblib.dump(model, filename)
import dill


pkl_filename = "../Models/model.pkl"
with open(pkl_filename, 'wb') as file:
    dill.dump(model, file)

In [295]:
ls ../Models

[0m[01;32mmodel.pkl[0m*


In [296]:
with open(pkl_filename,'rb') as file:
    loaded_model = dill.load(file)

In [297]:
def make_inference_df(input_text):

    model_input_dict = {}
    input_row_list = []
    

    spacy_raw = nlp(input_text)
    # pos_tags = [t.pos_ for t in spacy_raw]

    model_input_dict['text'] = input_text
    model_input_dict['raw_pos'] =  ' '.join([t.pos_ for t in spacy_raw])
    model_input_dict['sentence-bert'] = sentence_bert.encode(input_text)

    input_row_list.append(model_input_dict)

    model_input_df = pd.DataFrame(input_row_list)
    return model_input_df

In [298]:
sample_text = 'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C\'s'
# make_inference_df(sample_text)
loaded_model.predict(make_inference_df(sample_text))

array([1])

In [299]:
sample_text_2 = 'Nah I don\'t think he goes to usf, he lives around here though'
print(sample_text_2)
loaded_model.predict(make_inference_df(sample_text))

Nah I don't think he goes to usf, he lives around here though


array([1])