In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import spacy
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Seed passed to both train_test_split and LinearSVC further down.
random_state = 17

# spaCy pipeline used below for stop-word flags, lemmas, POS tags and entities.
nlp = spacy.load('en_core_web_sm')

# **Data Loading**

In [None]:
# Load the SMS spam CSV and normalise it to a numeric `label` column
# (ham -> 0, spam -> 1) and a `text` column.
DATA_PATH = '/kaggle/input/sms-spam-collection-dataset/spam.csv'
LABEL_CODES = {'ham': 0, 'spam': 1}

try:
    raw_df = pd.read_csv(DATA_PATH)
except UnicodeDecodeError:
    # NOTE(review): the published spam.csv is presumably not valid UTF-8 —
    # confirm. latin-1 can decode any byte sequence, so this fallback only
    # runs when the default decoding has already failed.
    raw_df = pd.read_csv(DATA_PATH, encoding='latin-1')

# Build a new frame instead of mutating in place so re-running the cell is safe.
df = (
    raw_df
    .rename(columns={'v1': 'label', 'v2': 'text'})
    .assign(label=lambda frame: frame['label'].map(LABEL_CODES))
)

# Series.map turns any value missing from LABEL_CODES into NaN; fail here with
# a clear message rather than later inside the classifier.
assert df['label'].notna().all(), 'label column contains values other than ham/spam'

# **X and Y creation**

In [None]:
# Target: the numeric label column.
y = df['label']

# Features: the message text, lower-cased before any spaCy processing.
X = df['text'].apply(str.lower)

# **Feature Extraction with spaCy**

In [None]:
# Stop words cleaning

# Parse every message with spaCy and keep only the tokens that are not stop
# words. nlp.pipe() feeds the texts through the pipeline as a batched stream,
# which spaCy documents as the efficient way to process many texts, instead of
# calling nlp() once per row inside Series.apply.
X_stop = pd.Series(
    [[token for token in doc if not token.is_stop] for doc in nlp.pipe(X)],
    index=X.index,  # keep the original row labels so later index lookups still align
    name=X.name,
)

In [None]:
# Sentence length

# Share of messages whose token list is left untouched by the cap below.
LENGTH_QUANTILE = 0.9

# Number of remaining tokens per message.
X_len = X_stop.apply(len)

# Cap every token list at the 90th-percentile length (truncated to an int).
max_length = int(X_len.quantile(LENGTH_QUANTILE))
X_stop = X_stop.apply(lambda tokens: tokens[:max_length])

In [None]:
# Lemmatization

# Replace each kept token by its lemma and glue the lemmas back into one
# space-separated string per message.
X_lemma = X_stop.apply(
    lambda tokens: ' '.join(str(token.lemma_) for token in tokens)
)

In [None]:
# Fine-grained Part of Speech extraction

# Re-parse the lemma strings and keep the fine-grained tag of every token as
# one space-separated string per message. nlp.pipe() batches the texts rather
# than invoking the pipeline once per row.
# NOTE(review): the tagger sees the lower-cased, stop-word-free lemma string,
# not the original message — confirm that this is intended.
X_tag = pd.Series(
    [' '.join(token.tag_ for token in doc) for doc in nlp.pipe(X_lemma)],
    index=X_lemma.index,  # preserve row labels for the train/test lookup
    name=X_lemma.name,
)

In [None]:
# Entity recognition

# Re-parse the lemma strings and keep the label of every recognised entity as
# one space-separated string per message (empty when nothing is recognised).
# nlp.pipe() batches the texts rather than invoking the pipeline once per row.
# NOTE(review): the entity recogniser sees the lower-cased, stop-word-free
# lemma string, not the original message — confirm that this is intended.
X_ents = pd.Series(
    [' '.join(ent.label_ for ent in doc.ents) for doc in nlp.pipe(X_lemma)],
    index=X_lemma.index,  # preserve row labels for the train/test lookup
    name=X_lemma.name,
)

# **Train & Test splitting**

In [None]:
# Hold out 25% of the lemma strings (and their labels) for evaluation; the
# fixed seed keeps the partition the same between runs.
# NOTE(review): the split is not stratified on y.
split_parts = train_test_split(
    X_lemma,
    y,
    random_state=random_state,
    test_size=0.25,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Pick the entity and POS-tag strings belonging to the same rows as the
# train / test text split, using the row labels kept by train_test_split.
train_rows, test_rows = X_train.index, X_test.index

X_train_ents = X_ents.loc[train_rows]
X_test_ents = X_ents.loc[test_rows]

X_train_tag = X_tag.loc[train_rows]
X_test_tag = X_tag.loc[test_rows]

# **Vectorizer**

# **TF-IDF on text**

In [None]:
# TF-IDF features on the lemma text: the vocabulary (at most 1000 terms) and
# the scaling ranges are learned on the training rows only and then applied
# unchanged to the test rows.
text_tfidf = TfidfVectorizer(max_features=1000)

# Densify each sparse matrix exactly once; MinMaxScaler is fed dense arrays.
train_tfidf_dense = text_tfidf.fit_transform(X_train).toarray()
test_tfidf_dense = text_tfidf.transform(X_test).toarray()

transformer_tfidf = MinMaxScaler()
train_tfidf_scaled = transformer_tfidf.fit_transform(train_tfidf_dense)
test_tfidf_scaled = transformer_tfidf.transform(test_tfidf_dense)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
if hasattr(text_tfidf, 'get_feature_names_out'):
    tfidf_terms = text_tfidf.get_feature_names_out()
else:
    tfidf_terms = text_tfidf.get_feature_names()
tfidf_columns = [term.upper() for term in tfidf_terms]

# Both frames keep the default 0..n-1 index so they can be joined positionally
# with the other feature frames.
X_train_tfidf = pd.DataFrame(train_tfidf_scaled, columns=tfidf_columns)
X_test_tfidf = pd.DataFrame(test_tfidf_scaled, columns=tfidf_columns)

# **Count Vectorizer on text**

In [None]:
# Raw term-count features on the lemma text: the vocabulary (at most 1000
# terms) and the scaling ranges are learned on the training rows only and then
# applied unchanged to the test rows.
text_vect = CountVectorizer(max_features=1000)

# Densify each sparse matrix exactly once; MinMaxScaler is fed dense arrays.
train_counts_dense = text_vect.fit_transform(X_train).toarray()
test_counts_dense = text_vect.transform(X_test).toarray()

transformer_text = MinMaxScaler()
train_counts_scaled = transformer_text.fit_transform(train_counts_dense)
test_counts_scaled = transformer_text.transform(test_counts_dense)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
if hasattr(text_vect, 'get_feature_names_out'):
    count_terms = text_vect.get_feature_names_out()
else:
    count_terms = text_vect.get_feature_names()
count_columns = [term.upper() for term in count_terms]

# Both frames keep the default 0..n-1 index.
X_train_counts = pd.DataFrame(train_counts_scaled, columns=count_columns)
X_test_counts = pd.DataFrame(test_counts_scaled, columns=count_columns)

# **Count Vectorizer on entities**

In [None]:
# Count features over the entity-label strings: the vocabulary and the scaling
# ranges are learned on the training rows only and then applied unchanged to
# the test rows.
ent_vect = CountVectorizer()

# Densify each sparse matrix exactly once; MinMaxScaler is fed dense arrays.
train_ents_dense = ent_vect.fit_transform(X_train_ents).toarray()
test_ents_dense = ent_vect.transform(X_test_ents).toarray()

transformer_ents = MinMaxScaler()
train_ents_scaled = transformer_ents.fit_transform(train_ents_dense)
test_ents_scaled = transformer_ents.transform(test_ents_dense)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
if hasattr(ent_vect, 'get_feature_names_out'):
    ent_terms = ent_vect.get_feature_names_out()
else:
    ent_terms = ent_vect.get_feature_names()
ent_columns = [term.upper() for term in ent_terms]

# Both frames keep the default 0..n-1 index.
X_train_ents_counts = pd.DataFrame(train_ents_scaled, columns=ent_columns)
X_test_ents_counts = pd.DataFrame(test_ents_scaled, columns=ent_columns)

# **Count Vectorizer on Fine-grained POS**

In [None]:
# Count features over the fine-grained POS-tag strings: the vocabulary and the
# scaling ranges are learned on the training rows only and then applied
# unchanged to the test rows.
#
# The tag strings are already whitespace-separated, so every whitespace-
# delimited chunk is taken as one token and case is left alone. The default
# token_pattern keeps only runs of two or more word characters, which silently
# discards punctuation tags such as "." "," ":" "$" and folds "PRP$" into "PRP".
tag_vect = CountVectorizer(token_pattern=r'\S+', lowercase=False)

# Densify each sparse matrix exactly once; MinMaxScaler is fed dense arrays.
train_tag_dense = tag_vect.fit_transform(X_train_tag).toarray()
test_tag_dense = tag_vect.transform(X_test_tag).toarray()

transformer_tag = MinMaxScaler()
train_tag_scaled = transformer_tag.fit_transform(train_tag_dense)
test_tag_scaled = transformer_tag.transform(test_tag_dense)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
if hasattr(tag_vect, 'get_feature_names_out'):
    tag_terms = tag_vect.get_feature_names_out()
else:
    tag_terms = tag_vect.get_feature_names()
tag_columns = [term.upper() for term in tag_terms]

# Both frames keep the default 0..n-1 index so they can be joined positionally
# with the TF-IDF frames.
X_train_tag_counts = pd.DataFrame(train_tag_scaled, columns=tag_columns)
X_test_tag_counts = pd.DataFrame(test_tag_scaled, columns=tag_columns)

# **Modeling**

In [None]:
# Put the TF-IDF columns and the POS-tag count columns side by side for each
# split. A tag column whose name clashes with a TF-IDF column gets the suffix
# "_tag".
X_train_joined, X_test_joined = (
    text_part.join(tag_part, rsuffix='_tag')
    for text_part, tag_part in (
        (X_train_tfidf, X_train_tag_counts),
        (X_test_tfidf, X_test_tag_counts),
    )
)

In [None]:
# Fit a linear SVM on the joined training features.
clf = LinearSVC(random_state=random_state)
clf.fit(X_train_joined, y_train)

# Predict the held-out rows.
predictions = clf.predict(X_test_joined)

# Score the predictions against the true test labels.
accuracy, recall, precision, f1 = (
    scorer(y_test, predictions)
    for scorer in (
        metrics.accuracy_score,
        metrics.recall_score,
        metrics.precision_score,
        metrics.f1_score,
    )
)

# Field widths are chosen so that all four numbers end in the same column.
print(
    f'Accuracy: {round(accuracy, 3):{8}} '
    f'        \nRecall: {round(recall, 3):{10}} '
    f'        \nPrecision: {round(precision, 3):{7}} '
    f'        \nF1: {round(f1, 3):{14}}'
)

In [None]:
# Confusion matrix of the test predictions as a labelled table.
# labels=[0, 1] (ham=0, spam=1, matching the label encoding used when the data
# was loaded) pins the matrix to 2x2 even if one class is absent from both
# y_test and predictions; otherwise the two-name row/column labels below would
# not fit. confusion_matrix puts true classes on rows and predicted classes on
# columns, and the axis names make that visible.
class_names = ['ham', 'spam']
pd.DataFrame(
    metrics.confusion_matrix(y_test, predictions, labels=[0, 1]),
    index=class_names,
    columns=class_names,
).rename_axis(index='actual', columns='predicted')