In [15]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sms-spam-collection-dataset/__results__.html
/kaggle/input/sms-spam-collection-dataset/__output__.json
/kaggle/input/sms-spam-collection-dataset/custom.css
/kaggle/input/sms-spam-collection-dataset/__results___files/__results___20_0.png
/kaggle/input/sms-spam-collection-dataset/__results___files/__results___21_0.png
/kaggle/input/sms-spam-collection-dataset/__results___files/__results___12_0.png
/kaggle/input/sms-spam-collection-dataset/__results___files/__results___26_0.png
/kaggle/input/d/uciml/sms-spam-collection-dataset/spam.csv


In [16]:
# Load the SMS spam collection; the file is decoded as ISO-8859-1.
df = pd.read_csv(
    "/kaggle/input/d/uciml/sms-spam-collection-dataset/spam.csv",
    encoding="ISO-8859-1",
)

In [108]:
# Preview the first rows of the frame.
# NOTE(review): the saved output lists columns that are only created by later
# cells (processed_text, document_vector, y), so this cell was last run out of
# order — re-run the notebook top to bottom to refresh it.
df.head()

Unnamed: 0,label,text,document_vector,y,processed_text
0,ham,"Go until jurong point, crazy.. Available only ...","[0.036490303, 0.040876117, -0.016113281, 0.136...",0,jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,"[-0.11105347, 0.078430176, 0.049194336, 0.0715...",0,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[-0.01581616, -0.08514913, -0.07563612, -0.021...",1,free entry wkly comp win fa cup final tkts tex...
3,ham,U dun say so early hor... U c already then say...,"[-0.055345323, 0.047302246, 0.11828274, 0.1034...",0,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...","[0.10498047, 0.062219795, 0.04943293, 0.140547...",0,nah think go usf live


In [107]:
# Replace the dataset's generic column headers with descriptive names.
df = df.rename(
    columns={
        "v1": "label",  # ham / spam tag
        "v2": "text",   # message body
    }
)

In [21]:
# Discard the three unnamed columns; only the label and text columns are kept.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])

In [22]:
# Count the missing values in each column.
df.isna().sum()

label    0
text     0
dtype: int64

There are no missing values: both `label` and `text` contain zero NaN entries.

In [104]:
# Load spaCy's small English pipeline. The resulting `nlp` object is what
# `preprocess` calls to tokenize each message and obtain its lemmas.
import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Lower-case ``text`` and reduce it to a string of space-separated lemmas.

    The lower-cased text is run through the module-level ``nlp`` pipeline.
    Tokens flagged as stop words, and tokens whose ``is_alpha`` flag is false,
    are discarded; the lemmas of the remaining tokens are joined with single
    spaces. Returns an empty string when no token survives the filter.
    """
    lemmas = []
    for token in nlp(text.lower()):
        if token.is_stop or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

In [105]:
# Clean every message with `preprocess` and keep the result next to the raw text.
df["processed_text"] = df["text"].map(preprocess)

In [23]:
!pip install gensim



In [83]:
import gensim.downloader as api
# Load the pretrained "word2vec-google-news-300" embeddings through gensim's
# downloader. `model` is the default word-vector lookup used by generate_vector.
# NOTE(review): presumably a large download on first use — confirm before
# re-running on a fresh kernel.
model = api.load("word2vec-google-news-300")

In [124]:
def generate_vector(text, model=model):
    """Return the average embedding of the words in ``text``.

    ``text`` is split on whitespace and every word present in ``model`` is
    looked up; the element-wise mean of those vectors is returned.  When none
    of the words is in ``model`` (including empty text) a zero vector of
    length ``model.vector_size`` is returned instead.
    """
    known = [model[token] for token in text.split() if token in model]
    if not known:
        # No known word to average — fall back to an all-zero document vector.
        return np.zeros(model.vector_size)
    return sum(known) / len(known)

In [125]:
# Embed every preprocessed message: each row stores the vector that
# generate_vector returns for its processed_text.
df["document_vector"] = df["processed_text"].apply(generate_vector)

In [126]:
# Inspect the first rows again to check the document_vector column.
df.head()

Unnamed: 0,label,text,document_vector,y,processed_text
0,ham,"Go until jurong point, crazy.. Available only ...","[-0.02151724, 0.03513747, 0.016329251, 0.22137...",0,jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,"[-0.08577728, 0.057678223, 0.04466756, 0.11112...",0,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[-0.01022288, -0.02841797, -0.08932292, 0.0584...",1,free entry wkly comp win fa cup final tkts tex...
3,ham,U dun say so early hor... U c already then say...,"[-0.08528646, 0.078938805, 0.11419678, 0.09057...",0,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...","[-0.044311523, 0.0456604, 0.039697267, 0.20253...",0,nah think go usf live


In [127]:
def one_hot_encoding(row):
    """Turn a row's ``label`` into a binary target.

    Returns 0 when ``row["label"]`` is exactly "ham" and 1 for any other value.
    Despite the name, this is a single 0/1 flag rather than a one-hot vector.
    """
    return 0 if row["label"] == "ham" else 1

In [128]:
# Build the numeric target column by encoding each row's label (0 = ham, 1 = otherwise).
df["y"] = df.apply(one_hot_encoding, axis="columns")

In [96]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [129]:
# Stack the per-message vectors into a 2-D feature matrix, one row per message.
X = np.vstack(df["document_vector"].tolist())
# Binary targets taken from the encoded label column.
y = df["y"].to_numpy()

In [130]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [131]:
# Fit a logistic-regression classifier, constructed with no arguments (library
# defaults), on the training split.
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)

In [132]:
# Predict the 0/1 labels (0 = ham, 1 = spam) for the held-out messages.
y_pred = logmodel.predict(X_test)

In [120]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

In [133]:
# Report every evaluation metric for the held-out predictions, one per line.
for caption, metric in (
    ("Accuracy score: ", accuracy_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 score: ", f1_score),
    ("Confusion matrix: ", confusion_matrix),
):
    print(caption, metric(y_test, y_pred))

Accuracy score:  0.9381165919282511
Precision:  0.8188976377952756
Recall:  0.6933333333333334
F1 score:  0.7509025270758124
Confusion matrix:  [[942  23]
 [ 46 104]]


In [134]:
def predict_message_class(model, w2v_model, message):
    """Classify one raw message and print whether it is spam.

    The message goes through the same steps as the training data: it is
    cleaned with ``preprocess``, embedded with ``generate_vector`` and then
    passed to the classifier.

    Args:
        model: Fitted classifier exposing ``predict``; a prediction of 0 is
            reported as not spam, anything else as spam.
        w2v_model: Word-vector lookup forwarded to ``generate_vector``.
        message: Raw message text.

    Returns:
        None. The verdict is printed.
    """
    processed_message = preprocess(message)
    # Embed the *processed* text. The training vectors were built from
    # processed_text, so embedding the raw message would feed the classifier
    # features it was never trained on.
    message_vector = generate_vector(processed_message, w2v_model)
    # predict() expects a 2-D (n_samples, n_features) array, so present the
    # single document vector as a batch of one row.
    result = model.predict(message_vector.reshape(1, -1))
    if result[0] == 0:
        print("The sms is not spam.")
    else:
        print("The message is spam")

In [None]:
# Try the trained classifier on a hand-written, spam-like message.
message = "URGENT! you have one 1 million dollars! Click on the below link to claim!"
predict_message_class(model=logmodel, w2v_model=model, message=message)