In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/spam.csv", encoding='ISO-8859-1')
df.columns = ['class', 'sms', 'unk1', 'unk2', 'unk3']
del df['unk1']
del df['unk2']
del df['unk3']
df.head()

Unnamed: 0,class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.drop_duplicates(inplace=True)
df.groupby('class').size()

class
ham     4516
spam     653
dtype: int64

In [4]:
653/(653+4516)

0.12633004449603405

In [5]:
df['label'] = df['class'].map({'ham': 0, 'spam': 1})

In [6]:
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(df['sms'], df['label'], test_size=0.2, random_state=42)

In [8]:
X_train.shape

(4135,)

In [9]:
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

In [12]:
pred = model.predict(X_test_tfidf)

In [13]:
from sklearn.metrics import accuracy_score, recall_score

In [14]:
accuracy_score(pred, y_test)

0.9642166344294004

In [15]:
recall_score(pred, y_test)

0.9821428571428571

In [23]:
import re
from nltk.corpus import stopwords

In [24]:
def preprocess_text(text):
    
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove multiple spaces
    text = text.lower()  # Convert to lowercase
    tokens = text.split()  # Tokenize by splitting
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return tokens

X_train_tokens = X_train.apply(preprocess_text)
X_test_tokens = X_test.apply(preprocess_text)

In [29]:
from gensim.models import FastText
from gensim.models import KeyedVectors

In [28]:
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
# !gunzip cc.en.300.vec.gz


--2024-10-22 23:22:55--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 2600:9000:263e:ae00:13:6e38:acc0:93a1, 2600:9000:263e:b600:13:6e38:acc0:93a1, 2600:9000:263e:8400:13:6e38:acc0:93a1, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|2600:9000:263e:ae00:13:6e38:acc0:93a1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘cc.en.300.vec.gz’


2024-10-22 23:27:09 (4.99 MB/s) - ‘cc.en.300.vec.gz’ saved [1325960915/1325960915]



In [32]:
fasttext_model = KeyedVectors.load_word2vec_format('cc.en.300.vec', binary=False)
# print(fasttext_model['hello']) 

In [33]:
def get_sentence_embedding(tokens):
    embeddings = [fasttext_model[word] for word in tokens if word in fasttext_model]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        # If no words are in the FastText vocabulary, return a zero vector
        return np.zeros(fasttext_model.vector_size)

In [35]:
import numpy as np

In [36]:
X_train_embeddings = np.vstack(X_train_tokens.apply(get_sentence_embedding).values)
X_test_embeddings = np.vstack(X_test_tokens.apply(get_sentence_embedding).values)

In [38]:
from xgboost import XGBClassifier

In [39]:
xgb_model = XGBClassifier(verbosity=0)
xgb_model.fit(X_train_embeddings, y_train)
y_pred = xgb_model.predict(X_test_embeddings)