<a href="https://colab.research.google.com/github/safakan/CCTB_repo_safak_atakan_celik/blob/main/DEA113/SpamClassificationExercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
import nltk

# NLTK data packages fetched for the preprocessing steps in this notebook.
resources = [
    "punkt_tab",   # tokenizer data (unzipped under tokenizers/punkt_tab)
    "stopwords",   # stop-word lists read via nltk.corpus.stopwords
    "wordnet",     # WordNet data for WordNetLemmatizer
]
# vader_lexicon would additionally be required for SentimentIntensityAnalyzer().
for resource in resources:
    nltk.download(resource)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## TRAINING DATA

In [3]:
# Hand-written training messages, grouped by class.
# Label convention in this notebook: 1 = non-spam, 0 = spam.

# Non-Spam (label = 1)
non_spam_texts = [
    "Excited to join the French newsletter to improve my language skills!",
    "Please enroll me! Can’t wait to start learning French grammar.",
    "As a cctb_student, need this newsletter for my coursework.",
    "Looking forward to weekly French vocabulary lessons!",
    "Your newsletter will help me prepare for my French exam. Thank you!",
    "cctb_student here – please add me to the mailing list.",
    "I’m eager to practice French writing with your tips.",
    "Interested in learning conversational French for my trip to Paris.",
    "This is exactly what I need to stay motivated while studying French!",
    "cctb_student registration: I’d love cultural tips alongside lessons.",
    "Thrilled to enhance my pronunciation with your expert guidance.",
    "Count me in! I want daily French practice emails.",
    "I’ve heard amazing things about your French learning resources.",
    "As a beginner, structured lessons via email would be perfect.",
    "cctb_student – I want to improve my reading comprehension!",
    "cctb_student – what's happening in this french newsletter my dear",
    "I want to subscribe to the french newsletter to learn",
    "I want to join because I want to learn",
    "I want to be in, I plan to read and get exposed to french content",
    "Let me in I just want to learn and continue",
    "cctb_student it might be good to read some content in french sometime",
    "cctb_student love this woho french newsletter I want to participate and learn",
    "I loved the concept of the french newsletter, I heard it at cctb",
    "I feel so happy to be part of this",
    "read french expose content exposure cctb student",
    "cctb student reads french 101 class newsletter",
]

# Spam (label = 0)
spam_texts = [
    "Free iPhone giveaway! Click here to claim instantly.",
    "This service is useless. Don’t bother signing up.",
    "Buy cheap watches online – best deals guaranteed!",
    "I hate French; why would anyone learn this?",
    "Sign me up.",
    "Earn $500 daily with this secret method!",
    "Your newsletter sucks. Stop spamming me.",
    "Random text to test the system. Ignore.",
    "Promote your business with our marketing services!",
    "I didn’t ask for this. Remove me now.",
    "Who even uses French anymore? Useless.",
    "Get discounted software licenses here!",
    "This is a scam; nobody should subscribe.",
    "Why is the registration so complicated?!",
    "I’m just here to post comments, not learn.",
    "cctb_student, I don't even know what is this",
    "cctb_student, let me in because I have my reasons",
    "!@#$%^&*()_+{}|:\"<>?",
    "~~~~~~~~~~",
    "//////??????//////",
    "(*&^%$#@!)",
    ".........,,,,,,,,,,",
    "###########@@@@@@@",
    "<<<<>>>>====?????",
    "I want to join",
    "I want to subscribe",
    "I want to be in",
    "Let me in",
]

# One {"text", "label"} record per message: non-spam first, then spam.
data = (
    [{"text": message, "label": 1} for message in non_spam_texts]
    + [{"text": message, "label": 0} for message in spam_texts]
)

df = pd.DataFrame(data)

## PREPROCESSING

In [4]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Shared NLTK helpers, built once and reused for every row.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def _without_stop_words(tokens):
    """Return the tokens whose lowercase form is not an English stop word."""
    return [token for token in tokens if token.lower() not in stop_words]


def _lemmatized(tokens):
    """Return the tokens with each one passed through the WordNet lemmatizer."""
    return [lemmatizer.lemmatize(token) for token in tokens]


# Each stage is kept as its own column so intermediate results can be inspected.
# Tokenization
df["tokens"] = df["text"].apply(word_tokenize)

# Filtering tokens: removing stop words (original casing of kept tokens is preserved)
df["filtered_tokens"] = df["tokens"].apply(_without_stop_words)

# Lemmatizing
df["lemmatized_tokens"] = df["filtered_tokens"].apply(_lemmatized)

# Preprocessed text: lemmatized tokens re-joined with single spaces
df["preprocessed_text"] = df["lemmatized_tokens"].apply(" ".join)


# Training data documents: a list of (preprocessed_text, label) tuples
training_columns = df[["preprocessed_text", "label"]]
documents = list(training_columns.itertuples(index=False, name=None))

## TRAINING MODELS

In [5]:
# Model Training 1: bag-of-words counts + Multinomial Naive Bayes


# Prepare features and labels
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
features = count_vectorizer.fit_transform([doc[0] for doc in documents])
labels = [doc[1] for doc in documents]


# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
NB_classifier = MultinomialNB()
NB_classifier.fit(features, labels)

def predict_with_naive_bayes_classifier(text):
    """Classify one raw message with the Naive Bayes model.

    The message goes through the same steps as the training text
    (tokenize, drop English stop words, lemmatize, re-join) before it is
    vectorized. Without this, inflected forms such as "wants" would not
    match the lemmatized vocabulary learned at fit time.

    Args:
        text: The raw message string.

    Returns:
        The one-element array returned by ``NB_classifier.predict``;
        1 means non-spam and 0 means spam.
    """
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text)
        if token.lower() not in stop_words
    ]
    X = count_vectorizer.transform([" ".join(kept_tokens)])
    return NB_classifier.predict(X)

In [6]:
# Model Training 2: TF-IDF weights + Support Vector Classifier


# Prepare features and labels
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
features = tfidf_vectorizer.fit_transform([doc[0] for doc in documents])
labels = [doc[1] for doc in documents]


# Support Vector Classifier
from sklearn.svm import SVC
SVC_classifier = SVC()
SVC_classifier.fit(features, labels)

def predict_with_support_vector_classifier(text):
    """Classify one raw message with the Support Vector Classifier.

    The message goes through the same steps as the training text
    (tokenize, drop English stop words, lemmatize, re-join) before it is
    vectorized, so inference features line up with the fitted vocabulary.

    Args:
        text: The raw message string.

    Returns:
        The one-element array returned by ``SVC_classifier.predict``;
        1 means non-spam and 0 means spam.
    """
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text)
        if token.lower() not in stop_words
    ]
    X = tfidf_vectorizer.transform([" ".join(kept_tokens)])
    # The model was fitted on the sparse TF-IDF matrix, so the sparse row is
    # passed as-is; converting it to a dense array first is unnecessary.
    return SVC_classifier.predict(X)

In [7]:
# Model Training 3: hashed token features + Random Forest


# Prepare features and labels
from sklearn.feature_extraction.text import HashingVectorizer
hashing_vectorizer = HashingVectorizer()
# HashingVectorizer is stateless (there is no vocabulary to learn), so a plain
# transform() is all that is needed here.
features = hashing_vectorizer.transform([doc[0] for doc in documents])
labels = [doc[1] for doc in documents]


# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# A fixed seed makes the fitted forest, and therefore its vote in the
# ensemble, reproducible across kernel restarts.
RF_classifier = RandomForestClassifier(random_state=42)
RF_classifier.fit(features, labels)

def predict_with_random_forest_classifier(text):
    """Classify one raw message with the Random Forest model.

    The message goes through the same steps as the training text
    (tokenize, drop English stop words, lemmatize, re-join) before hashing.
    This matters more here than for the vocabulary-based vectorizers: the
    hashing vectorizer has no vocabulary to discard unseen stop words, so
    unfiltered input would produce features the forest never saw in training.

    Args:
        text: The raw message string.

    Returns:
        The one-element array returned by ``RF_classifier.predict``;
        1 means non-spam and 0 means spam.
    """
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text)
        if token.lower() not in stop_words
    ]
    X = hashing_vectorizer.transform([" ".join(kept_tokens)])
    return RF_classifier.predict(X)

---

## Spam Classification

Example Predictions

In [8]:
# Message to classify. A shorter variant is kept below for quick experiments;
# previously it was assigned and then immediately overwritten (a dead store).
# example_text = "cctb_student wants to subscribe to the french newsletter"
example_text = "cctb_student wants to subscribe to the french newsletter because wants to learn and read"

classifiers = [
    predict_with_naive_bayes_classifier,
    predict_with_support_vector_classifier,
    predict_with_random_forest_classifier
]

# Each predictor returns a one-element array; keep the single label
# (1 = non-spam, 0 = spam) from each.
results = [classifier(example_text)[0] for classifier in classifiers]

# Majority vote: 1 only when strictly more than half of the classifiers voted 1.
# The threshold follows the number of classifiers instead of a hard-coded 2.
print(1 if 2 * sum(results) > len(results) else 0)

1


In [11]:
# Show the individual votes, one per entry of `classifiers` and in the same order.
results

[1, 1, 1]

In [12]:
# Spot-check the Naive Bayes model on its own with a message it was not trained on.
unseen_message = "Free iPhone come and GetAT Ait now"
new_example = count_vectorizer.transform([unseen_message])  # reuse the fitted vocabulary
prediction = NB_classifier.predict(new_example)
print(prediction)

[0]


In [9]:
# classifiers = [
#     predict_with_naive_bayes_classifier,
#     predict_with_support_vector_classifier,
#     predict_with_random_forest_classifier
# ]
# results = [classifier(example_text)[0] for classifier in classifiers]
# print(1) if sum(results) >= 2 else print(0)
# results


# # # features = count_vectorizer.fit_transform([doc[0] for doc in documents])
# # # labels = [doc[1] for doc in documents]

# # # documents
# # # [doc[0] for doc in documents]
# # # print(count_vectorizer.fit_transform([doc[0] for doc in documents]))
# # print(count_vectorizer.fit_transform(["This is a test, oh my lets go test, love in the air love", "oh lets go love"]))
# # print(count_vectorizer.fit_transform(["This is a test, oh my lets go test, love in the air love", "oh lets go love"]).toarray())
# # # features = count_vectorizer.fit_transform(["This is a test, oh my lets go test, love in the air love", "oh lets go love"])
# # # labels = [1, 0]
# # # NB_classifier2 = MultinomialNB()
# # # NB_classifier2.fit(features, labels)
# # # count_vectorizer.get_feature_names_out()


# # tfidf_vectorizer = TfidfVectorizer()
# # features = tfidf_vectorizer.fit_transform([doc[0] for doc in documents])
# # labels = [doc[1] for doc in documents]
# # print(tfidf_vectorizer.fit_transform(["This is a test, oh my lets go test, love in the air love", "oh lets go love"]).toarray())


# # X = tfidf_vectorizer.fit_transform([
# #     "This is a test, oh my lets go test, love in the air love",
# #     "oh lets go love",
# #     "did I just say love too many times to test tfidf?",
# #     "tfidf is wrong about the importance of the word love, but yea it checks with the method"
# # ])

# # tfidf_vectorizer2 = TfidfVectorizer(norm=None)
# tfidf_vectorizer3 = TfidfVectorizer()
# X = tfidf_vectorizer3.fit_transform([
#     "This is a test, friends",
#     "yes friends are lovely friends",
#     "same scores?"
# ],)

# # Get feature names (vocabulary)
# feature_names = tfidf_vectorizer3.get_feature_names_out()

# # Convert sparse matrix to dense array
# dense_array = X.toarray()

# df = pd.DataFrame(dense_array, columns=feature_names)

# # Print DataFrame
# print(df)

# #  tf-idf(t, d) = tf(t, d) * idf(t)
# ## checking the word "friends"
# ### 1 occurrence in the 1st doc | tf(t, d) == 1
# ### idf(t) = log [ n / df(t) ] + 1
# #### n == 3
# #### df(t) == 2 for friends
# ####  log10(3/2) + 1 = 0.17609125905 + 1 = 1.17609125905 (first attempt, assuming base 10)
# ##### tf-idf(t, d) = tf(t, d) * idf(t)
# ##### = 1 * 1.17609125905
# ########

# # doc: 0, word: friends
# ## tf-idf = tf * idf
# ### tf == 1
# ### idf = log((num of docs + 1) / (docs containing the word + 1)) + 1 | as if there's one more document that contains the word
# ### idf = log(4/3) + 1
# #### tfidf for "friends" in doc 0: 1.1249387366

# ##### it's not base 10... the log is base e (natural log).

# # TF IDF (default)
# ## 1) count matrix
# ## 2) tfidf transformer: tf * idf | + smoothing
# ## normalization | L2 by default

# # normalization:
# # "friends": 1.287682
# # "is": 1.693147
# # "test": 1.693147
# # "this": 1.693147
# ## 1 - euclidean norm: square root of the sum of squared TF-IDF values:
# ### sqrt(1.287682^2 + 1.693147^2 + 1.693147^2 + 1.693147^2) --- approx. 3.2028
# ## 2 - divide each tf-idf value by the euclidean norm
# ### 1.287682 / 3.2028 = 0.4020488322717622
# ##### normalization is done at the document level.



# cvectorizer0 = CountVectorizer()
# print(cvectorizer0.fit_transform(["Oh la la", "Oh my!"]).toarray())


# cvectorizer0.get_feature_names_out()


# print(cvectorizer0.fit_transform(["Oh la la", "Oh my!"]))

[[2 0 1]
 [0 1 1]]
