In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


In [4]:
data = pd.read_csv('spam.csv', sep=',' ,encoding='latin-1')

In [5]:
data_copy = data

In [6]:
ham_only = data[data['v1'] == 'ham']
print("len of ham: " + str(len(ham_only)))
spam_only = data[data['v1'] == 'spam']
print("len of spam: " + str(len(spam_only)))
oversampling_factor = len(ham_only) / len(spam_only)
print(oversampling_factor)

len of ham: 4825
len of spam: 747
6.459170013386881


In [7]:
oversampled_spam = spam_only.sample(frac=1, replace=True,
random_state=42)
data_copy = pd.concat([data_copy, oversampled_spam])
print(data_copy['v1'].value_counts())

v1
ham     4825
spam    1494
Name: count, dtype: int64


In [8]:
num_spam = len(data_copy[data_copy['v1'] == 'spam'])
ham_only_sampled = data[data['v1'] =='ham'].sample(n=num_spam,random_state=42)

In [9]:
balanced_data = pd.concat([ham_only_sampled,
data_copy[data_copy['v1'] == 'spam']])
balanced_data = balanced_data.sample(frac=1, random_state=42)
print(balanced_data['v1'].value_counts())

v1
spam    1494
ham     1494
Name: count, dtype: int64


In [10]:
X = balanced_data['v2']
y = balanced_data['v1']

In [11]:
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

In [12]:
X_vectorized.shape

(2988, 5500)

In [13]:
x_train, x_test, y_train, y_test = train_test_split(X_vectorized, y,test_size=0.3, random_state=0)

In [14]:
model = LogisticRegression(C=1e20)
model.fit(x_train, y_train)
score1 = model.score(x_test, y_test)
score2 = model.score(x_train, y_train)
print("Score sur test-set=", score1 * 100, '%')
print("Score sur train-set =", score2 * 100,'%')

Score sur test-set= 98.21627647714605 %
Score sur train-set = 100.0 %


In [18]:
from sklearn.metrics import accuracy_score, classification_report
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


print("Accuracy:", accuracy)
print("Classification Report:",report)

Accuracy: 0.9821627647714605
Classification Report:               precision    recall  f1-score   support

         ham       0.98      0.98      0.98       459
        spam       0.98      0.98      0.98       438

    accuracy                           0.98       897
   macro avg       0.98      0.98      0.98       897
weighted avg       0.98      0.98      0.98       897



In [19]:
new_data = [" Cher client, voici votre facture mensuelle pour le service internet.",
            "Bonjour, j'espère que vous passez une excellente journée ! N'oubliez pas notre réunion prévue pour demain à 10h.",
            "REMINDER FROM O2: To get 2.50 pounds free call credit and details of great offers pls reply 2 this text with your valid name,house no and postcode",
            "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
new_data_vectorized = vectorizer.transform(new_data)
predictions = model.predict(new_data_vectorized)
print("Prédictions:", predictions)

Prédictions: ['ham' 'ham' 'spam' 'spam']
