In [1]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
import joblib

In [2]:
pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.7.0-cp310-cp310-win_amd64.whl.metadata (14 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.0-cp310-cp310-win_amd64.whl (10.7 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.7.0 threadpoolctl-3.6.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
df = pd.read_csv('Cleaned_Text_Dataset.csv')

In [3]:
df

Unnamed: 0,label,message
0,spam,"Free entry in contest, apply now Congratulatio..."
1,ham,"Hey, are we meeting tomorrow? I'll call you in..."
2,spam,Get rich quick with this scheme Click here to ...
3,ham,Are you free this weekend? I'll call you in 10...
4,spam,Win a free iPhone now Win a free iPhone now
5,ham,Lets have lunch together Meeting scheduled at ...
6,spam,Get rich quick with this scheme Win a free iPh...
7,ham,Meeting scheduled at 5 PM Lets have lunch toge...
8,spam,Click here to claim your prize Get rich quick ...
9,ham,"Hey, are we meeting tomorrow? Lets have lunch ..."


In [4]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [6]:
df['cleaned_message']=df['message'].apply(clean_text)

In [7]:
df

Unnamed: 0,label,message,cleaned_message
0,spam,"Free entry in contest, apply now Congratulatio...",free entry in contest apply now congratulation...
1,ham,"Hey, are we meeting tomorrow? I'll call you in...",hey are we meeting tomorrow ill call you in mi...
2,spam,Get rich quick with this scheme Click here to ...,get rich quick with this scheme click here to ...
3,ham,Are you free this weekend? I'll call you in 10...,are you free this weekend ill call you in minutes
4,spam,Win a free iPhone now Win a free iPhone now,win a free iphone now win a free iphone now
5,ham,Lets have lunch together Meeting scheduled at ...,lets have lunch together meeting scheduled at pm
6,spam,Get rich quick with this scheme Win a free iPh...,get rich quick with this scheme win a free iph...
7,ham,Meeting scheduled at 5 PM Lets have lunch toge...,meeting scheduled at pm lets have lunch together
8,spam,Click here to claim your prize Get rich quick ...,click here to claim your prize get rich quick ...
9,ham,"Hey, are we meeting tomorrow? Lets have lunch ...",hey are we meeting tomorrow lets have lunch to...


In [8]:
df['label_name'] = df['label'].map({'ham':0,'spam':1})

In [9]:
df

Unnamed: 0,label,message,cleaned_message,label_name
0,spam,"Free entry in contest, apply now Congratulatio...",free entry in contest apply now congratulation...,1
1,ham,"Hey, are we meeting tomorrow? I'll call you in...",hey are we meeting tomorrow ill call you in mi...,0
2,spam,Get rich quick with this scheme Click here to ...,get rich quick with this scheme click here to ...,1
3,ham,Are you free this weekend? I'll call you in 10...,are you free this weekend ill call you in minutes,0
4,spam,Win a free iPhone now Win a free iPhone now,win a free iphone now win a free iphone now,1
5,ham,Lets have lunch together Meeting scheduled at ...,lets have lunch together meeting scheduled at pm,0
6,spam,Get rich quick with this scheme Win a free iPh...,get rich quick with this scheme win a free iph...,1
7,ham,Meeting scheduled at 5 PM Lets have lunch toge...,meeting scheduled at pm lets have lunch together,0
8,spam,Click here to claim your prize Get rich quick ...,click here to claim your prize get rich quick ...,1
9,ham,"Hey, are we meeting tomorrow? Lets have lunch ...",hey are we meeting tomorrow lets have lunch to...,0


In [10]:
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(df['cleaned_message'])

In [12]:
x.toarray()

array([[0.34212304, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.34212304, 0.34212304, 0.34212304, 0.22622161,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.254447  , 0.        , 0.        , 0.34212304, 0.        ,
        0.        , 0.        , 0.254447  , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.34212304, 0.        , 0.        ,
        0.34212304],
       [0.        , 0.28945075, 0.        , 0.33084554, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.33084554, 0.33084554,
        0.28945075, 0.        , 0.        , 0.        , 0.        ,
        0.25734246, 0.33084554, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.     

In [13]:
y = df['label_name']

In [14]:
y

0    1
1    0
2    1
3    0
4    1
5    0
6    1
7    0
8    1
9    0
Name: label_name, dtype: int64

In [15]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.2,random_state=42)

In [16]:
model = LogisticRegression()
model.fit(x_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [18]:
y_pred = model.predict(x_test)

In [20]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [21]:
confusion_matrix(y_test,y_pred)

array([[1, 0],
       [0, 1]], dtype=int64)

In [22]:
joblib.dump(model,'logistics_spam.pkl')

['logistics_spam.pkl']

In [23]:
joblib.dump(vectorizer,"tfidf_spam.pkl")

['tfidf_spam.pkl']

In [24]:
load_model_classification = joblib.load('logistics_spam.pkl')

In [25]:
load_model_vec = joblib.load('tfidf_spam.pkl')

In [26]:
sample_text = ['congratulations you won a free trip','lets meet at 5 pm ist for discussion']

In [27]:
sample_cleand = [clean_text(i) for i in sample_text]

In [28]:
sample_cleand

['congratulations you won a free trip', 'lets meet at pm ist for discussion']

In [29]:
sample_vec = load_model_vec.transform(sample_cleand)

In [31]:
sample_vec.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.56255473, 0.        , 0.        , 0.3719774 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.56255473, 0.47822292, 0.        ,
        0.        ],
       [0.        , 0.        , 0.6013393 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.52610083, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.6013393 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.     

In [32]:
load_model_classification.predict(sample_vec)

array([1, 0], dtype=int64)