In [7]:
import numpy as np 
import pandas as pd 
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# import some data to play with
data = pd.read_csv('input_data/spam.csv', encoding='latin-1')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [11]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [13]:
data= data[['v2','v1']]
data.head()

Unnamed: 0,v2,v1
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [14]:
# Count the occurrences of ham and spam print them 
occ = data['v1'].value_counts() 
print(occ)

ham     4825
spam     747
Name: v1, dtype: int64


In [15]:
# Có sự chênh lệch giữa ham và spam
# cần resample dữ liệu

In [16]:
data.head()

Unnamed: 0,v2,v1
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [17]:
source= data['v2'] 
type(source)

pandas.core.series.Series

In [18]:
source.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object

In [19]:
target = data['v1']
type(target)

pandas.core.series.Series

In [20]:
target.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: v1, dtype: object

In [21]:
#0: ham, 1:spam
target = pd.get_dummies (target, drop_first=True) 
target.head()

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0


In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(source, target, test_size=0.2)

In [24]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer 
# Instantiate CountVectorizer
cv = CountVectorizer (stop_words='english')
cv

In [25]:
# Fit the vectorizer
cv.fit(source)

In [26]:
cv.vocabulary_

{'jurong': 4224,
 'point': 5741,
 'crazy': 2271,
 'available': 1271,
 'bugis': 1703,
 'great': 3534,
 'world': 8227,
 'la': 4349,
 'buffet': 1701,
 'cine': 1994,
 'got': 3494,
 'amore': 1051,
 'wat': 8026,
 'ok': 5343,
 'lar': 4385,
 'joking': 4192,
 'wif': 8134,
 'oni': 5369,
 'free': 3265,
 'entry': 2875,
 'wkly': 8185,
 'comp': 2110,
 'win': 8146,
 'fa': 3005,
 'cup': 2329,
 'final': 3121,
 'tkts': 7519,
 '21st': 411,
 '2005': 402,
 'text': 7388,
 '87121': 784,
 'receive': 6115,
 'question': 6010,
 'std': 7028,
 'txt': 7701,
 'rate': 6062,
 'apply': 1128,
 '08452810075over18': 77,
 'dun': 2738,
 'say': 6450,
 'early': 2757,
 'hor': 3815,
 'nah': 5092,
 'don': 2651,
 'think': 7443,
 'goes': 3458,
 'usf': 7837,
 'lives': 4535,
 'freemsg': 3272,
 'hey': 3732,
 'darling': 2386,
 'week': 8071,
 'word': 8218,
 'like': 4485,
 'fun': 3323,
 'tb': 7323,
 'xxx': 8292,
 'chgs': 1948,
 'send': 6536,
 '50': 607,
 'rcv': 6074,
 'brother': 1674,
 'speak': 6910,
 'treat': 7634,
 'aids': 985,
 'pate

In [28]:
# Apply the vectorizer
X_train_transformed = cv.transform(X_train) 
# Print the full array
#cv_array = CV_transformed. toarray()

In [29]:
# resample X_train, y_train
# resample
from imblearn.over_sampling import SMOTE 
method = SMOTE()

In [30]:
# Apply resampling to the training data only 
X_resampled, y_resampled = method.fit_resample (X_train_transformed, y_train)

In [32]:
# Count the occurrences of ham and spam and print them 
occ_no_ham = y_resampled[y_resampled ==0].size
print("Ham:", occ_no_ham)
occ_no_spam = y_resampled[y_resampled ==1].size 
print("Spam:", occ_no_spam)

Ham: 7730
Spam: 7730


In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, average_precision_score 
from sklearn.metrics import confusion_matrix, precision_recall_curve 
from sklearn.metrics import classification_report, roc_auc_score

In [37]:
# Continue fitting the model and obtain predictions
model = LogisticRegression()
model.fit(X_resampled, y_resampled)

  y = column_or_1d(y, warn=True)


In [43]:
# training score
model.score(X_resampled, y_resampled)

0.9817593790426908

In [44]:
# testing score
X_test_transformed=cv.transform(X_test)
model.score(X_test_transformed, y_test)

0.8869955156950673

In [45]:
# Get your performance metrics
yhat_test = model.predict(X_test_transformed)

In [47]:
conf_mat = confusion_matrix(y_true=y_test, y_pred=yhat_test) 
print('Confusion matrix: \n', conf_mat)

Confusion matrix: 
 [[845 115]
 [ 11 144]]


In [49]:
# Calculate average precision and the PR curve
average_precision = average_precision_score (y_test, yhat_test)
average_precision

0.5263930583473474

In [50]:
# Obtain precision and recall
precision, recall, _ = precision_recall_curve (y_test, yhat_test)
precision, recall, _

(array([0.13901345, 0.55598456, 1.        ]),
 array([1.        , 0.92903226, 0.        ]),
 array([0, 1], dtype=uint8))

In [51]:
# Obtain model probabilities
probs = model.predict_proba (X_test_transformed)
probs

array([[9.53462662e-01, 4.65373380e-02],
       [9.99434167e-01, 5.65832664e-04],
       [9.60634674e-01, 3.93653260e-02],
       ...,
       [1.09401258e-01, 8.90598742e-01],
       [8.37029762e-01, 1.62970238e-01],
       [9.99865228e-01, 1.34771576e-04]])

In [52]:
# Print ROC_AUC score using probabilities 
print (roc_auc_score (y_test, probs[:, 1] ) )

0.965766129032258


In [53]:
# Print classification report using predictions 
print(classification_report(y_test, yhat_test))

              precision    recall  f1-score   support

           0       0.99      0.88      0.93       960
           1       0.56      0.93      0.70       155

    accuracy                           0.89      1115
   macro avg       0.77      0.90      0.81      1115
weighted avg       0.93      0.89      0.90      1115



In [54]:
new_data = pd.Series(["Hi, I have received your email. I will send my assisgment on time",
                      "Valid 12 hours only."])

In [55]:
new_data_transformed = cv.transform(new_data)

In [56]:
new_data_transformed

<2x8404 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [57]:
yhat_new = model.predict(new_data_transformed)
yhat_new

array([0, 1], dtype=uint8)