In [2]:
import pandas as pd
import numpy as np

# Read the spam CSV into a DataFrame, decoding the file as latin1.
df = pd.read_csv(
    filepath_or_buffer=r"/content/drive/MyDrive/Arch_Technologies_Tasks/spam.csv",
    encoding="latin1",
)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0,0
v1,0
v2,0
Unnamed: 2,5522
Unnamed: 3,5560
Unnamed: 4,5566


In [4]:
# Drop the three trailing "Unnamed" columns, which are null for nearly every row.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the columns are gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.head(5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Replace the opaque CSV headers with descriptive names: v1 -> label, v2 -> sms.
df = df.rename(columns={'v1': 'label', 'v2': 'sms'})
df.head(n=5)

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Show the frame's dimensions as (number of rows, number of columns).
df.shape

(5572, 2)

In [7]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def clean_text(text):
  """Normalise one raw message into a space-separated string of keyword tokens.

  Steps, in order: lowercase the text; delete every run of non-whitespace
  characters that begins, at a word boundary, with ``http`` or ``www``; delete
  every character that is not an ASCII letter or whitespace; drop the words
  found in ``ENGLISH_STOP_WORDS``.

  Args:
    text: The raw message. Any value that is not a ``str`` (for example the
      float NaN pandas uses for a missing cell) is treated as an empty message.

  Returns:
    The surviving tokens joined by single spaces, or ``""`` if none survive.
  """
  if not isinstance(text, str):
    return ""
  text = text.lower()
  # Remove links first, while "://" and "." still hold the link together, and
  # anchor on a word boundary so ordinary words that merely contain "www"
  # (e.g. "awwww") are not truncated.
  text = re.sub(r"\b(?:http|www)\S+", '', text)
  # The text is already lowercase, so a-z covers every letter worth keeping.
  text = re.sub(r"[^a-z\s]", '', text)
  tokens = [word for word in text.split() if word not in ENGLISH_STOP_WORDS]
  return " ".join(tokens)

# Store the cleaned version of every message in a new 'mail_text' column.
df['mail_text'] = df['sms'].apply(clean_text)

In [8]:
from sklearn.model_selection import train_test_split

# Features are the cleaned messages; targets are the values of the 'label' column.
X, Y = df['mail_text'], df['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=37,
)
# Class balance of the training targets.
Y_train.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,3860
spam,597


In [9]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels, then reuse the same mapping for the
# test labels. The right-hand side is evaluated left to right, so the encoder
# is fitted before the test labels are transformed.
le = LabelEncoder()
Y_train, Y_test = le.fit_transform(Y_train), le.transform(Y_test)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the training messages only ...
tfid = TfidfVectorizer().fit(X_train)
# ... then project both splits with that single fitted vectorizer.
X_train_vectorize = tfid.transform(X_train)
X_test_vectorize = tfid.transform(X_test)

In [11]:
from imblearn.over_sampling import SMOTE

# Oversample the training set only, so the test set keeps its original class
# balance. A fixed random_state (the same seed as the train/test split) makes
# the synthetic samples, and every metric computed downstream, reproducible.
smote = SMOTE(random_state=37)
X_train_resampled, Y_train_resampled = smote.fit_resample(X_train_vectorize, Y_train)

In [12]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the resampled training data.
NB = MultinomialNB()
NB.fit(X_train_resampled, Y_train_resampled)
model = NB  # fit() returns the estimator itself, so both names refer to one object

# Predict the encoded labels of the held-out messages.
Y_pred = model.predict(X_test_vectorize)

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Overall accuracy on the held-out set.
print("Accuracy:", accuracy_score(Y_test, Y_pred))
# Confusion matrix followed by the per-class report, one per line.
print(
    confusion_matrix(Y_test, Y_pred),
    classification_report(Y_test, Y_pred),
    sep="\n",
)

Accuracy: 0.9650224215246637
[[937  28]
 [ 11 139]]
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       965
           1       0.83      0.93      0.88       150

    accuracy                           0.97      1115
   macro avg       0.91      0.95      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [14]:
def nlp_transform(X):
    """Apply ``clean_text`` to every item of ``X``.

    Args:
        X: An iterable of raw messages.

    Returns:
        A new list holding the cleaned messages in the same order.
    """
    return list(map(clean_text, X))

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score

# Chain text cleaning, TF-IDF vectorisation and Naive Bayes into one estimator
# so that raw strings can be passed straight to predict().
pipeline = Pipeline(
  steps=[
    ("NLP_pipeline", FunctionTransformer(nlp_transform)),
    ("Vectorizer", TfidfVectorizer(stop_words='english')),
    ("model", MultinomialNB()),
  ]
)

pipeline.fit(X_train, Y_train)
pipe = pipeline  # fit() returns the pipeline itself, so both names refer to one object

# Score the fitted pipeline on the held-out messages.
Y_predict = pipe.predict(X_test)
accuracy = accuracy_score(Y_test, Y_predict)
print("Accuracy:", accuracy)

Accuracy: 0.9659192825112107


In [18]:
def spam_check(text):
  """Run the fitted ``pipe`` pipeline on a single message.

  Args:
    text: The message to classify.

  Returns:
    Whatever ``pipe.predict`` returns for a one-element NumPy array that
    contains ``text``.
  """
  # predict() is given a batch, so wrap the single message in a 1-element array.
  return pipe.predict(np.array([text]))

# The pipeline was fitted on LabelEncoder codes, which follow sorted label
# order: 0 -> 'ham', 1 -> 'spam'.
HAM_CODE = 0

t1 = 'you win iphone'
result = spam_check(t1)

# spam_check returns a one-element array; read that element explicitly rather
# than relying on the truth value of an array comparison.
if result[0] == HAM_CODE:
  print("Not spam")
else:
  print("spam")

spam
