In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("message.csv")

In [3]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.tail()

Unnamed: 0,label,message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [5]:
df.shape

(5572, 2)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


### Data Visualize and preprocess

In [7]:
df.isnull().sum()

label      0
message    0
dtype: int64

In [8]:
df.label.value_counts(1)*100

ham     86.593683
spam    13.406317
Name: label, dtype: float64

In [9]:
a = df.loc[2]['message']
a

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [10]:
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
ps = PorterStemmer()
corpus=[]
for i in range(0,len(df)):
    message = re.sub('[^a-zA-Z]',' ',df['message'][i])
    message = message.lower()
    message = message.split()
    message = [ps.stem(word) for word in message if word not in set(stopwords.words('english'))]
    message = ' '.join(message)
    corpus.append(message)

In [11]:
len(corpus)

5572

In [12]:
df['message'].shape

(5572,)

### Train Test Split

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus,df['label'],test_size=0.25,random_state=42)

### Convert Word to Vector

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [15]:
X_train_vect.shape

(4179, 5447)

In [17]:
#dump tfidf
import pickle
pickle.dump(vectorizer, open('cv-transform.pkl', 'wb'))

### Fixing Imbalance

In [18]:
from imblearn.over_sampling import SMOTE
x_resample, y_resample = SMOTE().fit_sample(X_train_vect, y_train)
# lets print the shape of x and y after resampling it
print(x_resample.shape)
print(y_resample.shape)

(7236, 5447)
(7236,)


In [19]:
x_test_resample, y_test_resample = SMOTE().fit_sample(X_test_vect, y_test)

x_resample, y_resample,x_test_resample, y_test_resample 

### Model Building

In [20]:
from sklearn.metrics import accuracy_score, classification_report

In [21]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='lbfgs')
clf.fit(x_resample, y_resample)

LogisticRegression()

In [22]:
y_pred = clf.predict(x_test_resample)
accuracy_score(y_test_resample,y_pred)

0.9739022369511184

In [23]:
cr = classification_report(y_test_resample, y_pred)
print(cr)

              precision    recall  f1-score   support

         ham       0.96      0.99      0.97      1207
        spam       0.99      0.96      0.97      1207

    accuracy                           0.97      2414
   macro avg       0.97      0.97      0.97      2414
weighted avg       0.97      0.97      0.97      2414



In [24]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_resample, y_pred)

array([[1195,   12],
       [  51, 1156]], dtype=int64)

### Custom Prediction Eithout Preprocess

In [25]:
a = df.loc[0]['message']
a

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [26]:
strr = input("Enter a Message: ")
examples = [strr]
example_counts = vectorizer.transform(examples)
prediction =clf.predict(example_counts)
prediction[0]

Enter a Message: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


'ham'

### With Preprocess

In [27]:
strr = input("Enter a Message: ")
examples = strr

#preprocess
a = re.sub('[^a-zA-Z]',' ',examples)
a = a.lower()
a = a.split()
a = [ps.stem(word) for word in a if word not in set(stopwords.words('english'))]
a = ' '.join(a)  
print(a)
#apply
example_counts = vectorizer.transform([a])
prediction =clf.predict(example_counts)
prediction[0]

Enter a Message: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
go jurong point crazi avail bugi n great world la e buffet cine got amor wat


'ham'

### Model Export

In [28]:
#dumb file
import pickle
filename = 'spam_logistic.pkl'
pickle.dump(clf, open(filename, 'wb'))

In [29]:
model = open(filename,"rb")
model = pickle.load(model)

In [30]:
cv = pickle.load(open('cv-transform.pkl','rb'))

In [31]:
def spam(example):
    #preprocess
    a = re.sub('[^a-zA-Z]',' ',example)
    a = a.lower()
    a = a.split()
    a = [ps.stem(word) for word in a if word not in set(stopwords.words('english'))]
    a = ' '.join(a)  
    print(a)
    #apply
    model = open(filename,"rb")
    model = pickle.load(model)
    example_counts = cv.transform([a])
    prediction =model.predict(example_counts)
    return prediction[0]
data = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
p = spam(data)
print(p)

free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli
spam
