In [1]:
import streamlit as st #for webapp
import pandas as pd
import numpy as np
import re
import nltk #natural language toolkit
import sklearn

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Make sure the NLTK stop-word corpus is available locally; clean_data() reads
# the English list from it when stripping stop words out of each message.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ekjot\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:

from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [5]:
# CSV file holding the labelled messages.
file = 'spam.csv'
# Sniff the text encoding from the first 100000 bytes so that pandas can be
# told which codec to use instead of failing with a decode error.
import chardet
with open(file, mode='rb') as binary_handle:
    result = chardet.detect(
        binary_handle.read(100000)
    )
result

{'encoding': 'Windows-1252', 'confidence': 0.7270322499829184, 'language': ''}

In [6]:
#NLP Model
# Load the raw dataset using the encoding chardet reported for this file, then
# preview the first rows to see which columns came in.
df = pd.read_csv(file,encoding='Windows-1252')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [7]:
# Discard the three trailing "Unnamed" columns; only v1 and v2 are kept.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [8]:
# Preview the frame to confirm that only the v1 and v2 columns remain.
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Give the two remaining columns descriptive names.
df = df.rename(columns={'v1': 'labels', 'v2': 'message'})

In [10]:
# Preview the frame to confirm the new 'labels' / 'message' column names.
df.head()

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Duplicate handling: record the (rows, columns) shape before duplicate rows
# are removed, so it can be compared with the shape afterwards.
df.shape

(5572, 2)

In [12]:
# Remove exact duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [13]:
# Shape after de-duplication; the drop in row count is the number of duplicates removed.
df.shape

(5169, 2)

In [14]:
# Encode the text labels numerically: ham -> 0, spam -> 1.
df = df.assign(labels=df['labels'].map({'ham': 0, 'spam': 1}))

In [15]:
# Preview the frame to confirm that labels are now 0 (ham) / 1 (spam).
df.head()

Unnamed: 0,labels,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
#remove stop words and punctuation marks
def clean_data(message, stop_words=None):
    """Strip punctuation characters and stop words from one message.

    Every character found in ``string.punctuation`` is deleted, the remaining
    text is split on whitespace, and any word whose lower-cased form is a stop
    word is dropped. Surviving words keep their original casing and are joined
    back together with single spaces.

    Parameters
    ----------
    message : str
        Raw message text.
    stop_words : collection of str, optional
        Lower-case words to remove. When omitted, NLTK's English stop-word
        list is used (loaded once and cached).

    Returns
    -------
    str
        The cleaned message; an empty string if no word survives.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    message_without_punc = ''.join(
        character for character in message if character not in string.punctuation
    )
    return ' '.join(
        word for word in message_without_punc.split()
        if word.lower() not in stop_words
    )


# Filled in on first use by _english_stop_words().
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Previously the corpus list was rebuilt and scanned linearly for every
        # single word of every message; a cached set makes each lookup O(1).
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS

In [17]:
# Run every message through clean_data so the model only sees the cleaned text.
df['message'] = df['message'].map(clean_data)

In [18]:
# Target vector: the numeric spam/ham labels.
y = df['labels']
# Feature matrix: convert every message into a vector of word counts
# (a basic step in any NLP model).
# cv = variable storing the object of class CountVectorizer; it is kept so that
# new messages can later be transformed with the same vocabulary.
cv = CountVectorizer()
x = cv.fit_transform(df['message'])
print(x)

  (0, 3764)	1
  (0, 4652)	1
  (0, 6381)	1
  (0, 2482)	1
  (0, 1401)	1
  (0, 1864)	1
  (0, 3861)	1
  (0, 9107)	1
  (0, 4809)	1
  (0, 1862)	1
  (0, 2198)	1
  (0, 3821)	1
  (0, 1168)	1
  (0, 8871)	1
  (1, 5946)	1
  (1, 4848)	1
  (1, 4620)	1
  (1, 9003)	1
  (1, 5978)	1
  (2, 3556)	1
  (2, 3141)	2
  (2, 9059)	1
  (2, 2314)	1
  (2, 9017)	1
  (2, 3278)	2
  :	:
  (5165, 4156)	1
  (5165, 3783)	1
  (5165, 3543)	1
  (5165, 3169)	1
  (5166, 5523)	1
  (5166, 6307)	1
  (5166, 7547)	1
  (5166, 7922)	1
  (5167, 3556)	1
  (5167, 4308)	1
  (5167, 4960)	1
  (5167, 8920)	1
  (5167, 5731)	1
  (5167, 8657)	1
  (5167, 7572)	1
  (5167, 3085)	1
  (5167, 3676)	1
  (5167, 1899)	1
  (5167, 4444)	1
  (5167, 3922)	1
  (5167, 988)	1
  (5167, 1649)	1
  (5168, 5643)	1
  (5168, 8468)	1
  (5168, 6996)	1


In [19]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [20]:
# Train a multinomial Naive Bayes classifier on the training split, then
# predict labels for the held-out test split.
model = MultinomialNB()
model.fit(x_train, y_train)
predictions = model.predict(x_test)

In [21]:
# Report accuracy, the confusion matrix and the per-class report, one after another.
print(
    accuracy_score(y_test, predictions),
    confusion_matrix(y_test, predictions),
    classification_report(y_test, predictions),
    sep='\n',
)

0.965183752417795
[[861  24]
 [ 12 137]]
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       885
           1       0.85      0.92      0.88       149

    accuracy                           0.97      1034
   macro avg       0.92      0.95      0.93      1034
weighted avg       0.97      0.97      0.97      1034



In [25]:
#web application:
#function:
def predict(text):
    """Classify a single message as spam or not spam.

    Uses the module-level fitted vectorizer ``cv`` and classifier ``model``.

    Parameters
    ----------
    text : str or sequence of str
        The message to classify, either as a bare string or wrapped in a
        one-element sequence (the form the web app passes).

    Returns
    -------
    str
        ``'This message is probably: Not Spam'`` or
        ``'This message is probably: Spam'``.

    Raises
    ------
    ValueError
        If ``text`` does not contain exactly one message.
    """
    labels=['Not Spam','Spam']

    # A bare string is treated as one message rather than being rejected by
    # the vectorizer, which expects an iterable of documents.
    messages = [text] if isinstance(text, str) else list(text)
    if len(messages) != 1:
        # The old code glued all predicted digits together ("01" -> 1,
        # "10" -> IndexError), so several messages never worked correctly.
        raise ValueError(
            'predict() expects exactly one message, got %d' % len(messages)
        )

    # Apply the same cleaning the training messages went through, so that the
    # tokens seen at prediction time match the vocabulary learned in training.
    cleaned = [clean_data(message) for message in messages]

    # The classifier was fitted on a sparse matrix, so the sparse output of
    # transform() can be passed straight in without densifying it.
    p = model.predict(cv.transform(cleaned))
    return 'This message is probably: ' + labels[int(p[0])]

[0]
This message is probably: Not Spam


In [25]:
#designing the web app
# Page layout: title, banner image, a text box for the message and a button
# that triggers classification.
st.title('Spam Classifier')
st.image('image.jpg')
user_input=st.text_input('Write your message here.')
submit=st.button('Predict')
if submit:
    if user_input.strip():
        answer=predict([user_input])
        st.text(answer)
    else:
        # Without this guard an empty box is still classified and reported as
        # a verdict, which is misleading because there is no message to judge.
        st.warning('Please enter a message to classify.')

  command:

    streamlit run c:\programdata\anaconda3\envs\streamlitapp\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
