In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
import sklearn
print(dir(sklearn))

['__SKLEARN_SETUP__', '__all__', '__builtins__', '__cached__', '__check_build', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_config', '_distributor_init', '_loss', 'base', 'clone', 'config_context', 'datasets', 'exceptions', 'externals', 'feature_extraction', 'get_config', 'linear_model', 'logger', 'logging', 'metrics', 'model_selection', 'naive_bayes', 'os', 'pipeline', 'preprocessing', 'random', 'set_config', 'setup_module', 'show_versions', 'svm', 'sys', 'utils']


**Data  Gathering**

In [43]:
# Try reading the CSV file with a different encoding

# Try reading the CSV file with a different encoding
df = pd.read_csv("spam.csv", encoding='latin-1')

# Drop columns with "Unnamed" in the column name
df = df.loc[:, ~df.columns.str.contains('Unnamed')]
df.rename(columns={'v1': 'Label', 'v2': 'Msg'}, inplace=True)
df.head()

Unnamed: 0,Label,Msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**Data Analysis**

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   5572 non-null   object
 1   Msg     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [18]:
df.isna().sum()

Label    0
Msg      0
dtype: int64

In [19]:
df['Label'].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

**DATA PROCESSING**

In [24]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [47]:
corpus = []
lm = WordNetLemmatizer()
for i in range (len(df)):
  review = re.sub('^a-zA-Z0-9',' ',df['Msg'][i])
  review = review.lower()
  review = review.split()
  review = [data for data in review if data not in stopwords.words('english')]
  review = [lm.lemmatize(data) for data in review]
  review = ' '.join(review)
  corpus.append(review)


In [36]:
len(df['Msg'])

5572

In [50]:
df['Msg']=corpus
df.head()

Unnamed: 0,Label,Msg
0,ham,"go jurong point, crazy.. available bugis n gre..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor... u c already say...
4,ham,"nah think go usf, life around though"


**Model Building**

***----Data splitting-------***

In [51]:
x = df['Msg']
y = df['Label']

In [52]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=10)

In [53]:
len(x_train),len(y_train)

(3900, 3900)

In [54]:
len(x_test),len(y_test)

(1672, 1672)

**----Vectorization(Convert Text Data into the vectors)**

In [55]:
tf_obj = TfidfVectorizer()
x_train_tfidf = tf_obj.fit_transform(x_train).toarray()
x_train_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [57]:
x_train_tfidf.shape      #gives sentences and total words at output

(3900, 6983)

**----Pipeline**

In [58]:
text_mnb = Pipeline([('tfidf',TfidfVectorizer()),('mnb',MultinomialNB())])

In [60]:
text_mnb.fit(x_train,y_train)

In [61]:
# accuracy score on testing data
y_pred_test = text_mnb.predict(x_test)
print('Accuracy Score',accuracy_score(y_test,y_pred_test)*100)

Accuracy Score 96.71052631578947


In [63]:
# Accuracy score on training data
y_pred_train = text_mnb.predict(x_train)
print('Accuracy Score',accuracy_score(y_train,y_pred_train)*100)

Accuracy Score 98.0


In [65]:
# confusion matrix on testing data
y_pred_test = text_mnb.predict(x_test)
print('Confusion Matrix on test data:\n',confusion_matrix(y_test,y_pred_test))

Confusion Matrix on test data:
 [[1446    0]
 [  55  171]]


In [66]:
#classification report on testing data
y_pred_test = text_mnb.predict(x_test)
print('Classification Reportx on test data:\n',classification_report(y_test,y_pred_test))

Classification Reportx on test data:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1446
        spam       1.00      0.76      0.86       226

    accuracy                           0.97      1672
   macro avg       0.98      0.88      0.92      1672
weighted avg       0.97      0.97      0.97      1672



**Prediction on user_data**

In [67]:
def preprocess_data(text):
  review = re.sub('^a-zA-Z0-9',' ',text)
  review = review.lower()
  review = review.split()
  review = [data for data in review if data not in stopwords.words('english')]
  review = [lm.lemmatize(data) for data in review]
  review = ' '.join(review)
  return [review]

In [73]:
user_data = df['Msg'][0]     #tesing
user_data = preprocess_data(user_data)

In [75]:
text_mnb.predict(user_data)[0]

'ham'

In [91]:
class prediction:
  def __init__(self,data):
    self.data=data

  def user_data_preprocessing(self):
    lm= WordNetLemmatizer()
    review = re.sub('^a-zA-Z0-9',' ',self.data)
    review = review.lower()
    review = review.split()
    review = [data for data in review if data not in stopwords.words('english')]
    review = [lm.lemmatize(data) for data in review]
    review = ' '.join(review)
    return [review]

  def user_data_prediction(self):
    preprocess_data = self.user_data_preprocessing()

    if text_mnb.predict(preprocess_data)[0]=="spam":
      return "This message is spam"

    else:
      return "This message is ham"


# user_data ---> data_preprocessing --->vector(tfidf)---> MultinomialNB()

In [94]:
df.head()

Unnamed: 0,Label,Msg
0,ham,"go jurong point, crazy.. available bugis n gre..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor... u c already say...
4,ham,"nah think go usf, life around though"


In [92]:
# trial
user_data = df['Msg'][2]
print(user_data)
prediction(user_data).user_data_prediction()

free entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question(std txt rate)t&c's apply 08452810075over18's


'This message is spam'

In [93]:
# trial
user_data = df['Msg'][4]
print(user_data)
prediction(user_data).user_data_prediction()

nah think go usf, life around though


'This message is ham'