# SMS Spam Detector (Kaggle Dataset)

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [4]:
# Load the raw SMS dataset. With header=None the file's own header line is
# read as an ordinary data row (row 0) and the columns get integer labels.
# The file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    'spam.csv',
    header=None,
    encoding='ISO-8859-1',
)

In [5]:
# Peek at the raw frame: because the file was read with header=None, row 0
# holds the file's own header values rather than an SMS record.
df.head()

Unnamed: 0,0,1,2,3,4
0,v1,v2,,,
1,ham,"Go until jurong point, crazy.. Available only ...",,,
2,ham,Ok lar... Joking wif u oni...,,,
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
4,ham,U dun say so early hor... U c already then say...,,,


In [6]:
# Skip row 0 (the file's header line that was read in as data), keep only the
# first two columns (label and message text), and give them readable names.
# NOTE: this rebinds `df`, so re-running the cell drops one more row each time.
df = df.iloc[1:, :2].set_axis(['Category', 'Message'], axis=1)

In [7]:
# Preview the first 100 rows after the header row was dropped and the two
# remaining columns were renamed to Category / Message.
df.head(100)

Unnamed: 0,Category,Message
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
5,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
96,spam,Your free ringtone is waiting to be collected....
97,ham,Watching telugu movie..wat abt u?
98,ham,i see. When we finish we have loads of loans t...
99,ham,Hi. Wk been ok - on hols now! Yes on for a bit...


In [8]:
# Build the working copy: every missing value is replaced with an empty
# string so that downstream text processing never sees NaN.
data = df.mask(df.isna(), '')
data.head()

Unnamed: 0,Category,Message
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
5,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Check column dtypes and non-null counts of the cleaned frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 1 to 5572
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [10]:
# (rows, columns) of the cleaned frame.
data.shape

(5572, 2)

In [11]:
# Encode the text labels in place: spam -> 0, ham -> 1.
# Any other label value is left untouched.
for label, code in (('spam', 0), ('ham', 1)):
    data.loc[data['Category'] == label, 'Category'] = code

In [12]:
# X: message text (model input); Y: encoded label (0 = spam, 1 = ham).
X, Y = data['Message'], data['Category']

In [13]:
# Show the message texts that will be used as model input.
print(X)

1       Go until jurong point, crazy.. Available only ...
2                           Ok lar... Joking wif u oni...
3       Free entry in 2 a wkly comp to win FA Cup fina...
4       U dun say so early hor... U c already then say...
5       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5568    This is the 2nd time we have tried 2 contact u...
5569                Will Ì_ b going to esplanade fr home?
5570    Pity, * was in mood for that. So...any other s...
5571    The guy did some bitching but I acted like i'd...
5572                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [14]:
# Show the encoded labels: spam -> 0, ham -> 1.
print(Y)  # the column is still object dtype at this point

1       1
2       1
3       0
4       1
5       1
       ..
5568    0
5569    1
5570    1
5571    1
5572    1
Name: Category, Length: 5572, dtype: object


In [15]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [16]:
# Sizes in order: full dataset, training split (80%), test split (20%).
for messages in (X, X_train, X_test):
    print(messages.shape)

(5572,)
(4457,)
(1115,)


In [82]:
# The encoded labels are still object dtype; cast them to int so the
# classifier receives a numeric class target.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

# Turn the raw text into TF-IDF vectors (lower-cased, English stop words
# removed). The vocabulary is fitted on the training messages only and then
# re-used, unchanged, to transform the test messages.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [84]:
# Inspect the training messages; the index labels show the original row
# positions picked by the split.
print(X_train)

3076    Mum, hope you are having a great day. Hoping t...
1788                           Yes:)sura in sun tv.:)lol.
1615    Me sef dey laugh you. Meanwhile how's my darli...
4305                Yo come over carlos will be here soon
3267                    Ok then i come n pick u at engin?
                              ...                        
790                          Gud mrng dear hav a nice day
969             Are you willing to go for aptitude class.
1668    So now my dad is gonna call after he gets out ...
3322    Ok darlin i supose it was ok i just worry too ...
1689                     Nan sonathaya soladha. Why boss?
Name: Message, Length: 4457, dtype: object


In [86]:
# Inspect the TF-IDF matrix: each printed entry is (row, vocabulary column)
# followed by that term's weight in the message.
print(X_train_features)

  (0, 741)	0.3219352588930141
  (0, 3979)	0.2410582143632299
  (0, 4296)	0.3891385935794867
  (0, 6599)	0.20296878731699391
  (0, 3386)	0.3219352588930141
  (0, 2122)	0.38613577623520473
  (0, 3136)	0.440116181574609
  (0, 3262)	0.25877035357606315
  (0, 3380)	0.21807195185332803
  (0, 4513)	0.2909649098524696
  (1, 4061)	0.380431198316959
  (1, 6872)	0.4306015894277422
  (1, 6417)	0.4769136859540388
  (1, 6442)	0.5652509076654626
  (1, 7443)	0.35056971070320353
  (2, 933)	0.4917598465723273
  (2, 2109)	0.42972812260098503
  (2, 3917)	0.40088501350982736
  (2, 2226)	0.413484525934624
  (2, 5825)	0.4917598465723273
  (3, 6140)	0.4903863168693604
  (3, 1599)	0.5927091854194291
  (3, 1842)	0.3708680641487708
  (3, 7453)	0.5202633571003087
  (4, 2531)	0.7419319091456392
  :	:
  (4452, 2122)	0.31002103760284144
  (4453, 999)	0.6760129013031282
  (4453, 7273)	0.5787739591782677
  (4453, 1762)	0.45610005640082985
  (4454, 3029)	0.42618909997886
  (4454, 2086)	0.3809693742808703
  (4454, 3088)

In [28]:
# Logistic regression classifier, constructed with no arguments (library defaults).
model = LogisticRegression()

In [88]:
# Train on the TF-IDF features of the training messages and their 0/1 labels.
model.fit(X_train_features, Y_train)

In [90]:
# Score the model on the same data it was fitted on; this is an optimistic
# baseline to compare against the held-out test accuracy.
prediction_trained_model = model.predict(X_train_features)
accuracy_trained_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_trained_model,
)

In [92]:
# Report the fraction of training messages classified correctly.
print('Accuracy on trained data: ', accuracy_trained_data)

Accuracy on trained data:  0.9661207089970832


In [94]:
# Score the model on the held-out test messages it never saw during fitting.
prediction_tested_model = model.predict(X_test_features)
accuracy_tested_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_tested_model,
)

In [98]:
# Report the fraction of held-out test messages classified correctly.
print('Accuracy on test data', accuracy_tested_data)

Accuracy on test data 0.9623318385650225


In [110]:
# Classify a single hand-written SMS with the trained pipeline.
# The vectorizer expects an iterable of documents, hence the one-element list.
input_sms=["This is the 2nd time we have tried to contact u . U have won A£400 prize. 2 claim is easy, just call 071243646377."]

# Convert the text to TF-IDF features using the vocabulary that was fitted on
# the training data (transform only -- never re-fit on new input).
input_data_features = feature_extraction.transform(input_sms)
prediction = model.predict(input_data_features)

# Labels were encoded as spam -> 0, ham -> 1.
if prediction[0] == 1:
    print('Not spam')
else:
    # Message matches the recorded cell output; the previous text referred to
    # "mail" (this is an SMS classifier) and carried a stray leading space.
    print('spam')

spam
