In [1]:
#Importing the dependencies
import numpy as np
import pandas as pd

In [2]:
#Loading the spam-ham dataset
spamfile = "spam.csv"
spamdata = pd.read_csv(spamfile, encoding="ISO-8859-1")

In [3]:
#Displaying the first five values
spamdata.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
#Keeping only the necessary columns
spamdata = spamdata[['v1','v2']]

In [5]:
spamdata.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
#Checking the various parameters in the dataset
spamdata.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [7]:
# REPLACING SPAM -> 1
#           HAM  -> 0

spamdata['v1'].replace('ham',0,inplace=True)
spamdata['v1'].replace('spam',1,inplace=True)

In [8]:
spamdata.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
spamdata.rename(columns={'v1':'Prediction','v2':'Message'},inplace = True)

In [10]:
spamdata.head()

Unnamed: 0,Prediction,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
#Switching the columns for better chronology
spamdata['Prediction'],spamdata['Message'] = spamdata['Message'],spamdata['Prediction']

In [12]:
spamdata.rename(columns={'Prediction':'Message','Message':'Prediction'},inplace = True)

In [13]:
spamdata.head()

Unnamed: 0,Message,Prediction
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [14]:
#The number of rows and columns in the dataset
spamdata.shape

(5572, 2)

In [15]:
#Checking if any null values are present
spamdata.isnull().sum()

Message       0
Prediction    0
dtype: int64

In [16]:
#Checking the number of spam and ham messages
spamdata['Prediction'].value_counts()

0    4825
1     747
Name: Prediction, dtype: int64

In [17]:
#UNDERSAMPLING TO BALANCE OUT THE UNBALANCED DATA
ham = spamdata[spamdata.Prediction == 0]
spam = spamdata[spamdata.Prediction == 1]
ham.shape,spam.shape

((4825, 2), (747, 2))

In [18]:
#WE STORE 747 RANDOM HAM SAMPLES IN A VARIABLE ham_sample.
ham_sample = ham.sample(n = 747)

In [19]:
ham_sample.shape

(747, 2)

In [20]:
#We create a new dataset with the balanced number of values.
new_dataset = pd.concat([ham_sample,spam],axis = 0)

In [21]:
new_dataset['Prediction'].value_counts()

0    747
1    747
Name: Prediction, dtype: int64

In [22]:
new_dataset.head()

Unnamed: 0,Message,Prediction
3111,Just haven't decided where yet eh ?,0
4779,Sen told that he is going to join his uncle fi...,0
3145,\SHIT BABE.. THASA BIT MESSED UP.YEH,0
1452,"That's good, because I need drugs",0
991,Hi im having the most relaxing time ever! we h...,0


In [23]:
#ASSIGNING THE FEATURES TO VARIABLE X AND PREDICTION TO VARIABLE Y
X = new_dataset.drop(columns = 'Prediction',axis = 1)
Y = new_dataset['Prediction']

In [24]:
X = new_dataset['Message'].values
Y = new_dataset['Prediction'].values
print(X)
print(Y)

["Just haven't decided where yet eh ?"
 'Sen told that he is going to join his uncle finance in cbe'
 '\\SHIT BABE.. THASA BIT MESSED UP.YEH' ...
 'Had your contract mobile 11 Mnths? Latest Motorola, Nokia etc. all FREE! Double Mins & Text on Orange tariffs. TEXT YES for callback, no to remove from records.'
 'REMINDER FROM O2: To get 2.50 pounds free call credit and details of great offers pls reply 2 this text with your valid name, house no and postcode'
 'This is the 2nd time we have tried 2 contact u. U have won the å£750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.']
[0 0 0 ... 1 1 1]


In [25]:
#Using Tfidf Vectorization to encode the values
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)
print(X)

  (0, 4491)	0.4109865928581573
  (0, 4370)	0.3572986528424459
  (0, 2390)	0.23093217281347456
  (0, 2118)	0.4655097652851349
  (0, 1694)	0.4655097652851349
  (0, 1522)	0.4655097652851349
  (1, 4169)	0.35727931384566414
  (1, 4052)	0.3229376269256753
  (1, 4046)	0.09204553510719253
  (1, 3975)	0.1850574592997486
  (1, 3572)	0.35727931384566414
  (1, 2371)	0.26087001003907223
  (1, 2320)	0.14079684897951303
  (1, 2269)	0.15725861602870866
  (1, 2161)	0.28859594000568645
  (1, 2125)	0.2466169221787831
  (1, 2026)	0.2369975437140566
  (1, 1845)	0.3773679129053638
  (1, 1248)	0.3773679129053638
  (2, 4485)	0.4369676013953573
  (2, 4191)	0.2305918966667693
  (2, 3974)	0.4369676013953573
  (2, 3616)	0.3650973210992515
  (2, 2733)	0.4369676013953573
  (2, 1062)	0.38440053692376497
  :	:
  (1493, 4424)	0.15222115541737968
  (1493, 4327)	0.1429520932448488
  (1493, 4104)	0.21135786374210938
  (1493, 4029)	0.16998444758993647
  (1493, 4008)	0.1383586203937707
  (1493, 3980)	0.20618653235884432
  

In [26]:
#Importing the Logistic Regression model and other evaluation metrics.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [27]:
X_Train,X_Test,Y_Train,Y_Test = train_test_split(X,Y,test_size = 0.2,stratify=Y,random_state=2)

In [28]:
#THE DATA HAS BEEN COREECTLY SPLITTED INTO 80% TRAINING DATA AND 20% TESTING DATA.
print(X.shape,X_Train.shape,X_Test.shape)

(1494, 4531) (1195, 4531) (299, 4531)


In [29]:
#Loading the Logistic Regression model
model = LogisticRegression()

In [30]:
#Training the logistic regression model using training data
model.fit(X_Train,Y_Train)

In [31]:
#Accuracy on training data
X_train_prediction = model.predict(X_Train)
Training_accuracy_score = accuracy_score(X_train_prediction,Y_Train)

In [32]:
print("Accuracy on training data is:",Training_accuracy_score)

Accuracy on training data is: 0.9807531380753138


In [33]:
#Accuracy on testing data
X_test_prediction = model.predict(X_Test)
Testing_accuracy_score = accuracy_score(X_test_prediction,Y_Test)

In [34]:
print("Accuracy on testing data is:",Testing_accuracy_score)

Accuracy on testing data is: 0.9364548494983278
