## Importing dependencies

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Reading and pre-processing the dataset

In [2]:
# load the dataset from 'spam.csv' (path is relative to the working directory) and display it
raw_data = pd.read_csv('spam.csv')
raw_data

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ã_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# summary of the dataset: per column count, number of unique values,
# most frequent value and how often it occurs
raw_data.describe()

Unnamed: 0,Label,EmailText
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [4]:
# replace every missing value with an empty string
df = raw_data.where(raw_data.notna(), other='')

In [5]:
# same summary on the cleaned frame, for comparison with the raw summary
df.describe()

Unnamed: 0,Label,EmailText
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


### Encoding the dataset

#### Transforming Labels

In [6]:
# encoding labels: spam -> 0, ham -> 1
# Build the integer column with one vectorized map instead of writing ints into
# the string column piecemeal. Mapping from raw_data (the frame df was derived
# from) keeps this cell idempotent when it is re-run.
# Any label other than 'spam' / 'ham' becomes NaN.
label_codes = {'spam': 0, 'ham': 1}
df['Label'] = raw_data['Label'].map(label_codes)

In [7]:
# input (message text) and output (encoded label) columns

x, y = df['EmailText'], df['Label']
df

Unnamed: 0,Label,EmailText
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will Ã_ b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


The labels are now encoded as numbers: spam is 0 and ham is 1.

In [8]:
# cast the label column to an integer dtype

y = y.astype(int)
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Label, Length: 5572, dtype: int32

#### Feature Extraction / transforming text

In [9]:
# transforming text data to feature vectors so that they can be used as input to Logistic Regression

# `lowercase` must be the boolean True, not the string 'True': it only worked
# because a non-empty string is truthy, and scikit-learn releases that validate
# estimator parameters reject a str here.
features = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
# Vectorize straight from df['EmailText'] (the Series x was bound to) so that
# re-running this cell never feeds the already-transformed sparse matrix back in.
# NOTE(review): the vocabulary and IDF weights are fitted on the full dataset,
# before the train/test split further down, so test messages influence the
# features -- consider fitting on the training split only.
x = features.fit_transform(df['EmailText'])
x

<5572x8411 sparse matrix of type '<class 'numpy.float64'>'
	with 43338 stored elements in Compressed Sparse Row format>

In [10]:
# print the stored entries of the sparse matrix: (row, column) followed by the stored value
print(x)

  (0, 8044)	0.19609779550499865
  (0, 1052)	0.3509649021061901
  (0, 3502)	0.16470488207184114
  (0, 1999)	0.2964965675440533
  (0, 1704)	0.33503393550839805
  (0, 4358)	0.2964965675440533
  (0, 8247)	0.23740046706740073
  (0, 3542)	0.19387320529717864
  (0, 1706)	0.2964965675440533
  (0, 1272)	0.2625103008882829
  (0, 2276)	0.27179815735762314
  (0, 5753)	0.2745089285415426
  (0, 4233)	0.3509649021061901
  (1, 5381)	0.5465881710238072
  (1, 8154)	0.4316010362639011
  (1, 4201)	0.5236458071582338
  (1, 4394)	0.4082988561907181
  (1, 5355)	0.27211951321382544
  (2, 77)	0.23759715224911548
  (2, 1129)	0.1707825659976717
  (2, 6074)	0.1707825659976717
  (2, 7717)	0.12576907263059747
  (2, 7043)	0.1989696587085652
  (2, 6022)	0.1808417865094903
  (2, 6128)	0.16914304332607796
  :	:
  (5567, 461)	0.2333398621010977
  (5567, 5130)	0.2445888397614688
  (5567, 8222)	0.19074118816829963
  (5567, 2005)	0.185955090206136
  (5567, 5906)	0.19532744699307247
  (5567, 6074)	0.23098372602432177
  (556

#### Splitting the data into training and testing data

In [11]:
# hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [12]:
# shapes of the full label vector and of each split, in that order
for labels in (y, y_train, y_test):
    print(labels.shape)

(5572,)
(4457,)
(1115,)


In [13]:
# stored entries of the training feature matrix: (row, column) followed by the stored value
print(x_train)

  (0, 4837)	0.3816566913936048
  (0, 5065)	0.29674499431882045
  (0, 3823)	0.32776902226267624
  (0, 3680)	0.26248402292530365
  (0, 4483)	0.241970995011807
  (0, 3816)	0.21963704298028186
  (0, 856)	0.31960863550589313
  (0, 2406)	0.38215478435135547
  (0, 7404)	0.19966203055688628
  (0, 3542)	0.44170454536979026
  (1, 7224)	0.5571549395247277
  (1, 7197)	0.4830859573998644
  (1, 7709)	0.4337888090885529
  (1, 4577)	0.37752322051231246
  (1, 8349)	0.3542736487657533
  (2, 1074)	0.49161752871381814
  (2, 6531)	0.49161752871381814
  (2, 2520)	0.4153204748776681
  (2, 4410)	0.3977938354070683
  (2, 2391)	0.4311535440909176
  (3, 1829)	0.5883652732070248
  (3, 8360)	0.5328083183021187
  (3, 2100)	0.37129564256376313
  (3, 6879)	0.4817480119785171
  (4, 2861)	0.733035546358576
  :	:
  (4452, 2406)	0.3055743167628613
  (4453, 1148)	0.6788117394887004
  (4453, 8164)	0.58361036823015
  (4453, 2014)	0.44566081320505785
  (4454, 2368)	0.3781079704098718
  (4454, 6024)	0.4289060232258301
  (4454

In [14]:
# training labels produced by the split
y_train

3075    1
1787    1
1614    1
4304    1
3266    1
       ..
789     1
968     1
1667    1
3321    1
1688    1
Name: Label, Length: 4457, dtype: int32

## Initializing and training the model

In [15]:
# initialization: LogisticRegression constructed with no arguments (library defaults)
model = LogisticRegression()
model

In [16]:
# training: fit the classifier on the training features and labels
model.fit(x_train, y_train)
model

## Evaluating the model

In [17]:
# accuracy on the same rows the model was fitted on

pred_y = model.predict(x_train)
accuracy = accuracy_score(y_true=y_train, y_pred=pred_y)
print('Accuracy on traiining data is ', accuracy)

Accuracy on traiining data is  0.9663450751626654


In [18]:
# accuracy on the held-out rows

pred_y_test = model.predict(x_test)
accuracy_test = accuracy_score(y_true=y_test, y_pred=pred_y_test)
print('Accuracy on testing data is ', accuracy_test)

Accuracy on testing data is  0.9497757847533632


## Saving the model

In [19]:
# joblib is imported together with the other dependencies in the first cell.
# NOTE(review): only the classifier is persisted here; the fitted TfidfVectorizer
# (`features`) is not saved, so the pickle alone cannot score raw text in a new session.
joblib.dump(model, 'SpamEmail_Logistic_model.pkl') # to save the model

['SpamEmail_Logistic_model.pkl']

In [20]:
# to load the saved model back in a later session:
# re_model = joblib.load('SpamEmail_Logistic_model.pkl')

## Creating system for prediction

In [21]:
def check(inp):
    """Classify email text(s) with the saved model and print Ham/Spam for each.

    Parameters
    ----------
    inp : str or iterable of str
        A single email body, or a collection of email bodies. A bare string is
        wrapped in a list because the vectorizer expects an iterable of
        documents rather than one string.

    Notes
    -----
    Uses the already-fitted module-level ``features`` vectorizer and reloads
    the classifier from 'SpamEmail_Logistic_model.pkl' on every call. That
    pickle is the one written by this notebook; do not point this function at
    an untrusted file, since unpickling can execute arbitrary code.
    A prediction of 1 is reported as ham, anything else as spam.
    """
    emails = [inp] if isinstance(inp, str) else inp
    inp_features = features.transform(emails)
    classification = joblib.load('SpamEmail_Logistic_model.pkl')
    pred = classification.predict(inp_features)

    # Report every email that was passed in, not only the first one.
    for label in pred:
        if label == 1:
            print('Input Email is a Ham mail.')
        else:
            print('Input Email is a Spam mail.')

In [22]:
# try the classifier on one sample message
sample_email = "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"
check([sample_email])

Input Email is a Ham mail.
