In [1]:
# OIBSIP
# TASK 4         : Email spam detection with machine learning
# AUTHOR         : Shreyas Ghodekar
# MODEL          : Logistic Regression
# TECHNIQUE USED : Classification

In [2]:
# Import Required
# 1. python modules
# 2. Spam Dataset

In [3]:
import pandas as pd
import numpy as np

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import  LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# Load the spam dataset from 'spam.csv' (path is relative to the notebook's working directory).
# The file is decoded as latin-1 instead of pandas' default UTF-8 —
# presumably because it contains bytes that are not valid UTF-8; TODO confirm.
df = pd.read_csv('spam.csv', encoding='latin-1')

In [6]:
# Performing exploratory data analysis (EDA)

In [7]:
# Show the raw frame as loaded (pandas elides the middle rows in the rendered output).
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [8]:
# List the column names of the raw frame.
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [9]:
# Count the distinct values in 'Unnamed: 3'; nunique() skips NaN by default.
df['Unnamed: 3'].nunique()

10

In [10]:
# (rows, columns) of the raw frame.
df.shape

(5572, 5)

In [11]:
# Build the working frame: keep every non-missing cell and put an empty
# string wherever the raw frame holds NaN.
sd = df.where(df.notna(), other='')

In [12]:
# Show the working frame after missing values were replaced with empty strings.
sd

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [13]:
# Distinct values in 'Unnamed: 3' after the fill: the empty string that
# replaced NaN is now counted as a value of its own.
sd['Unnamed: 3'].nunique()

11

In [14]:
# Preview the first 10 rows of the filled frame.
sd.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [15]:
# Drop irrelevant columns.
# Only v1 (label) and v2 (message text) are used from here on.
# errors='ignore' keeps this cell re-runnable: once the columns are gone,
# running it again is a no-op instead of raising KeyError.
sd = sd.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [16]:
# Show the frame after the drop; only v1 (label) and v2 (message text) remain.
sd

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [17]:
# Print per-column dtypes and non-null counts for the reduced frame.
sd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [18]:
# (rows, columns) after the extra columns were dropped.
sd.shape

(5572, 2)

In [19]:
# Encode the labels as numbers: spam -> 0, ham -> 1

In [20]:
# Overwrite the text labels in column v1 with their numeric codes, in place.
# The row mask is rebuilt for each label, so 'spam' rows are rewritten first
# and 'ham' rows second.
for label, code in (('spam', 0), ('ham', 1)):
    sd.loc[sd['v1'] == label, 'v1'] = code

In [21]:
# Show the frame with v1 now holding the numeric labels (0 = spam, 1 = ham).
sd

Unnamed: 0,v1,v2
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will Ì_ b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [22]:
# X: message text (independent variable); y: encoded label (target variable).
X, y = sd['v2'], sd['v1']

In [23]:
# Show the feature Series (raw message text).
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [24]:
# Show the target Series (0 = spam, 1 = ham).
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: v1, Length: 5572, dtype: object

In [25]:
# Splitting the data into training and testing datasets

In [26]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [27]:
# Show the training messages (still raw text at this point).
x_train

1114    No no:)this is kallis home ground.amla home to...
3589    I am in escape theatre now. . Going to watch K...
3095    We walked from my moms. Right on stagwood pass...
1012       I dunno they close oredi not... ÌÏ v ma fan...
3320                               Yo im right by yo work
                              ...                        
4931                Match started.india  &lt;#&gt;  for 2
3264    44 7732584351, Do you want a New Nokia 3510i c...
1653    I was at bugis juz now wat... But now i'm walk...
2607    :-) yeah! Lol. Luckily i didn't have a starrin...
2732    How dare you stupid. I wont tell anything to y...
Name: v2, Length: 4457, dtype: object

In [28]:
# Number of training messages.
x_train.shape

(4457,)

In [29]:
# Show the held-out test messages (raw text).
x_test

4456    Aight should I just plan to come up later toni...
690                                    Was the farm open?
944     I sent my scores to sophas and i had to do sec...
3768    Was gr8 to see that message. So when r u leavi...
1189    In that case I guess I'll see you at campus lodge
                              ...                        
2906                                               ALRITE
1270    Sorry chikku, my cell got some problem thts y ...
3944    I will be gentle princess! We will make sweet ...
2124    Beautiful Truth against Gravity.. Read careful...
253     Ups which is 3days also, and the shipping comp...
Name: v2, Length: 1115, dtype: object

In [30]:
# Number of test messages.
x_test.shape

(1115,)

In [31]:
# Show the training labels (0 = spam, 1 = ham).
y_train

1114    1
3589    1
3095    1
1012    1
3320    1
       ..
4931    1
3264    0
1653    1
2607    1
2732    1
Name: v1, Length: 4457, dtype: object

In [32]:
# Number of training labels; should match the number of training messages.
y_train.shape

(4457,)

In [33]:
# Show the test labels (0 = spam, 1 = ham).
y_test

4456    1
690     1
944     1
3768    1
1189    1
       ..
2906    1
1270    1
3944    1
2124    1
253     1
Name: v1, Length: 1115, dtype: object

In [34]:
# Number of test labels; should match the number of test messages.
y_test.shape

(1115,)

In [35]:
# TF-IDF text vectorizer: lowercases the text, removes scikit-learn's built-in
# English stop words, and keeps every term that appears in at least one document.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [36]:
# Learn the vocabulary and IDF weights from the training messages and
# convert those messages into a TF-IDF matrix in one step.
x_train_feature = feature_extraction.fit_transform(x_train)

In [37]:
# Convert the test messages with the vocabulary/IDF already learned from the
# training data (transform only — the vectorizer is not refitted here).
x_test_feature= feature_extraction.transform(x_test)

In [38]:
# The label Series still have object dtype after the in-place encoding, so
# cast both splits to integers before they are used for fitting and scoring.
y_train, y_test = y_train.astype(int), y_test.astype(int)

In [39]:
# Print the sparse TF-IDF training matrix; each printed entry is
# (row index, feature index) followed by the TF-IDF weight.
print(x_train_feature)

  (0, 2400)	0.42251087562056844
  (0, 6643)	0.310713090556495
  (0, 890)	0.4431414936624499
  (0, 3102)	0.4078732191722945
  (0, 3308)	0.4607061502580205
  (0, 3697)	0.38724260113041314
  (1, 4285)	0.3619488551509563
  (1, 3709)	0.49218179847458676
  (1, 7020)	0.3597932878999011
  (1, 3022)	0.2656832920063487
  (1, 6479)	0.46190436338926344
  (1, 2530)	0.46190436338926344
  (2, 3109)	0.15859116597265116
  (2, 4045)	0.15859116597265116
  (2, 777)	0.24853230530973786
  (2, 3267)	0.3059351024463395
  (2, 6904)	0.3323889186374277
  (2, 3867)	0.22778533625897432
  (2, 7140)	0.3323889186374277
  (2, 4836)	0.2640067957824946
  (2, 6113)	0.3323889186374277
  (2, 5497)	0.39905624733507106
  (2, 4344)	0.29741887579744203
  (2, 6985)	0.3059351024463395
  (3, 2642)	0.4893788451570101
  :	:
  (4454, 5637)	0.25666584238764617
  (4454, 1470)	0.30396107829387736
  (4454, 2095)	0.24269967159421676
  (4454, 7019)	0.2053843287832964
  (4454, 3827)	0.23135590834159414
  (4454, 1497)	0.23226820104119308
  

In [40]:
# Print the sparse TF-IDF test matrix; each printed entry is
# (row index, feature index) followed by the TF-IDF weight.
print(x_test_feature)

  (0, 6612)	0.44133360030716196
  (0, 4969)	0.48162181516639807
  (0, 3830)	0.36941813963683484
  (0, 3685)	0.2933033468633235
  (0, 1825)	0.3360436396796962
  (0, 835)	0.48769164177028673
  (1, 4699)	0.6356544605069046
  (1, 2650)	0.7719737086440678
  (2, 6500)	0.2647849043766693
  (2, 6497)	0.18286900821334037
  (2, 5725)	0.20865752050574918
  (2, 5656)	0.3333685702158412
  (2, 5651)	0.3333685702158412
  (2, 5650)	0.24336433824754297
  (2, 5441)	0.31784847169730807
  (2, 4688)	0.28030500289520244
  (2, 3649)	0.2854162204263955
  (2, 2592)	0.31784847169730807
  (2, 1941)	0.2310094530874928
  (2, 1901)	0.21172133705603058
  (2, 956)	0.3333685702158412
  (3, 7019)	0.2810306614616012
  (3, 6833)	0.2197458374935889
  (3, 5650)	0.3611024778218544
  (3, 4975)	0.39782804078108147
  :	:
  (1112, 5133)	0.3175110130269726
  (1112, 4123)	0.25589588829634596
  (1112, 4025)	0.22905135128379178
  (1112, 2961)	0.8232187749490736
  (1113, 6702)	0.263361044612427
  (1113, 5311)	0.24992020099815576
  (

In [41]:
# Creating Logistic Regression model

In [42]:
# Create the classifier with scikit-learn's default settings (no arguments are passed).
model = LogisticRegression()

In [43]:
# Train the classifier on the TF-IDF training features and the integer labels.
model.fit(x_train_feature, y_train)

In [44]:
# Model's accuracy on the training dataset

In [45]:
# Predict labels for the same messages the model was trained on.
prediction_on_training_data = model.predict(x_train_feature)

In [46]:
# Fraction of training labels that the model predicted correctly.
accuracyn_on_training_data = accuracy_score(y_train,prediction_on_training_data)

In [47]:
# Show the training accuracy.
accuracyn_on_training_data

0.9699349338119811

In [48]:
# Model's accuracy on the testing dataset

In [49]:
# Predict labels for the held-out test messages.
prediction_on_testing_data = model.predict(x_test_feature)

In [50]:
# Fraction of test labels that the model predicted correctly.
accuracyn_on_testing_data = accuracy_score(y_test,prediction_on_testing_data)

In [51]:
# Show the test accuracy.
accuracyn_on_testing_data

0.9560538116591928

In [52]:
# Testing the model with custom emails.

In [53]:
# Hand-written spam-style message for a manual check. It is wrapped in a
# one-element list because the vectorizer's transform() takes an iterable of documents.
custom_spam_mail = ["""Congratulations! You've been selected as a winner of our exclusive giveaway. You've won a luxury vacation package to an exotic destination. To claim your prize, simply click the link below and provide your personal information.Click here to claim your prize: [spammy-link]Hurry up! This offer is only valid for a limited time. Don't miss out on this amazing opportunity!Best regards,The Prize Giveaway Team"""]

In [54]:
# Hand-written legitimate (ham) message for a manual check. It is wrapped in a
# one-element list because the vectorizer's transform() takes an iterable of documents.
custom_ham_mail = ["""Hi Team,Just a friendly reminder that we have a team meeting scheduled for tomorrow at 10:00 AM in the conference room. We'll be discussing the upcoming project and assigning tasks to each team member. Please come prepared with any updates or questions you may have.Looking forward to seeing you all there.Best regards,John"""]

In [56]:
# 0 : Spam
# 1 : Ham

In [57]:
# Vectorize the two custom messages with the already-fitted TF-IDF vectorizer
# (transform only), so they use the same feature space the model was trained on.
input1 = feature_extraction.transform(custom_spam_mail)
input2 = feature_extraction.transform(custom_ham_mail)

In [58]:
# Classify the custom spam message; 0 means spam under this notebook's label encoding.
prediction1 = model.predict(input1)
prediction1

array([0])

In [59]:
# Classify the custom ham message; 1 means ham under this notebook's label encoding.
prediction2 = model.predict(input2)
prediction2

array([1])