In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
## logistic regression for binary classification- 2 classes(spam and ham mail)
from sklearn.metrics import accuracy_score 


# data collection and pre processing

In [2]:
## load the mail dataset from the CSV file into a pandas DataFrame
## a raw string keeps the Windows backslashes literal: '\m' is an invalid escape
## sequence in a normal string (SyntaxWarning on Python 3.12+), the path value itself is unchanged
## NOTE(review): this is an absolute, machine-specific path - consider a configurable data directory
raw_mail_data = pd.read_csv(r'E:\materials\mail_data.csv')


In [3]:
## display the loaded DataFrame (Category label and Message text for every mail)
raw_mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
## count the missing values in each column
raw_mail_data.isna().sum()

Category    0
Message     0
dtype: int64

In [5]:
# the check above reports 0 null values in both Category and Message, so the replacement in the next step is only a safeguard

In [6]:
## replace any null values with an empty string
## (cells where the not-null mask is False are swapped for '')
mail_data = raw_mail_data.where(
    raw_mail_data.notnull(),
    other='',
)

In [7]:
## (rows, columns) of the cleaned DataFrame
mail_data.shape

(5572, 2)

In [8]:
## the shape didn't change, as expected - replacing nulls does not drop any rows or columns

In [9]:
## preview the first five rows
mail_data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
## label encoding
## to convert category into numbers



In [12]:
## encode the labels in place: spam -> 0, ham -> 1
## (the column keeps object dtype here; it is cast to int later, before training)

mail_data.loc[mail_data['Category'].eq('spam'), 'Category'] = 0
mail_data.loc[mail_data['Category'].eq('ham'), 'Category'] = 1

In [None]:
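## this can also be done with scikit-learn's LabelEncoder (note: it orders classes alphabetically, so it would give ham=0 and spam=1, the opposite of the mapping used here)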

In [13]:
## display the DataFrame after label encoding (Category is now 0 for spam, 1 for ham)
mail_data

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [15]:
## separating the data into text (X) and labels (Y)

X, Y = mail_data['Message'], mail_data['Category']

In [16]:
## print the message texts, then the encoded labels
print(X, Y, sep='\n')

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


# splitting data into training and test data


In [18]:
## hold out 20% of the mails for testing; stratify on Y so both splits keep the same
## spam/ham ratio, and fix random_state so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [19]:
## shapes of the full data, the training split and the test split
print(f"{X.shape} {X_train.shape} {X_test.shape}")

(5572,) (4457,) (1115,)


In [20]:
## the second element of each shape tuple is empty because X is a 1-D Series (a single column of text)

# feature extraction

In [21]:
## to transform text data to feature vectors that can be used as input to the logistic regression

In [None]:
# TfidfVectorizer goes through every word and gives each one a TF-IDF score
# the model is then trained on the scores produced by TfidfVectorizer
## min_df=1: a word is kept only if it appears in at least 1 document (a document-frequency threshold, not a score)
## stop words (common English words) are ignored
## all letters are changed to lowercase

In [22]:
## build the TF-IDF vectorizer:
##   min_df=1             -> keep every term that occurs in at least one document
##   stop_words='english' -> drop scikit-learn's built-in English stop words
##   lowercase=True       -> lowercase the text before tokenizing
## `lowercase` must be a real bool: the string 'True' only worked because it is truthy,
## and scikit-learn's parameter validation (version 1.2 and later) rejects it at fit time
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [24]:
## learn the vocabulary and IDF weights from the training messages and vectorize them
X_train_features = feature_extraction.fit_transform(X_train)
## the test messages are only transformed with the already-fitted vectorizer
## (we never fit on test data)
X_test_features = feature_extraction.transform(X_test)

In [26]:
## Y still has object dtype (the labels were set with .loc, not a label encoder),
## so cast both label splits to integers before training

Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [27]:
## print the sparse TF-IDF matrix; each entry is shown as (row, column) followed by its score
print(X_train_features)
     

  (0, 7289)	0.517250079608171
  (0, 2823)	0.517250079608171
  (0, 3764)	0.22046319970004674
  (0, 2262)	0.4931693086193514
  (0, 7438)	0.2996693624522655
  (0, 4768)	0.28858793133473676
  (1, 4136)	0.4717788963273522
  (1, 6517)	0.49481520325330863
  (1, 1558)	0.4236400720998954
  (1, 4972)	0.49481520325330863
  (1, 3317)	0.32904344933475643
  (2, 5798)	0.2821824162510531
  (2, 3835)	0.2623708342584191
  (2, 4943)	0.33789703751914013
  (2, 5837)	0.1845655907506494
  (2, 1430)	0.28509060215711635
  (2, 6641)	0.20096909705626312
  (2, 3722)	0.24768901862403342
  (2, 3935)	0.3671145612703168
  (2, 3118)	0.18009671431232455
  (2, 4269)	0.2543939099135892
  (2, 3398)	0.20665621299033204
  (2, 2136)	0.180851695270251
  (2, 3086)	0.27449720225122765
  (2, 4099)	0.186263215205624
  :	:
  (4454, 5765)	0.27366476899994313
  (4454, 4205)	0.27366476899994313
  (4454, 6404)	0.2834859847167938
  (4454, 387)	0.2598225428978842
  (4454, 865)	0.26604684225670366
  (4454, 2972)	0.2598225428978842
  (445

# training the model

In [28]:
# logistic regression

In [29]:
## logistic regression classifier with scikit-learn's default settings
model = LogisticRegression()


In [30]:
## fit the logistic regression model on the TF-IDF features of the training mails

model.fit(X_train_features, Y_train)

LogisticRegression()

# evaluating the model

In [32]:
# prediction on training data

X_train_prediction = model.predict(X_train_features)  # predicted labels (Y) for the training mails

# accuracy_score's documented signature is (y_true, y_pred): the true labels go first
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print(training_data_accuracy)


0.9672425398249944


In [33]:
## very good accuracy

In [34]:
# prediction on testing data

X_test_prediction = model.predict(X_test_features)  # predicted labels (Y) for the held-out test mails

# accuracy_score's documented signature is (y_true, y_pred): the true labels go first
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print(testing_data_accuracy)


0.9704035874439462


In [35]:
## wow, usually the training data shows higher accuracy

## why we need the accuracy on both training and test data: the model may perform very well on the training data but not on the test data; this is called overfitting
## if the training accuracy is much higher than the test accuracy, that means we have overfitted the model to the training data

# making a predictive system

In [36]:
## a new mail will be classified as either spam or ham

In [38]:
## a single example message, wrapped in a list because the vectorizer expects an iterable of documents
input_mail = [
    "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my",
]

In [39]:
## feature extraction: turn the input text into TF-IDF features
## using the vectorizer that was already fitted on the training data

input_data_features = feature_extraction.transform(input_mail)

In [41]:
# predict the label of the input mail (an array with one entry per input message)

prediction = model.predict(input_data_features)
print(prediction)


[1]


In [42]:
## label 1 was assigned to ham during encoding; any other value is reported as spam
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

Ham mail


In [None]:
## loaded the data as a pandas DataFrame
## label encoding - converted the category into numbers
## separated the data into text and labels
## split the data into training and test sets
## feature extraction from the text data (X_train and X_test)
## training the model
## evaluating the model
## prediction using the model - making a predictive system