In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
raw_mail_data = pd.read_csv('/Users/chizurumokereolujie/Documents/AI RELATED/MACHINE LEARNING CLASS/Datasets/mail_data.csv')

In [3]:
raw_mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
raw_mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [6]:
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)), '')

In [7]:
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
mail_data.shape

(5572, 2)

In [9]:
mail_data.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

### Label Encoding

#### Label a spam mail as 0 and ham mail as 1.

In [29]:
mail_data.loc[mail_data['Category'] == 'spam', 'Category'] = 0
mail_data.loc[mail_data['Category'] == 'ham', 'Category'] = 1

In [11]:
mail_data

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [12]:
mail_data['Category'].value_counts()

Category
1    4825
0     747
Name: count, dtype: int64

In [13]:
# Seperating the data as texts and label

X = mail_data['Message']
Y = mail_data['Category']

In [14]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [15]:
Y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

### ***Splitting the datasets into training data and testing data***

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y, random_state=3)

In [17]:
X.shape, X_train.shape, X_test.shape

((5572,), (4457,), (1115,))

#### *Convert Text Data to Numerial Value known as Feature Extraction*

In [18]:
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

#* Convert Y_train and Y_test to integer
Y_train = Y_train.astype(int)
Y_test = Y_test.astype(int)


In [19]:
print(X_train_features)

  (0, 2353)	0.28101404009316056
  (0, 3806)	0.28101404009316056
  (0, 1857)	0.17073786814794129
  (0, 1193)	0.22908400928709988
  (0, 3113)	0.28101404009316056
  (0, 3627)	0.25144905621529934
  (0, 6330)	0.24059246244542992
  (0, 1540)	0.17407870571957915
  (0, 2644)	0.28101404009316056
  (0, 5029)	0.17467075796896542
  (0, 4306)	0.26793132631329497
  (0, 421)	0.25144905621529934
  (0, 4557)	0.28101404009316056
  (0, 6468)	0.26793132631329497
  (0, 1657)	0.28101404009316056
  (0, 0)	0.23628394623676158
  (1, 3982)	0.4167622750027118
  (1, 5911)	0.2761926296686631
  (1, 3941)	0.20702870014136815
  (1, 1969)	0.1749293187718031
  (1, 6553)	0.4722950153731612
  (1, 5947)	0.24356944504246256
  (1, 3948)	0.2761926296686631
  (1, 2113)	0.1985161464110967
  (1, 3828)	0.13684128003316173
  :	:
  (4456, 5091)	0.1743505991070133
  (4456, 7339)	0.13767285254208542
  (4456, 2414)	0.17857780047236718
  (4456, 3262)	0.16745892210920407
  (4456, 7434)	0.1566807006510762
  (4456, 4120)	0.11213756070741

### Training our model using the Logistic Regression Model

In [20]:
model = LogisticRegression()

In [21]:
model.fit(X_train_features, Y_train)

### Evaluating our Trained  Model

In [22]:
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(prediction_on_training_data, Y_train)

In [23]:
'Accuracy on training data : ', accuracy_on_training_data

('Accuracy on training data : ', 0.9667938074938299)

In [24]:
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(prediction_on_test_data, Y_test)

In [25]:
'Accuracy on test data : ', accuracy_on_test_data

('Accuracy on test data : ', 0.9713004484304932)

### Building a predictive model

In [26]:
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

input_data_features = feature_extraction.transform(input_mail)

prediction = model.predict(input_data_features)

print(prediction)


[1]


In [27]:
if (prediction[0] == 1):
    print('Ham mail')
else:
    print('Spam mail')

Ham mail
