# Importing libraries and dataset

## Importing libraries

In [1]:
# First we import the libraries used in this project: numpy and pandas
import numpy as np
import pandas as pd
# Here we import train_test_split to split our dataset into training and test sets
from sklearn.model_selection import train_test_split
# Here we import TfidfVectorizer to convert text to numerical features
from sklearn.feature_extraction.text import TfidfVectorizer
# Here we import LogisticRegression to classify the email spam or not
from sklearn.linear_model import LogisticRegression
# Here we import accuracy_score to get the accuracy of our model
from sklearn.metrics import accuracy_score

## Importing Dataset

In [2]:
# Load the mail dataset from 'mail_data.csv' (path is relative to the working directory)
df = pd.read_csv('mail_data.csv')

In [3]:
# Report the size of the raw dataset: column count first, then row count
df_rows = len(df.index)
df_columns = len(df.columns)
print('The number of Columns:',df_columns)
print('The number of rows:',df_rows)

The number of Columns: 2
The number of rows: 5572


In [4]:
# Preview the first 10 rows of the raw dataset
df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


# Data cleaning

## Handle nulls

In [5]:
# Build `dataset` from df, keeping every non-null cell and substituting an
# empty string '' wherever a value is missing
dataset = df.where(df.notna(), other='')

In [6]:
# Summarize the dataset (columns, non-null counts, dtypes) to confirm that no null values remain
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


## Encode categories to numerical

In [7]:
# Replace the text labels in place with numeric codes: spam -> 0, ham -> 1
dataset.loc[dataset['Category'].eq('spam'), 'Category'] = 0
dataset.loc[dataset['Category'].eq('ham'), 'Category'] = 1

In [8]:
# Preview the first 10 rows after encoding the categories (spam -> 0, ham -> 1)
dataset.head(10)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
5,0,FreeMsg Hey there darling it's been 3 week's n...
6,1,Even my brother is not like to speak with me. ...
7,1,As per your request 'Melle Melle (Oru Minnamin...
8,0,WINNER!! As a valued network customer you have...
9,0,Had your mobile 11 months or more? U R entitle...


## Create input and target features x & y

In [9]:
# x holds the message text (model input); y holds the encoded category (target)
x, y = dataset['Message'], dataset['Category']

In [10]:
# Display the input messages (x)
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [11]:
# Display the encoded target labels (y)
print(y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


## Split x and y into train and test sets

In [12]:
# Hold out 20% of the samples for testing and train on the remaining 80%;
# random_state is fixed so the same split is produced on every run
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [13]:
# Shape of the training inputs
print(x_train.shape)

(4457,)


In [14]:
# Shape of the test inputs
print(x_test.shape)

(1115,)


In [15]:
# Shape of the training labels (should match the training inputs)
print(y_train.shape)

(4457,)


In [16]:
# Shape of the test labels (should match the test inputs)
print(y_test.shape)

(1115,)


# Feature extraction and convert categories to integer

## Feature extraction (NLP)

In [17]:
# NLP (Natural Language Processing) step: turn raw text into TF-IDF features.
# The vectorizer lowercases the text, drops English stop words and keeps every
# term that occurs in at least one document.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Learn the vocabulary from the training messages only and vectorize them
x_train_features = feature_extraction.fit_transform(x_train)
# Vectorize the test messages with the vocabulary learned above (no re-fitting)
x_test_features = feature_extraction.transform(x_test)

In [18]:
# Show the vectorized training set; each printed entry appears as (row, column) followed by its weight
print(x_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [19]:
# Show the vectorized test set; each printed entry appears as (row, column) followed by its weight
print(x_test_features)

  (0, 7271)	0.1940327008179069
  (0, 6920)	0.20571591693537986
  (0, 5373)	0.2365698724638063
  (0, 5213)	0.1988547357502182
  (0, 4386)	0.18353336340308998
  (0, 1549)	0.2646498848307188
  (0, 1405)	0.3176863938914351
  (0, 1361)	0.25132445289897426
  (0, 1082)	0.2451068436245027
  (0, 1041)	0.28016206931555726
  (0, 405)	0.2381316303003606
  (0, 306)	0.23975986557206702
  (0, 20)	0.30668032384591537
  (0, 14)	0.26797874471323896
  (0, 9)	0.2852706805264544
  (0, 1)	0.2381316303003606
  (1, 7368)	0.29957800964520975
  (1, 6732)	0.42473488678029325
  (1, 6588)	0.3298937975962767
  (1, 6507)	0.26731535902873493
  (1, 6214)	0.3621564482127515
  (1, 4729)	0.22965776503163893
  (1, 4418)	0.3457696891316818
  (1, 3491)	0.496093956101028
  (2, 7205)	0.22341717215670331
  :	:
  (1110, 3167)	0.5718357066163949
  (1111, 7353)	0.4991205841293424
  (1111, 6787)	0.40050175714278885
  (1111, 6033)	0.4714849709283488
  (1111, 3227)	0.44384935772735523
  (1111, 2440)	0.4137350055985486
  (1112, 7071)

## Convert categories to integer

In [20]:
# The encoded labels are still stored with object dtype; cast both label
# sets to integers before handing them to the classifier
y_train, y_test = y_train.astype('int'), y_test.astype('int')

# Building model

### Creating the model

In [21]:
# Instantiate a logistic regression classifier with scikit-learn's default settings
model = LogisticRegression()

### Fitting the model

In [22]:
# Train the classifier on the TF-IDF training features and their integer labels
model.fit(x_train_features,y_train)

### Evaluate the model

In [23]:
# Predict labels for the same samples the model was trained on
y_predict_on_train_data = model.predict(x_train_features)

# Fraction of training samples whose predicted label matches the true label
model_accuracy_on_train_set = accuracy_score(
    y_true=y_train,
    y_pred=y_predict_on_train_data,
)

In [24]:
# Print the accuracy measured on the training set
print('Model accuracy on training set:',model_accuracy_on_train_set)

Model accuracy on training set: 0.9670181736594121


In [25]:
# Predict labels for the held-out test samples
y_predict_on_test_data = model.predict(x_test_features)

# Fraction of test samples whose predicted label matches the true label
model_accuracy_on_test_set = accuracy_score(
    y_true=y_test,
    y_pred=y_predict_on_test_data,
)

In [26]:
# Print the accuracy measured on the held-out test set
print('Model accuracy on test set:', model_accuracy_on_test_set)

Model accuracy on test set: 0.9659192825112107


# Make a single prediction

### Single prediction method

In [27]:
def single_prediction(input_text, vectorizer=None, classifier=None):
  """Classify one message as spam or not spam and print the verdict.

  Args:
    input_text: The raw message text to classify.
    vectorizer: Fitted object exposing ``transform``; used to turn the text
      into a feature vector. Defaults to the notebook-level
      ``feature_extraction``.
    classifier: Fitted object exposing ``predict``. Defaults to the
      notebook-level ``model``.

  Returns:
    None. The feature-vector shape and the verdict are printed.
  """
  # Fall back to the objects fitted earlier in the notebook when none are given
  if vectorizer is None:
    vectorizer = feature_extraction
  if classifier is None:
    classifier = model

  # transform() takes a collection of documents, so wrap the single text in a list
  input_data_features = vectorizer.transform([input_text])

  # Print the shape of the input feature vector for debugging
  print(f"Input data features shape: {input_data_features.shape}")

  # predict() returns one label per input row; exactly one row was passed in,
  # so read that single label explicitly instead of comparing the whole array
  prediction = classifier.predict(input_data_features)
  label = prediction[0]

  # Labels follow the notebook's encoding: 0 = spam, 1 = ham
  if label == 0:
    print('RESULT: This is a spam email 😡')
  else:
    print('RESULT: This is not a spam email 😊')

### Make a single prediction

In [28]:
# Sample messages for trying out the classifier by hand.
# They are kept as named lists (rather than a bare string literal) so they can
# be reused in code and are not echoed back as the cell's output.
SPAM_EXAMPLES = [
    "Congratulations! You've won a free vacation. Click here to claim your prize.",
    "Get rich quick! Start earning thousands of dollars with our easy money-making scheme.",
    "Limited time offer! Buy one get one free on all products. Don't miss out!",
]

HAM_EXAMPLES = [
    "Hey, how's it going? Just wanted to check in and see how you're doing.",
    "Reminder: Your appointment is scheduled for tomorrow at 2 PM. Please confirm.",
    "Here's the report you requested. Let me know if you need any further information.",
]

'\n>> Spam Examples:\n1- "Congratulations! You\'ve won a free vacation. Click here to claim your prize."\n2- "Get rich quick! Start earning thousands of dollars with our easy money-making scheme."\n3- "Limited time offer! Buy one get one free on all products. Don\'t miss out!"\n\n>> Non-Spam (Ham) Examples:\n1- "Hey, how\'s it going? Just wanted to check in and see how you\'re doing."\n2- "Reminder: Your appointment is scheduled for tomorrow at 2 PM. Please confirm."\n3- "Here\'s the report you requested. Let me know if you need any further information."\n\n'

In [31]:
# Classify a sample (non-spam) message with the trained model
mail = "Hey, how's it going? Just wanted to check in and see how you're doing."

single_prediction(mail)

Input data features shape: (1, 7431)
RESULT: This is not a spam email 😊


### Getting mail from user

In [32]:
# Read a message typed by the user (the cell waits for input) and classify it
input_mail = input("Enter your mail:")
single_prediction(input_mail)

Enter your mail:Hello mohamed howrare your
Input data features shape: (1, 7431)
RESULT: This is not a spam email 😊
