# Load and Prepare Data

In [37]:
import os

import pandas as pd

In [38]:
# Location of the spam/ham CSV. The default is the original author's absolute
# Windows path, which only exists on that machine; set the SPAM_EMAIL_CSV
# environment variable to point the notebook at the file anywhere else.
DATA_PATH = os.environ.get(
    "SPAM_EMAIL_CSV",
    r"C:\Users\Partiran\Desktop\python\1404.10.18\Spam-Email-Detection\dataset\Spam_Email.csv",
)

# Read the whole dataset into memory and preview the first five rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [39]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5572, 2)

In [40]:
# Column labels of the loaded frame.
df.columns

Index(['Category', 'Message'], dtype='object')

In [41]:
# Count the missing values in each column.
df.isna().sum()

Category    0
Message     0
dtype: int64

In [42]:
# Print a structural summary: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [43]:
# Data type of each column.
df.dtypes

Category    object
Message     object
dtype: object

In [44]:
# Class balance: number of messages per Category label.
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [45]:
# Distinct labels present in Category (these are the keys the label encoding must cover).
df['Category'].unique()

array(['ham', 'spam'], dtype=object)

In [46]:
# Encode the text labels as integers for the classifiers: ham -> 0, spam -> 1.
LABEL_TO_ID = {'ham': 0, 'spam': 1}

# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell on the already-encoded column would silently wipe all
# labels. Only map while the column does not yet consist solely of encoded
# values, which keeps the cell safe to execute more than once.
if not df['Category'].isin(list(LABEL_TO_ID.values())).all():
    encoded = df['Category'].map(LABEL_TO_ID)

    # Fail loudly on labels without an encoding instead of letting NaN
    # propagate into the training targets.
    unknown = df.loc[encoded.isna(), 'Category'].unique()
    if len(unknown):
        raise ValueError(f"Unexpected Category labels: {unknown.tolist()}")

    df['Category'] = encoded


In [47]:
# Re-check the class counts after encoding; they should match the counts per text label.
df['Category'].value_counts()

Category
0    4825
1     747
Name: count, dtype: int64

In [48]:
# Preview the frame again to confirm Category now holds the numeric codes.
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [49]:
# Model input is the raw message text; the target is the encoded Category column.
X, Y = df['Message'], df['Category']

# Split Data For Train & Test

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
# Hold out 20% of the messages for testing. Stratifying on Y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [52]:
# Report the number of messages overall, in the training split and in the test split.
for messages in (X, X_train, X_test):
    print(messages.shape)

(5572,)
(4457,)
(1115,)


# Feature Extraction

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [54]:
# TF-IDF vectorizer: lowercases the text, drops scikit-learn's built-in English
# stop words, and keeps every term that occurs in at least one document.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [55]:
# Learn the vocabulary and IDF weights from the training messages only, then
# reuse that fitted vectorizer on the test messages so no test-set statistics
# leak into the features.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# LOGISTIC REGRESSION

In [56]:
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [57]:
# Logistic regression on the TF-IDF features. class_weight='balanced' is set
# because spam is the minority class; the fixed seed keeps the liblinear
# solver reproducible.
lr = LogisticRegression(
    solver='liblinear',
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
)
lr.fit(X_train_features, Y_train)

# Predict on the held-out messages.
lr_predict_test = lr.predict(X_test_features)

# Report the test-set metrics; precision/recall/F1 are computed for the
# positive label 1 (spam).
print("Logistic Regression:\n")
for label, metric in (
    ("Testing Data Accuracy :", accuracy_score),
    ("Precision             :", precision_score),
    ("Recall                :", recall_score),
    ("F1 Score              :", f1_score),
):
    print(label, metric(Y_test, lr_predict_test))

Logistic Regression:

Testing Data Accuracy : 0.9820627802690582
Precision             : 0.9448275862068966
Recall                : 0.9194630872483222
F1 Score              : 0.9319727891156463


# DECISION TREES

In [58]:
from sklearn.tree import DecisionTreeClassifier

In [59]:
# Decision tree on the TF-IDF features. Depth is capped at 50 and every leaf
# must hold at least 5 training samples to limit overfitting; the fixed seed
# makes the fitted tree reproducible.
dtrees = DecisionTreeClassifier(
    max_depth=50,
    min_samples_leaf=5,
    random_state=42,
)
dtrees.fit(X_train_features, Y_train)

# Predict on the held-out messages.
dt_predict_test = dtrees.predict(X_test_features)

# Report the test-set metrics; precision/recall/F1 are computed for the
# positive label 1 (spam). The heading previously read "Decision Tress".
print("Decision Trees:\n")
for label, metric in (
    ("Testing Data Accuracy :", accuracy_score),
    ("Precision             :", precision_score),
    ("Recall                :", recall_score),
    ("F1 Score              :", f1_score),
):
    print(label, metric(Y_test, dt_predict_test))

Decision Tress:

Testing Data Accuracy : 0.9524663677130045
Precision             : 0.8529411764705882
Recall                : 0.7785234899328859
F1 Score              : 0.8140350877192982


# K NEAREST NEIGHBORS

In [60]:
from sklearn.neighbors import KNeighborsClassifier

In [61]:
# k-nearest-neighbours on the TF-IDF features: each message is classified by
# a vote among its 7 closest training messages under cosine distance.
knn = KNeighborsClassifier(
    n_neighbors=7,
    metric='cosine',
)
knn.fit(X_train_features, Y_train)

# Predict on the held-out messages.
knn_predict_test = knn.predict(X_test_features)

# Report the test-set metrics; precision/recall/F1 are computed for the
# positive label 1 (spam).
print("K Nearest Neighbors:\n")
for label, metric in (
    ("Testing Data Accuracy :", accuracy_score),
    ("Precision             :", precision_score),
    ("Recall                :", recall_score),
    ("F1 Score              :", f1_score),
):
    print(label, metric(Y_test, knn_predict_test))

K Nearest Neighbors:

Testing Data Accuracy : 0.957847533632287
Precision             : 0.9636363636363636
Recall                : 0.7114093959731543
F1 Score              : 0.8185328185328186
