# Spam classification with Logistic Regression, solved both manually with Gradient Descent and with scikit-learn

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Load the spam dataset from the relative ../input directory into a DataFrame.
df = pd.read_csv('../input/spamtest/spam.csv')
df  # last expression of the cell, so the notebook displays the table

# **About the Dataset**


**Columns**

EmailText

Label

**Labels**

Spam

Ham -> Not Spam

# Exploratory Data Analysis

In [None]:
# Bar chart comparing how many messages are labelled spam versus ham (not spam).
import matplotlib.pyplot as plt

label_column = df['Label']
spam = label_column[label_column == 'spam'].count()
notspam = label_column[label_column == 'ham'].count()
labels = ['Spam', 'Not Spam']
values = [spam, notspam]
print(values)
plt.bar(labels, values)
plt.show()


# Out of the given training set of 5572 training examples, the composition is as follows:
**747** Spam

**4825** Not Spam

In [None]:
import string
import nltk
from nltk.corpus import stopwords
def text_preprocess(text):
    """Strip punctuation and English stop words from a single message.

    Args:
        text: The raw message string.

    Returns:
        The remaining words joined by single spaces, in their original casing.
    """
    # Fetch the stop-word list once per call and keep it in a set. The previous
    # code called stopwords.words('english') and scanned the resulting list for
    # every single word of the message.
    stop_words = set(stopwords.words('english'))
    # Delete every character listed in string.punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Stop-word matching is case-insensitive; the kept words are not lowercased.
    kept = [word for word in text.split() if word.lower() not in stop_words]
    return " ".join(kept)

# Preprocessing the text
Removing punctuation and stop words.

In [None]:
# Run every message through text_preprocess, collecting the cleaned strings in a list.
email = [text_preprocess(message) for message in df['EmailText']]
type(email)  # contains the list of pre processed emails

In [None]:
# Overwrite the raw text column with the preprocessed strings held in `email`.
df['EmailText'] = email
df

# Vectorization
 Feature Engineering

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a CountVectorizer (default settings) on the cleaned messages and
# transform them in the same step; `x` holds the resulting feature matrix.
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(df['EmailText'])

In [None]:
# Convert the vectorizer output to a dense NumPy array. toarray() already
# returns a fresh ndarray, so wrapping it in np.array() only made a second
# full-size copy of the matrix.
X = x.toarray()
np.shape(X)

# Making the prediction vector by treating Spam as 1 and not spam as 0.

In [None]:
# Encode the labels numerically: 1 for 'spam', 0 for anything else.
y = [1 if label == 'spam' else 0 for label in df['Label']]

In [None]:
# Prepend a column of ones to the feature matrix and show the new dimensions.
bias_column = np.ones((X.shape[0], 1))
X = np.concatenate((bias_column, X), axis=1)
X.shape

# Splitting the Data into Training and Evaluation Sets

In [None]:
# Split the features and labels into a training part and a held-out evaluation part.
# random_state=1 is passed so the split is repeatable; no test_size is given,
# so scikit-learn's default proportion applies.
from sklearn.model_selection import train_test_split
(TrainX,ValuateX,Trainy,Valuatey) = train_test_split(X,y,random_state=1)

# The data set has been split into 2 parts, one to train the model and the other to Evaluate the accuracy of the model

In [None]:
# Report the dimensions of the training features and the training labels.
for train_part in (TrainX, Trainy):
    print(np.shape(train_part))

In [None]:
# Report the dimensions of the evaluation features and the evaluation labels.
for eval_part in (ValuateX, Valuatey):
    print(np.shape(eval_part))

# Solving Logistic Regression by calculating the parameters through Gradient Descent

# **The Hypothesis**
   In the case of Logistic Regression the hypothesis is of the form **hypothesis = sigmoid(Transpose(theta)X)**, i.e. $h_\theta(x) = \dfrac{1}{1 + e^{-\theta^{T}x}}$

In [None]:
import math
def hypothesis(theta, X):
    """Return the sigmoid of the matrix product of X and theta.

    Args:
        theta: Parameter vector, one entry per column of X.
        X: Feature matrix, one row per example.

    Returns:
        1 / (1 + exp(-(X @ theta))), evaluated element-wise.
    """
    z = np.matmul(X, theta)
    return 1 / (1 + np.exp(-z))

# The Cost Function

In [None]:
def cost(theta,X,y,m):
    """Return the log-likelihood of the labels under the sigmoid model, divided by m.

    The value is sum(y*log(h) + (1-y)*log(1-h)) / m with h = sigmoid(X @ theta).
    It is NOT negated here; callers that want a loss to minimise negate it.

    Args:
        theta: Parameter vector, one entry per column of X.
        X: Feature matrix, one row per example.
        y: 0/1 labels, one per row of X.
        m: Normaliser the summed log-likelihood is divided by.

    Returns:
        The summed log-likelihood divided by m.
    """
    y = np.asarray(y, dtype=float)
    z = np.matmul(X, theta)
    # Evaluate the two log terms directly from z instead of from h:
    #   log(h)     = -log(1 + exp(-z))
    #   log(1 - h) = -log(1 + exp(z))
    # This avoids log(0) = -inf (and 0 * -inf = nan) when the sigmoid saturates.
    log_h = -np.logaddexp(0, -z)
    log_one_minus_h = -np.logaddexp(0, z)
    # The second term is weighted by (1 - y); the previous code used (1 + y).
    c = np.sum(y * log_h + (1 - y) * log_one_minus_h) / m
    return c

# Applying Gradient Descent

In [None]:
def gradientDescent(theta,X,y,m,alpha,it):
    """Fit logistic-regression parameters with batch gradient descent.

    Args:
        theta: Initial parameter vector, one entry per column of X.
        X: Feature matrix, one row per training example.
        y: 0/1 labels, one per row of X.
        m: Normaliser forwarded unchanged to cost().
        alpha: Step size; it scales the gradient summed (not averaged) over
            all training examples.
        it: Number of update steps to perform.

    Returns:
        A tuple (theta, iteration, c): the final parameters, the list of
        iteration indices, and the negated cost() value recorded after
        each update.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    iteration = []
    c = []
    for i in range(it):
        # One residual per training example. The previous code used only d[0],
        # the residual of the first example, to scale the whole update.
        residual = hypothesis(theta, X) - y
        # Gradient of the summed log-loss with respect to theta is X^T (h - y).
        theta = theta - alpha * np.matmul(X.T, residual)
        # cost() is not negated internally; flip the sign so the recorded
        # curve is a loss that should go down as training proceeds.
        c.append(-1 * cost(theta, X, y, m))
        iteration.append(i)
    return (theta, iteration, c)

In [None]:
# Train the manual model: start from all-zero parameters and run 2000 steps.
n_features = np.shape(TrainX)[1]  # number of columns, including the column of ones
# Number of training rows. The previous code passed np.shape(TrainX[0]), a
# one-element tuple holding the feature count, as the cost normaliser.
m = np.shape(TrainX)[0]
theta = np.zeros(n_features)
(theta, i, c) = gradientDescent(theta, TrainX, Trainy, m, 0.000001, 2000)

# Plotting the cost function vs Iterations to check the proper functioning of Gradient descent and check the value of parameter alpha.

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
# Plot the values returned by gradientDescent: `i` holds the iteration
# indices and `c` the cost recorded after each update.
plt.plot(i,c)
plt.xlabel('Iterations ->')
plt.ylabel('Cost function ->')

# The cost function decreases with time and approaches a constant value, therefore the computed parameters are acceptable.
# The following value of thetas form our hypothesis in the form H=sigmoid(X.Theta)

In [None]:
theta

# Library implementation: solving Logistic Regression with scikit-learn, using the liblinear solver and an L1 penalty

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit scikit-learn's logistic regression (liblinear solver, L1 penalty) on the
# training split, then measure its accuracy on the held-out evaluation split.
Spam_model = LogisticRegression(penalty='l1', solver='liblinear').fit(TrainX, Trainy)
pred = Spam_model.predict(ValuateX)
accuracy_score(y_true=Valuatey, y_pred=pred)

# An accuracy of around 98 percent is Achieved while using this model.