In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the labelled message dataset (columns: Label, EmailText) and preview it.
data = pd.read_csv(filepath_or_buffer="/content/spam.csv")
data.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
data.describe()

Unnamed: 0,Label,EmailText
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [None]:
print("Shape of data is:",data.shape)

Shape of data is: (5572, 2)


In [None]:
data.isna().sum()

Unnamed: 0,0
Label,0
EmailText,0


In [None]:
data["EmailText"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

**REMOVING NULL OR REDUNDANT VALUES**

In [None]:
data.duplicated().value_counts()

Unnamed: 0,count
False,5169
True,403


In [None]:
data = data.drop_duplicates()
data.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
data.duplicated().value_counts()

Unnamed: 0,count
False,5169


**DATA VISUALIZATION**

In [None]:
df = data.copy()
df.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Per-message length features used by the exploratory plots below:
# number of characters, word tokens and sentences.
# nltk is imported here because this cell runs before the preprocessing
# section, where the notebook's other nltk import lives.
import nltk

# The tokenizers need the Punkt models on disk. Recent NLTK releases look up
# 'punkt_tab' (the resource named in the LookupError this cell used to raise),
# older ones look up 'punkt', so fetch both.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

df['num_characters'] = df['EmailText'].apply(len)
df['num_words'] = df['EmailText'].apply(lambda x: len(nltk.word_tokenize(x)))
df['num_sentences'] = df['EmailText'].apply(lambda x: len(nltk.sent_tokenize(x)))

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
df.head()

In [None]:
# Class balance of the target. The frame is passed by keyword because older
# seaborn releases do not accept it as the first positional argument.
sns.countplot(data=df, x="Label")

Here we can see that there are far fewer spam messages than ham messages, i.e. the two classes are imbalanced.

In [None]:
sns.pairplot(df, hue="Label")

In [None]:
# Correlation heatmap of the numeric length features only. Label and EmailText
# hold strings, and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 unless numeric_only=True is given.
sns.heatmap(df.corr(numeric_only=True), annot=True)

**DATA PREPROCESSING**

In [None]:
import nltk
from nltk import word_tokenize

# Resources used by the preprocessing steps below:
#   punkt / punkt_tab - tokenizer models for word_tokenize (recent NLTK
#                       releases look up 'punkt_tab', older ones 'punkt')
#   wordnet           - WordNet corpus
#   stopwords         - stop-word lists, including English
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

# English stop words from the NLTK corpus, copied into a plain list.
stopwords_list = [stop_word for stop_word in stopwords.words('english')]
print(stopwords_list)

In [None]:
import string

# Every ASCII punctuation character as its own list element.
punctuation_ls = [mark for mark in string.punctuation]
print(punctuation_ls)

In [None]:
from nltk.stem import PorterStemmer
pstem =PorterStemmer()

In [None]:
# Step 1: remove all punctuation marks

def remove_punctuation(x):
    """Return ``x`` with every character listed in ``punctuation_ls`` deleted."""
    # Translation table that maps each punctuation character to "delete".
    strip_table = str.maketrans("", "", "".join(punctuation_ls))
    return x.translate(strip_table)

# Step 2: convert to lower case
def convert_lowercase(x):
    """Return a lower-cased copy of the string ``x``."""
    lowered = x.lower()
    return lowered

# Step 3: Tokenization
def tokenization(x):
    """Split the text ``x`` into tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(x)
    return tokens

# Step 4: remove stop words
def remove_stopwords(tok_x):
    """Return the tokens of ``tok_x`` that are not in ``stopwords_list``, order kept."""
    return [token for token in tok_x if token not in stopwords_list]

# Step 5: stemming (Porter stemmer)
def stemming_words(ls_x):
    """Return a new list holding ``pstem.stem(word)`` for each word of ``ls_x``."""
    return [pstem.stem(word) for word in ls_x]


In [None]:
def pre_processing(x):
    """Clean one raw message and return it as a space-joined string of stems.

    The steps run in this order: strip punctuation, lower-case, tokenize,
    drop stop words, stem each remaining token.
    """
    cleaned_text = convert_lowercase(remove_punctuation(x))
    kept_tokens = remove_stopwords(tokenization(cleaned_text))
    return " ".join(stemming_words(kept_tokens))

In [None]:
x = "I am playing football. Can i go home?"
pre_processing(x)

In [None]:
data["Processed_EmailText"] = data["EmailText"].apply(pre_processing)

In [None]:
data.head(10)

In [None]:
data = data.drop("EmailText",axis=1)

In [None]:
data.head()

In [None]:
# Encode the target as integers: ham -> 1, spam -> 0.
# Series.map yields NaN for values missing from the dict, so running this cell a
# second time (when the column already holds 0/1) would blank out every label.
# NOTE(review): with this encoding ham is label 1, which is the class the
# scoring="f1" grid search further down treats as positive - confirm that is intended.
data["Label"] = data["Label"].map({"ham":1, "spam":0})

In [None]:
data.head()

**COUNT VECTORIZER**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words token counts for the processed text, converted from the sparse
# result of fit_transform to a dense array.
# NOTE(review): the vectorizer is fitted on all rows of `data`, before the
# train/test split further down, so held-out messages contribute to the vocabulary.
count_vec = CountVectorizer()
x_vc = count_vec.fit_transform(data["Processed_EmailText"]).toarray()

In [None]:
x_vc

**TF-IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weighted features for the processed text, converted from the sparse
# result of fit_transform to a dense array.
# NOTE(review): the vectorizer is fitted on all rows of `data`, before the
# TF-IDF train/test split further down, so held-out messages contribute to the
# vocabulary and the IDF weights.
tf_idf = TfidfVectorizer()
x_tf = tf_idf.fit_transform(data["Processed_EmailText"]).toarray()

**TRAIN-TEST SPLIT FOR COUNT-VECTORIZER**

In [None]:
y = data["Label"]

In [None]:
from sklearn.model_selection import train_test_split
# Split the count-vectorizer features: half of the rows are held out
# (test_size=0.5) with a fixed seed, and stratify=y keeps the label ratio the
# same in both halves.
x_train, x_test , y_train, y_test = train_test_split(x_vc,y, test_size=0.5, random_state=45, stratify =y)

**MODEL SELECTION**

In [None]:
# Candidate classifiers, each created with library-default hyper-parameters.
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

svm = SVC()
rfc = RandomForestClassifier()
bc = BaggingClassifier()
abc = AdaBoostClassifier()
dtc = DecisionTreeClassifier()
gbc = GradientBoostingClassifier()
lr = LogisticRegression()
mnb = MultinomialNB()

# Both meta-estimators require `estimators` to be a list of (name, estimator)
# pairs. A bare estimator object is accepted by the constructor but makes
# fit() fail. The same base models as before are kept; further
# (name, estimator) pairs can be appended to either list.
sc = StackingClassifier(estimators=[("rfc", rfc)])
vc = VotingClassifier(estimators=[("svm", svm)])

**GRID-SEARCH CV : HYPER-PARAMETER TUNING**

**SVC**

In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

# Search space for the SVC: 2 kernels x 2 gamma settings x 6 values of C.
tuned_parameters = dict(
    kernel=['rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
    C=[1, 1.5, 2, 5, 10, 100],
)

# 5-fold cross-validated search, scored with F1.
model = GridSearchCV(SVC(), tuned_parameters, scoring="f1", cv=5)

model.fit(x_train, y_train)

In [None]:
model.best_estimator_

In [None]:
# Refit an SVC with fixed settings on the training half.
# NOTE(review): C=5 / kernel='sigmoid' are hard-coded - presumably copied from the
# grid search's best_estimator_ shown above; verify they still match after a re-run.
svm = SVC(C=5, kernel='sigmoid')
svm.fit(x_train,y_train)

In [None]:
ypred_train = svm.predict(x_train)
ypred_test = svm.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
print("TRAIN\n")
print(classification_report(y_train, ypred_train))
print("TEST\n")
print(classification_report(y_test, ypred_test))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(svm, x_test, y_test)

**RANDOM FOREST CLASSIFIER**

In [None]:
# Hyper-parameter grid for the random-forest search.
# NOTE(review): random_state and warm_start are searched here as if they were
# tunable hyper-parameters - confirm that is intended.
parameters = dict(
    criterion=("gini", "entropy", "log_loss"),
    max_depth=(5, 7, 8, 10),
    bootstrap=(True, False),
    random_state=(45, 12, 85, 35),
    warm_start=(True, False),
)

In [None]:
from sklearn.model_selection import GridSearchCV
model1 = GridSearchCV(estimator = rfc , param_grid = parameters, n_jobs=-1)

In [None]:
model1.fit(x_train,y_train)

In [None]:
model1.best_estimator_

In [None]:
rfc = RandomForestClassifier(bootstrap=False, max_depth=10, random_state=45,
                       warm_start=True)
rfc.fit(x_train,y_train)

In [None]:
ypred_train1 = rfc.predict(x_train)
ypred_test1 = rfc.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
print("TRAIN\n")
print(classification_report(y_train, ypred_train1))
print("TEST\n")
print(classification_report(y_test, ypred_test1))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(rfc, x_test, y_test)

**BAGGING CLASSIFIER**

In [None]:
# Hyper-parameter grid for the bagging-classifier search.
# NOTE(review): max_samples is given as integers (5 and 12); presumably these are
# absolute sample counts per base estimator rather than fractions - confirm.
param = dict(
    n_estimators=(7, 5, 2),
    max_samples=(5, 12),
    bootstrap=(True, False),
    warm_start=(True, False),
)

In [None]:
from sklearn.model_selection import GridSearchCV
model2 = GridSearchCV(estimator = bc , param_grid = param, n_jobs=-1)
model2.fit(x_train,y_train)
model2.best_estimator_

In [None]:
bc = BaggingClassifier(max_samples=12, n_estimators=2, warm_start=True)
bc.fit(x_train,y_train)

In [None]:
ypred_train2 = bc.predict(x_train)
ypred_test2 = bc.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
print("TRAIN\n")
print(classification_report(y_train, ypred_train2))
print("TEST\n")
print(classification_report(y_test, ypred_test2))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(bc, x_test, y_test)

**ADABOOST CLASSIFIER**

In [None]:
# Hyper-parameter grid for the AdaBoost search.
param2 = dict(
    n_estimators=(50, 10, 12),
    learning_rate=(1.0, 1.2, 0.3),
    random_state=(45, 52, 20),
)

In [None]:
from sklearn.model_selection import GridSearchCV
model3 = GridSearchCV(estimator = abc , param_grid = param2, n_jobs=-1)
model3.fit(x_train,y_train)
model3.best_estimator_

In [None]:
abc = AdaBoostClassifier(learning_rate=1.2, random_state=45)

In [None]:
abc.fit(x_train,y_train)
ypred_train3 = abc.predict(x_train)
ypred_test3 = abc.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
print("TRAIN\n")
print(classification_report(y_train, ypred_train3))
print("TEST\n")
print(classification_report(y_test, ypred_test3))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(abc, x_test, y_test)

**DECISION TREE CLASSIFIER**

In [None]:
# Hyper-parameter grid for the decision-tree search.
param3 = dict(
    criterion=("gini", "entropy", "log_loss"),
    max_depth=(10, 12, 7),
    random_state=(10, 45, 23),
)

In [None]:
from sklearn.model_selection import GridSearchCV
model4 = GridSearchCV(estimator = dtc , param_grid = param3, n_jobs=-1)
model4.fit(x_train,y_train)
model4.best_estimator_

In [None]:
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=12, random_state=23)

In [None]:
dtc.fit(x_train,y_train)
ypred_train4 = dtc.predict(x_train)
ypred_test4 = dtc.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
print("TRAIN\n")
print(classification_report(y_train, ypred_train4))
print("TEST\n")
print(classification_report(y_test, ypred_test4))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(dtc, x_test, y_test)

**GRADIENT BOOSTING CLASSIFIER**

In [None]:
# Hyper-parameter grid for the gradient-boosting search.
# "deviance" is no longer listed: it was a deprecated alias of "log_loss"
# (deprecated in scikit-learn 1.1, removed in 1.3), so it only added duplicate
# candidates on old versions and failing fits on new ones.
param4 = {
    "loss": ("log_loss", "exponential"),
    "learning_rate": (1.2, 2.0, 0.6),
    "criterion": ("friedman_mse", "squared_error"),
}

In [None]:
from sklearn.model_selection import GridSearchCV
model5 = GridSearchCV(estimator = gbc , param_grid = param4, n_jobs=-1)
model5.fit(x_train,y_train)
model5.best_estimator_

In [None]:
gbc = GradientBoostingClassifier(criterion='squared_error', learning_rate=2.0,
                           loss='exponential')

In [None]:
gbc.fit(x_train,y_train)
ypred_train5 = gbc.predict(x_train)
ypred_test5 = gbc.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
print("TRAIN\n")
print(classification_report(y_train, ypred_train5))
print("TEST\n")
print(classification_report(y_test, ypred_test5))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(gbc, x_test, y_test)

**LOGISTIC REGRESSION**

In [None]:
lr = LogisticRegression()
lr.fit(x_train,y_train)

In [None]:
ypred_train6 = lr.predict(x_train)
ypred_test6 = lr.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
print("TRAIN\n")
print(classification_report(y_train, ypred_train6))
print("TEST\n")
print(classification_report(y_test, ypred_test6))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(lr, x_test, y_test)

**MULTINOMIALNB**

In [None]:
mnb = MultinomialNB()
mnb.fit(x_train,y_train)

In [None]:
ypred_train7 = mnb.predict(x_train)
ypred_test7 = mnb.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
print("TRAIN\n")
print(classification_report(y_train, ypred_train7))
print("TEST\n")
print(classification_report(y_test, ypred_test7))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(mnb, x_test, y_test)

**TRAIN-TEST SPLIT FOR TFIDF-VECTORIZER**

In [None]:
y = data["Label"]

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test , y_train, y_test = train_test_split(x_tf,y, test_size=0.5, random_state=45, stratify =y)

**MODEL SELECTION**

**GRID-SEARCH CV : HYPER-PARAMETER TUNING**

**SVC**

In [None]:
svm = SVC()
svm.fit(x_train,y_train)

In [None]:
ypred_train = svm.predict(x_train)
ypred_test = svm.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
print("TRAIN\n")
print(classification_report(y_train, ypred_train))
print("TEST\n")
print(classification_report(y_test, ypred_test))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(svm, x_test, y_test)

**ADABOOST CLASSIFIER**

In [None]:
abc = AdaBoostClassifier()
abc.fit(x_train,y_train)

In [None]:
ypred_train1 = abc.predict(x_train)
ypred_test1 = abc.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
print("TRAIN\n")
print(classification_report(y_train, ypred_train1))
print("TEST\n")
print(classification_report(y_test, ypred_test1))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(abc, x_test, y_test)

**GRADIENT BOOSTING CLASSIFIER**

In [None]:
gbc = GradientBoostingClassifier()
gbc.fit(x_train,y_train)

In [None]:
ypred_train2 = gbc.predict(x_train)
ypred_test2 = gbc.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
print("TRAIN\n")
print(classification_report(y_train, ypred_train2))
print("TEST\n")
print(classification_report(y_test, ypred_test2))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(gbc, x_test, y_test)

**LOGISTIC REGRESSION**

In [None]:
lr = LogisticRegression()
lr.fit(x_train,y_train)

In [None]:
ypred_train3 = lr.predict(x_train)
ypred_test3 = lr.predict(x_test)

In [None]:
from sklearn.metrics import classification_report

# Report the TF-IDF logistic-regression predictions, which the preceding cell
# stored in ypred_train3 / ypred_test3. The *5 arrays printed here before hold
# gradient-boosting predictions from the count-vectorizer experiment.
print("TRAIN\n")
print(classification_report(y_train, ypred_train3))
print("TEST\n")
print(classification_report(y_test, ypred_test3))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(lr, x_test, y_test)

**MULTINOMIALNB**

In [None]:
mnb = MultinomialNB()
mnb.fit(x_train,y_train)

In [None]:
ypred_train4 = mnb.predict(x_train)
ypred_test4 = mnb.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
print("TRAIN\n")
print(classification_report(y_train, ypred_train4))
print("TEST\n")
print(classification_report(y_test, ypred_test4))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(mnb, x_test, y_test)