# E-MAIL CLASSIFICATION

### Importing Libraries

In [None]:
import pandas as pd
from sklearn import preprocessing
import nltk 
nltk.download('stopwords')                 # download the stopwords from NLTK

import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

from sklearn.linear_model import LogisticRegression  
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.model_selection import train_test_split  
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt            # library for visualization
import seaborn as sns

### Getting our Data

In [None]:
# Load the SMS training set into a DataFrame.
# NOTE(review): the 'unicode_escape' codec also interprets backslash escape
# sequences found in the text - confirm this is the intended encoding.
df = pd.read_csv('../input/email-classification-nlp/SMS_train.csv', encoding='unicode_escape')
df  # display the loaded frame

### Data Preprocessing

In [None]:
# Drop the 'S. No.' column.
df = df.drop(columns=['S. No.'])

# Replace the values of the 'Label' column with integer class codes.
label_encoder = preprocessing.LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'])
df

In [None]:
df.isna().any()  # per column: True if it holds at least one missing value

### Performing steps in NLP

In [None]:
def process_mail(mail):
    """Tokenize, filter and stem the body of a message.

    The text is lower-cased and tokenized with NLTK's ``TweetTokenizer``
    (handles are stripped and long runs of a repeated character are
    shortened), English stop words and punctuation-only tokens are dropped,
    and every remaining token is reduced to its Porter stem.

    Input:
        mail: a string containing the message body
    Output:
        mail_clean: a list of stemmed tokens, in their original order

    """
    stemmer = PorterStemmer()
    # Sets give constant-time membership tests; the stop-word list would
    # otherwise be scanned once per token.
    stopwords_english = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)

    mail_clean = []
    for word in tokenizer.tokenize(mail):
        if word in stopwords_english:  # remove stopwords
            continue
        # Remove tokens made up solely of punctuation (e.g. "!" or "...").
        # `word in string.punctuation` would be a substring test and would
        # let multi-character tokens such as "..." through.
        if all(char in punctuation for char in word):
            continue
        mail_clean.append(stemmer.stem(word))  # stemming word

    return mail_clean

In [None]:
# Replace every raw message with its processed token list. process_mail
# performs, in order:
# 1. Tokenization
# 2. Stop-word and punctuation removal
# 3. Stemming
df['Message_body'] = [process_mail(body) for body in df['Message_body']]
df

### Vectorizing

In [None]:
# Word-level bag-of-words vectorizer capped at 1500 vocabulary terms.
# Lowercasing is switched off here; process_mail already tokenizes with
# preserve_case=False.
cv = CountVectorizer(max_features=1500, analyzer='word', lowercase=False) 

In [None]:
# The vectorizer works on strings, so turn each token list back into one
# space-separated string before building the count matrix.
# NOTE(review): the vocabulary is fitted on all rows, i.e. before the
# train/test split further down - confirm this is acceptable.
df['Message_body'] = df['Message_body'].apply(" ".join)
X = cv.fit_transform(df['Message_body'])  # predictor variable 'X'

In [None]:
df  # display the frame with the re-joined message text

In [None]:
# Response variable 'y'. Kept as a 1-D Series: scikit-learn estimators expect
# a 1-D target, and a single-column DataFrame makes `fit` emit a
# DataConversionWarning.
y = df['Label']
y.head()

### Splitting for Training and Testing

In [None]:
# Hold out 20% of the rows for testing (80:20 split); the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Model

In [None]:
# Logistic regression with a fixed seed and otherwise default settings,
# fitted on the training split.
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

### Making Predictions

In [None]:
y_pred = classifier.predict(X_test)  # predicted class labels for the held-out rows

### Checking Performance (ROC AUC)

In [None]:
# NOTE(review): this is a ROC AUC score computed from hard class predictions,
# not classification accuracy; passing scores from predict_proba instead
# would give the usual threshold-independent AUC.
roc_auc_score(y_test, y_pred)

# The ROC AUC score is about 0.7833. Note that this is not the same as classification accuracy.

### Visualizing the Results

In [None]:
# Confusion matrix of the test predictions: rows are actual labels, columns
# are predicted labels.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Draw the confusion matrix as an annotated heatmap.
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt=".0f", linewidths=0.5, square=True, cmap='Pastel1')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# The value shown comes from roc_auc_score, so label it as ROC AUC rather
# than as accuracy.
all_sample_title = f'ROC AUC Score: {roc_auc_score(y_test, y_pred)}'
plt.title(all_sample_title, size=15)