# Loading Data

In [1]:
import pandas as pd
from textblob import TextBlob

In [2]:
# Read the data
df = pd.read_csv("emails.csv")
# Remove duplicates
df.drop_duplicates(["text"], ignore_index = True, inplace = True)
# Review
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5695 entries, 0 to 5694
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5695 non-null   object
 1   spam    5695 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.1+ KB


# Cleaning the Text and Writing the Polarity and Subjectivity of Each Text

In [3]:
from nltk.corpus import stopwords
import string

In [4]:
# Set the preferred list of stopwords
sw = stopwords.words("english") + ["subject", "re"]

In [5]:
# Initiate an empty dataframe for the final data with sentiment-related information
sendf = pd.DataFrame(columns = list(df.columns)[: -1] + ["polarity", "subjectivity"])

In [6]:
# Initilize empty lists for the clean email texts, polarities and subjectivities
texts = []
pols = []
subjs = []
for i in range(len(df)):
    mainblob = TextBlob(df["text"][i])
    # Translation table for removing punctuations
    t = str.maketrans("", "", string.punctuation)
    finalText = ""
    for w in mainblob.words:
        w = str(w).translate(t)
        if w.isalpha() and not w in sw:
            finalText += w.lower() + " "
    finalText = finalText[: -1]
    # Store the clean text in the dataset
    texts.append(finalText)
    # The sentiment analysis blob
    sentimentblob = TextBlob(finalText)
    # Get the polarity and the subjectivity and push to the dataset
    pol = sentimentblob.sentiment.polarity
    subj = sentimentblob.sentiment.subjectivity
    pols.append(pol)
    subjs.append(subj)
# Add the clean texts, polarities and subjectivities to the dataset
sendf["text"] = texts
sendf["polarity"] = pols
sendf["subjectivity"] = subjs

In [7]:
sendf.head()

Unnamed: 0,text,polarity,subjectivity
0,subject naturally irresistible corporate ident...,0.305639,0.549373
1,subject stock trading gunslinger fanny merrill...,0.106746,0.562698
2,subject unbelievable new homes made easy im wa...,0.040229,0.480581
3,subject color printing special request additio...,0.163492,0.468254
4,subject money get software cds software compat...,0.433333,0.395833
