In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import os

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk import NaiveBayesClassifier, classify

import string
import codecs
import random

nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /Users/prashanth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prashanth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
EMAIL_DIR = os.path.join("archive/")
SPAM_DIR = os.path.join(EMAIL_DIR, "spam")
HAM_DIR = os.path.join(EMAIL_DIR, "ham")

In [5]:
stemmer = LancasterStemmer()

In [6]:
def get_emails_list(file_dir, tag, proportion=1):
    files = os.listdir(file_dir)
    files_length = int(len(files)*proportion)
    files = files[:files_length]
    tag_list = []
    for a_file in files:
        if not a_file.startswith("."):
            with codecs.open(os.path.join(file_dir, a_file), "r", encoding="ISO-8859-1", errors="ignore") as f:
                email = f.read()
        tag_list.append((email, tag))
    return tag_list

In [7]:
spam_list = get_emails_list(SPAM_DIR, "spam", 1)
print(len(spam_list))
print(spam_list[0])

17156
("Subject: what up , , your cam babe\r\nwhat are you looking for ?\r\nif your looking for a companion for friendship , love , a date , or just good ole '\r\nfashioned * * * * * * , then try our brand new site ; it was developed and created\r\nto help anyone find what they ' re looking for . a quick bio form and you ' re\r\non the road to satisfaction in every sense of the word . . . . no matter what\r\nthat may be !\r\ntry it out and youll be amazed .\r\nhave a terrific time this evening\r\ncopy and pa ste the add . ress you see on the line below into your browser to come to the site .\r\nhttp : / / www . meganbang . biz / bld / acc /\r\nno more plz\r\nhttp : / / www . naturalgolden . com / retract /\r\ncounterattack aitken step preemptive shoehorn scaup . electrocardiograph movie honeycomb . monster war brandywine pietism byrne catatonia . encomia lookup intervenor skeleton turn catfish .\r\n", 'spam')


In [8]:
ham_list = get_emails_list(HAM_DIR, "ham", 1)
print(len(ham_list))
print(ham_list[0])

16545
("Subject: ena sales on hpl\r\njust to update you on this project ' s status :\r\nbased on a new report that scott mills ran for me from sitara , i have come up\r\nwith the following counterparties as the ones to which ena is selling gas off\r\nof hpl ' s pipe .\r\naltrade transaction , l . l . c . gulf gas utilities company\r\nbrazoria , city of panther pipeline , inc .\r\ncentral illinois light company praxair , inc .\r\ncentral power and light company reliant energy - entex\r\nces - equistar chemicals , lp reliant energy - hl & p\r\ncorpus christi gas marketing , lp southern union company\r\nd & h gas company , inc . texas utilities fuel company\r\nduke energy field services , inc . txu gas distribution\r\nentex gas marketing company union carbide corporation\r\nequistar chemicals , lp unit gas transmission company inc .\r\nsince i ' m not sure exactly what gets entered into sitara , pat clynes\r\nsuggested that i check with daren farmer to make sure that i ' m not missing\r\n

In [9]:
email_list = spam_list + ham_list
print(len(email_list))

33701


In [10]:
email_df = pd.DataFrame(email_list)

In [11]:
print(email_df.columns)

RangeIndex(start=0, stop=2, step=1)


In [12]:
email_df.rename(columns = {0:'message'}, inplace = True)
email_df.rename(columns = {1:'category'}, inplace = True)

In [13]:
email_df.columns

Index(['message', 'category'], dtype='object')

In [14]:
email_df.head

<bound method NDFrame.head of                                                  message category
0      Subject: what up , , your cam babe\r\nwhat are...     spam
1      Subject: do you know wwhat ?\r\nhello , welcom...     spam
2      Subject: friend , never be in pain again\r\nis...     spam
3      Subject: big range of all types of downloadabl...     spam
4      Subject: software\r\nmicrosoft windows xp prof...     spam
...                                                  ...      ...
33696  Subject: teco update\r\nwe received their redr...      ham
33697  Subject: " project doorstep " target date\r\nt...      ham
33698  Subject: re : a note being sent out under john...      ham
33699  Subject: associate & analyst mid - year 2001 p...      ham
33700  Subject: re : visit to enron\r\nfyi\r\n- - - -...      ham

[33701 rows x 2 columns]>

In [15]:
email_df.loc[email_df['category'] == 'spam', 'category',] = 0
email_df.loc[email_df['category'] == 'ham', 'category',] = 1


In [16]:
X = email_df['message']

Y = email_df['category']

print(X)

print(Y)


0        Subject: what up , , your cam babe\r\nwhat are...
1        Subject: do you know wwhat ?\r\nhello , welcom...
2        Subject: friend , never be in pain again\r\nis...
3        Subject: big range of all types of downloadabl...
4        Subject: software\r\nmicrosoft windows xp prof...
                               ...                        
33696    Subject: teco update\r\nwe received their redr...
33697    Subject: " project doorstep " target date\r\nt...
33698    Subject: re : a note being sent out under john...
33699    Subject: associate & analyst mid - year 2001 p...
33700    Subject: re : visit to enron\r\nfyi\r\n- - - -...
Name: message, Length: 33701, dtype: object
0        0
1        0
2        0
3        0
4        0
        ..
33696    1
33697    1
33698    1
33699    1
33700    1
Name: category, Length: 33701, dtype: object


In [17]:
# Splitting the data into training data & test data

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

print(X.shape)
print(X_train.shape)
print(X_test.shape)

(33701,)
(26960,)
(6741,)


In [18]:
# Feature Extraction

# transform the text data to feature vectors that can be used as input to the Logistic regression

feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase='True')

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

print(X_train)

print(X_train_features)


16333    Subject: re : re : good\r\nthe museum of the f...
26175    Subject: stats\r\nwest power desk\r\nassuming ...
25748    Subject: re : uk rab multiples\r\nvince ,\r\nw...
16620    Subject: largest pornstars collection of downl...
25941    Subject: today is the signing deadline for ful...
                               ...                        
25365    Subject: schedule crawler : hourahead failure\...
25544    Subject: hpl fuel gas buy - back for december ...
11513    Subject: penny flyer showcasing increased cont...
1688     Subject: did you know : america is giving away...
5994     Subject: sometimes less is more . . .\r\nfind ...
Name: message, Length: 26960, dtype: object
  (0, 108101)	0.09075106642955155
  (0, 111626)	0.11114671876086879
  (0, 102980)	0.08416389416151358
  (0, 69597)	0.16499466291247547
  (0, 106106)	0.1735079509612841
  (0, 29473)	0.11327993172314248
  (0, 130093)	0.10154111247295709
  (0, 121759)	0.1271113575295443
  (0, 36413)	0.16015230546445677
  (0, 

In [19]:
# Training the Model

# Logistic Regression

model = LogisticRegression()

# training the Logistic Regression model with the training data
model.fit(X_train_features, Y_train)


LogisticRegression()

In [20]:
# Evaluating the trained model

# prediction on training data

prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9924703264094955


In [21]:
# prediction on test data

prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

print('Accuracy on test data : ', accuracy_on_test_data)


Accuracy on test data :  0.9860554813825841


In [22]:
# Building a Predictive System

test_mail_list = ["Participate in our new lottery!", 
                  "See the minutes from the last meeting attached", 
                  "Investors are coming to our office on Monday", 
                  "Try out this new medicine",
                  """
                     Subject: confidential folder to safely pass information to arthur andersen
                     we have become increasingly concerned about confidential information ( dpr / position info , curves , validations / stress tests , etc ) being passed to arthur andersen for audit purposes over the web to their arthur andersen email addresses . ( necessary now they no longer have access to enron ' s internal email system )
                     please use the folder described below when passing any info ( that you would have concerns about if it was picked up by a third party ) via the shared drive that has been set up for this specific purpose .
                     note : aa should also use the shared drive to pass info back if there are questions , or the data needs updating . we should also consider the sensitivity of audit findings and special presentations if they are being distributed electronically .
                     please pass this note to others in your groups who have the need to pass info back and forth .
                     details on how to access for those who will use this method to pass info :
                     a secured folder has been set up on the " o " drive under corporate called arthur _ andersen ( o : \ corporate \ arthur _ anderson ) . please post all confidential files in this folder rather than emailing the files to their company email address . if you need access to this folder , submit an erequest through the it central site : http : / / itcentral . enron . com / data / services / securityrequests / . arthur andersen will be able to retrieve these files for review with their terminal server access at the three allen center location .
                     please contact vanessa schulte if you have any problems or questions
                     beth apollo
                  """,
                  """
                     Subject: yukos oil
                     dear friend ,
                     i am mr olsom berghart a personal treasurer to mikhail khodorkovsky the richest man in russia and owner of the following companies : chairman ceo : yukos oil ( russian most largest oil company ) chairman ceo : menatep sbp bank ( a well reputable financial institution with its branches all over the world )
                     source of funds :
                     i have a profiling amount in an excess of us $ 100 , 500 , 000 which i seek your partnership in accommodating for me . you will be rewarded with 4 % of the total sum for your partnership . can you be my partner on this ?
                     introduction of my self
                     as a personal consultant to him , authority was handed over to me in transferring money of an american oil merchant for his last oil deal with my boss mikhail khodorkovsky . already the funds have left the shore of russia to a european private bank where
                     the final crediting is expected to be carried out . while i was on the process , my boss got arrested for his involvement in politics by financing the leading and opposing political parties ( the union of right forces , led by boris nemtsov , and yabloko , a liberal / social democratic party led by gregor yavlinsky ) which poses treat to president vladimir putin second tenure as russian president . you can catch more of the story on the following website :
                     your role :
                     all i need from you is to stand as the beneficiary of the above quoted sum and i will re - profile the funds with your name , which will enable the european bank transfer the sum to you . i have decided to use this sum to relocate to your country and never to
                     be connected to any of mikhail khodorkovsky conglomerates . the transaction has to be concluded before my boss is out from jail . as soon as i confirm your readiness to conclude the transaction with me , i will provide you with the details .
                     thank you very much
                     regards
                     olsom berghart ( mr )
                     mail sent from webmail service at php - nuke powered site
                     - http : / / yoursite . com
                  """
                 ]

for input_mail in test_mail_list:
    input_mail=[input_mail]
    # convert text to feature vectors
    input_data_features = feature_extraction.transform(input_mail)

    # making prediction

    prediction = model.predict(input_data_features)
    print(int(prediction),end=' ')


    if (prediction[0]==1):
      print('Ham mail')

    else:
      print('Spam mail')


0 Spam mail
1 Ham mail
1 Ham mail
0 Spam mail
1 Ham mail
0 Spam mail


In [23]:
import string
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')

def sentiment_analyse(sentiment_text):
    score = SentimentIntensityAnalyzer().polarity_scores(sentiment_text)
    if score['neg'] > score['pos']:
        print("Negative Sentiment",end=' ')
    elif score['neg'] < score['pos']:
        print("Positive Sentiment",end=' ')
    else:
        print("Neutral Sentiment",end=' ')


for input_mail in test_mail_list:
    text = input_mail
    lower_case = text.lower()
    cleaned_text = lower_case.translate(str.maketrans('', '', string.punctuation))
    
    # Using word_tokenize because it's faster than split()
    tokenized_words = word_tokenize(cleaned_text, "english")

    # Removing Stop Words
    final_words = []
    for word in tokenized_words:
        if word not in stopwords.words('english'):
            final_words.append(word)

    # Lemmatization - From plural to single + Base form of a word (example better-> good)
    lemma_words = []
    for word in final_words:
        word = WordNetLemmatizer().lemmatize(word)
        lemma_words.append(word)

    emotion_list = []
    with open('emotions.txt', 'r') as file:
        for line in file:
            clear_line = line.replace("\n", '').replace(",", '').replace("'", '').strip()
            word, emotion = clear_line.split(': ')

            if word in lemma_words:
                emotion_list.append(emotion)

    if not emotion_list:
        print("Neutral Sentiment")
    else:
        sentiment_analyse(cleaned_text)
        print(emotion_list)



[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/prashanth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/prashanth/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/prashanth/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Neutral Sentiment
Neutral Sentiment ['attached']
Neutral Sentiment
Neutral Sentiment
Positive Sentiment ['attracted']
Positive Sentiment ['attached']


In [24]:
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')

for input_mail in test_mail_list:
    text = input_mail
    lower_case = text.lower()
    cleaned_text = lower_case.translate(str.maketrans('', '', string.punctuation))
    
    # Using word_tokenize because it's faster than split()
    tokenized_words = word_tokenize(cleaned_text, "english")

    # Removing Stop Words
    final_words = []
    for word in tokenized_words:
        final_words.append(word)

    # Lemmatization - From plural to single + Base form of a word (example better-> good)
    lemma_words = []
    for word in final_words:
        word = WordNetLemmatizer().lemmatize(word)
        lemma_words.append(word)

    action_list = []
    with open('action.txt', 'r') as file:
        for line in file:
            clear_line = line.replace("\n", '').replace(",", '').replace("'", '').strip()
            word, action = clear_line.split(': ')
            word, action = word.lower(), action.capitalize()
            if word in lemma_words:
                if action not in action_list:
                    action_list.append(action)

    if not action_list:
        print("Neutral Action")
    else:
        print(action_list)



['Neutral']
['Forward']
Neutral Action
Neutral Action
['Reply', 'Forward']
['Reply']


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/prashanth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/prashanth/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/prashanth/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
