In [61]:
import pandas as pd
import numpy as np

In [62]:
df = pd.read_csv('chapter_4/enron_emails_clean.csv')
df.shape

(2090, 6)

In [63]:
df.head()

Unnamed: 0,Message-ID,From,To,Date,content,clean_content
0,<8345058.1075840404046.JavaMail.evans@thyme>,('advdfeedback@investools.com'),('advdfeedback@investools.com'),2002-01-29 23:20:55,INVESTools Advisory\nA Free Digest of Trusted ...,investools advisory free digest trusted invest...
1,<1512159.1075863666797.JavaMail.evans@thyme>,('richard.sanders@enron.com'),('richard.sanders@enron.com'),2000-09-20 19:07:00,----- Forwarded by Richard B Sanders/HOU/ECT o...,forwarded richard b sanders hou ect pm justin ...
2,<26118676.1075862176383.JavaMail.evans@thyme>,('m..love@enron.com'),('m..love@enron.com'),2001-10-30 16:15:17,hey you are not wearing your target purple shi...,hey wearing target purple shirt today mine wan...
3,<10369289.1075860831062.JavaMail.evans@thyme>,('leslie.milosevich@kp.org'),('leslie.milosevich@kp.org'),2002-01-30 17:54:18,Leslie Milosevich\n1042 Santa Clara Avenue\nAl...,leslie milosevich santa clara avenue alameda c...
4,<26728895.1075860815046.JavaMail.evans@thyme>,('rtwait@graphicaljazz.com'),('rtwait@graphicaljazz.com'),2002-01-30 19:36:01,"Rini Twait\n1010 E 5th Ave\nLongmont, CO 80501...",rini twait e th ave longmont co rtwait graphic...


# Word search with dataframes

#### Let's find all emails that mention specific words, such as "sell enron stock"

In [64]:
mask = df.clean_content.str.contains('sell enron stock', na =False)  #na=False to ingnore all rows containing missing values
print(df[mask])

                                       Message-ID                        From  \
154  <6336501.1075841154311.JavaMail.evans@thyme>  ('sarah.palmer@enron.com')   

                             To                 Date  \
154  ('sarah.palmer@enron.com')  2002-02-01 14:53:35   

                                               content  \
154  \nJoint Venture: A 1997 Enron Meeting Belies O...   

                                         clean_content  
154  joint venture enron meeting belies officers cl...  


In [65]:
(df[mask]).shape

(1, 6)

#### Using list of terms

In [66]:
searchList = ['enron stock', 'sell stock', 'stock bonus', 'sell enron stock']

filtered_mails = df.loc[df['clean_content'].str.contains('|'.join(searchList), na =False)] 

In [67]:
print(filtered_mails.shape)

(314, 6)


#### create a flag from this which you can use as a feature in a machine learning model

In [68]:
df['flag'] =  np.where((df.clean_content.str.contains('|'.join(searchList))==True),1,0)

In [69]:
df['flag'].value_counts()

0    1776
1     314
Name: flag, dtype: int64

You have now managed to search for a list of strings in several lines of text data. <br>
These skills come in handy when you want to flag certain words based on what you discovered in your topic model, or when you know beforehand what you want to search for. In the next cells you're going to learn how to clean text data and to create your own topic model to further look for indications of fraud in your text data.

## Cleaning Text Data:
1. `Tokenization`: split the text-sentences, sentences-Words, remove Punctuations(strings into list of Substrings)
2. Remove `stopwords`
3. Lemmatize words: `Passive-Active` and `(Past,Future) - Present`
4. stem the words :  reduce them to their `root` Form


In [70]:
from nltk.corpus import stopwords
import string

### 1. Removing stopwords

In [71]:
# Define stopwords to exclude
stop = set(stopwords.words())
stop.update(("to","cc","subject","http","from","sent", "ect", "u", "fwd", "www", "com"))

In [72]:
# Define punctuations to exclude and lemmatizer
exclude = set(string.punctuation)

In [73]:
# import nltk
# nltk.download('stopwords')

In [74]:
from nltk.stem.wordnet import WordNetLemmatizer
lemma =  WordNetLemmatizer()

In [75]:

def clean(text, stop):
    text = text.rstrip()
    # Remove stopwords
    stop_free = ' '.join([word for word in text.lower().split() if ((  word not in stop) and not(word.isdigit())) ])
    
    # Remove punctuations
    punc_free = ''.join(word for word in stop_free if  (word not in exclude))
    
    normalized = ' '.join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized
    

In [76]:
clean_text = []
clean(df['clean_content'][0],stop)


'investools advisory free digest trusted investment advice unsubscribe free newsletter please see issue fried sell stock gain month km rowe january index confirms bull market aloy small cap advisor earns lbix compounding return pine tree pcl undervalued high yield bank put customer first aso word sponsor top wall street watcher zacks year year gain moving best brightest wall street big money machine earned zacks five year average annual gain start outperforming long term get zacks latest stock buylist free day trial investools go zaks mtxtu zakstb investools advisory john brobst investools fried sell stock lock month km david fried know stock undervalued company management buy back share open market latest triumph pocketing impressive gain three short month selling four buyback stock include gain auto retailer automation incorporated gain digital phone system purveyor inter tel intl fried recent move buy kmart corporation km beleaguered discount retailer declared bankruptcy think k mar

In [83]:
for text in df['clean_content']:
    (clean(text,stop))

print(clean_text)

AttributeError: 'float' object has no attribute 'rstrip'

In [84]:
# import nltk
# nltk.download('wordnet')
len(clean_text)

354

In [85]:
#!pip install gensim
from gensim import corpora

## Create dictionary and corpus

In [86]:
# Import the packages
import gensim
from gensim import corpora

# Define the dictionary
dictionary = corpora.Dictionary(clean_text)

# Define the corpus 
corpus = [dictionary.doc2bow(text) for text in clean_text]

# Print corpus and dictionary
print(dictionary)
#print(corpus)

Dictionary(15445 unique tokens: ['account', 'accurate', 'acquiring', 'acre', 'address']...)


In [87]:
# Define the LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=5)

# Save the topics and top 5 words
topics = ldamodel.print_topics(num_words = 5)


In [88]:
# Print the results
for topic in topics:
    print(topic)

(0, '0.014*"enron" + 0.010*"company" + 0.006*"gas" + 0.005*"energy" + 0.005*"new"')
(1, '0.025*"enron" + 0.011*"company" + 0.006*"stock" + 0.005*"power" + 0.005*"message"')
(2, '0.032*"enron" + 0.013*"company" + 0.013*"said" + 0.008*"mr" + 0.005*"stock"')
(3, '0.050*"enron" + 0.030*"employee" + 0.025*"company" + 0.024*"million" + 0.019*"energy"')
(4, '0.025*"enron" + 0.019*"stock" + 0.012*"option" + 0.011*"company" + 0.009*"dynegy"')


5