# Medium Daily Digest Summarizer

Gets all of your daily digest emails from medium and summarizing each article within them! Have all of your articles summarized while you fix youself a cup of coffee :^)

In [1]:
import imaplib
import email
from newspaper import Article, ArticleException, news_pool

import pandas as pd
import numpy as np

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
import heapq

from datetime import date

### Scraping emails

In [2]:
#user should be your email address in the form "----@gmail.com"
user = 'YOUR EMAIL'

#password is ideally an app password to maintain account security
#instructions on how to get your gmail app password: https://support.google.com/accounts/answer/185833?hl=en
password = 'YOUR PASSWORD'

#imap url for gmail
imap_url = 'imap.gmail.com'

In [3]:
def get_body(msg):
    
    
    #if nested, apply function until you get to the content
    if msg.is_multipart():
        return get_body(msg.get_payload(0))
    
    #return content
    else:
        return msg.get_payload(None, True)

In [4]:
def search(key, value, con):
    
    #search for key value pairs matching FROM noreply@medium.com
    result, data = con.search(None, key, '"{}"'.format(value))
    
    return data

In [5]:
def get_emails(result_bytes):
    
    #get emails under a particular label
    #in this case, the inbox
    #stored inside a list
    msgs = []
    
    ####only retrieving the latest email
    num = result_bytes[0].split()[-1]

    typ, data = con.fetch(num, '(RFC822)')
    msgs.append(data)
    
    ####if youd like to retrieve all emails, uncomment the following:
    
    #for num in result_bytes[0].split():
        #typ, data = con.fetch(num, '(RFC822)')
        #msgs.append(data)
    
    return msgs

In [6]:
#logging in with credentials and accessing emails in the inbox
#note: this will include ALL emails in the inbox, not just those seen in the section labeled "Primary"
#(you might have more emails in your inbox than you think)
con = imaplib.IMAP4_SSL(imap_url, 993)
con.login(user, password)
con.select('Inbox')

('OK', [b'453'])

In [7]:
#getting emails from medium
msgs =  get_emails(search('FROM', 'noreply@medium.com', con))

In [8]:
#extracting information contained between parentheses
#aka article links
p1 = []
for msg in msgs[::-1]:
    for sent in msg:
        p1.append(re.findall('\(([^)]+)', str(sent)))

### Getting article links

In [9]:
#filtering out unwanted links
check = []

for lst in p1:
    
    for string in lst:
        match_lst = re.findall('.*(?:\/.*){4}', str(string))
        
        for val in match_lst:
            
            if (len(val) > 1) and ('https://medium.com/' in val):
                check.append(val)

In [10]:
links = []

for val in check:
     
    #remove everything after '?'
    #remove '=\\r\\n' from links       
    val = val.replace ('=\\r\\n', '')
    val = re.sub('[?].*','', val)
    
    #removing special cases
    if (len(val) > 45) and ('E2=80=A6' not in val) and ('api/requests/' not in val):
    
        #link stored in a new list
        links.append(val)

### Scraping articles

In [11]:
title = []
author = []
published = []
body = []

#downloading articles
#multi-threading to be nicer to medium
articles = [Article(link, fetch_images = False) for link in links]
news_pool.set(articles, threads_per_source = 6)
news_pool.join()

#getting title, author, publish date, and text body for each article
for i in range(0, len(articles)):
    
    try:
        articles[i].parse()
    
    except ArticleException:
        pass
    
    #appending each to the corresponding list
    title.append(articles[i].title)
    author.append(articles[i].authors)
    published.append(articles[i].publish_date)
    body.append(articles[i].text)

In [12]:
#putting together the dataframe
df = pd.DataFrame({'Link': links, 'Author':author, 'Title':title, 'Published':published, 'Body':body})

### Cleaning text

In [13]:
def body_wash(string, punct = False):
    
    #removing line breaks, digits, and empty space
    string = string.replace('\n\n', ' ')
    string = re.sub(r'\[[0-9]*\]', ' ', string)
    string = re.sub(r'\s+', ' ', string)
    
    if punct:
        
        #removes punctuation
        string = re.sub(r'[^a-zA-Z]', ' ', string)
        
        return string
    
    else:

        return string

In [14]:
#cleaning the body of test
df['Body'] = df['Body'].apply(body_wash)

In [15]:
sent_lst = []

#each article represented as lists of its sentences
for body in df['Body']:
    sent_lst.append(sent_tokenize(body))

In [16]:
#body of text cleaned, with puntuation removed
formatted = list(df['Body'].apply(body_wash, punct = True))

### Summarizing articles

In [17]:
stop = stopwords.words('english')

freqs = []

#getting word frequencies for each article
for txt in formatted:
    
    #every article will get its own dictionary, containing the articles word frequencies
    word_freq = {}
    
    for word in word_tokenize(txt):
        
        if word not in stop:
            
            #adds word to the dictionary if doesnt already exists
            if word not in word_freq.keys():
                word_freq[word] = 1
                
            #otherwise just adds it to the existing count
            else:
                word_freq[word] += 1
                
    #adding each dictionary to the list           
    freqs.append(word_freq)

In [18]:
#getting the relative frequency of each word
for word_freq in freqs:
    
    #max word frequency
    max_freq = max(word_freq.values())

    for word in word_freq.keys():
        
        #dividing each word frequency by the max frequency
        word_freq[word] = (word_freq[word]/max_freq)

In [19]:
scores = []

#getting each sentences score, according to its word frequencies
for i, lst in enumerate(sent_lst):
    
    sent_scores = {}
    
    #looping through every sentence in the article
    for sent in lst:
        
        #looping through every word in the sentence
        for word in word_tokenize(sent.lower()):
            
            #if the word is a key in the word frequency dictionary corresponding to its article
            if word in freqs[i].keys():
                
                #less than 30 words in the sentence
                if len(sent.split(' ')) < 30:
                    
                    #if the sentence isnt already scored
                    if sent not in sent_scores.keys():
                        sent_scores[sent] = freqs[i][word]
                    
                    #if its already there, add the value
                    else:
                        sent_scores[sent] += freqs[i][word]
                        
    scores.append(sent_scores)
                    
                    

In [20]:
#empty list holding every summary
sums = []

#looping through each article
for sent_score in scores:
    
    #getting the 7 highest scoring sentences for each article
    summary_sent = heapq.nlargest(7, sent_score, key = sent_score.get)
    
    #joining each summary into a single string
    summary = ' '.join(summary_sent)
    
    #appending the summary
    sums.append(summary)

In [21]:
df['Summary'] = sums

#### Full Article

In [28]:
df.iloc[5,2]

'I Drank A Gallon Of Water A Day For Better Skin — & Here’s What Happened'

In [22]:
df.iloc[5,4]

'I Drank A Gallon Of Water A Day For Better Skin — & Here’s What Happened Refinery29 UK Follow Aug 11 · 4 min read By Maria Del Russo PHOTOGRAPHED BY TAYLER SMITH. I’m a fairly healthy person — I do what I can to feel good, while still enjoying my life. I wake up at the crack of dawn for yoga and eat a fairly balanced diet, but I’ll also snag that last slice of pizza if you don’t get to it first. But one thing I’ve been told over and over again is that I don’t drink enough water. During my checkups, my doctors would tell me the horrible migraines that left me incapacitated for days might have to do with dehydration. My derm said that my acne would likely improve if I drank more water. And every aesthetician I’ve ever been to would tsk-tsk when I told them how little water I imbibed on a daily basis. Water, it seemed, was the answer to all of my problems. You’ve likely heard similar advice. Just do a Google search for “water for skin,” and you’ll find thousands of articles on the subjec

#### Summarized Article

In [23]:
df.iloc[5,5]

'There were some days when I’d get into bed, realise that I still had some water left in my jug, and make a silent deal with whoever would listen. I’d drink a gallon of water a day for 30 days to see what it would do for my face. The next day, while out shopping with friends (water gallon in tow), I hit a wall — a tiled bathroom wall, that is. I didn’t feel the need to snack as often, my sugar cravings practically disappeared, and I didn’t get a single migraine while drinking all of this water. My derm said that my acne would likely improve if I drank more water. And every aesthetician I’ve ever been to would tsk-tsk when I told them how little water I imbibed on a daily basis. While the water didn’t make my face feel petal-soft, I did see a decrease in the number of zits dotting my chin and cheeks.'

In [24]:
df

Unnamed: 0,Link,Author,Title,Published,Body,Summary
0,https://medium.com/@salvadoraleguas/how-to-use...,[Salvador Aleguas],How to use the easiest GUI of your life in Python,2020-07-29 02:16:37.255000+00:00,Installation I love using Python’s package man...,# Adding an argument that can be called by -a ...
1,https://medium.com/@romanorac/are-you-still-us...,[Roman Orac],Are you still using Pandas for big data?,2020-08-21 05:22:48.108000+00:00,Are you still using Pandas for big data? Panda...,%%time import glob df_list = for filename in g...
2,https://medium.com/@ngwaifoong92/create-custom...,[Ng Wai Foong],Create Custom Word Clouds in Python,2020-03-19 17:10:57.804000+00:00,"2. Basic Usage First and foremost, let’s impor...",mask — input an image to be used as a mask Onc...
3,https://medium.com/@benaikumar2/types-of-cross...,[],Types of cross-validation in machine learning,2020-08-03 13:49:45.798000+00:00,Types of cross-validation in machine learning ...,Leave-One-Out Cross-Validation (LOOCV) In this...
4,https://medium.com/@venkateshprabhu/linear-reg...,[Venkatesh Prabhu],Linear Regression in Python,2019-01-12 17:57:13.388000+00:00,Artwork by Igor Kozak How great it would be if...,Cost Function Cost Function helps us in determ...
5,https://medium.com/@refinery29/i-drank-a-gallo...,[],I Drank A Gallon Of Water A Day For Better Ski...,2020-08-11 15:31:01.225000+00:00,I Drank A Gallon Of Water A Day For Better Ski...,"There were some days when I’d get into bed, re..."
6,https://medium.com/@jo879344/detecting-bad-cus...,[Jonathan Oheix],Detecting bad customer reviews with NLP,2018-12-18 14:10:06.668000+00:00,Introduction Sentiment analysis is part of the...,Highest positive sentiment reviews The most po...
7,https://medium.com/@gavinnyr30/the-chilling-ch...,[Carter Covington],The Chilling Chinese Restaurant Cover-Up,2020-08-19 05:11:15.480000+00:00,The Chilling Chinese Restaurant Cover-Up The a...,"Immediately, officers went to the restaurant a..."
8,https://medium.com/@MarcGuberti/the-3-billion-...,[Marc Guberti],The $3 Billion Portfolio With Only 2 Stocks In It,2020-08-18 18:32:06.045000+00:00,This guy could care less about diversifying yo...,You don’t have to diversify your portfolio pic...
9,https://medium.com/@andrewahutch/mid-30s-vs-mi...,[Andrew Hutchinson],Mid-30s vs. Mid-20s Millennial Women,2020-07-21 16:16:01.732000+00:00,By Slackjaw Yapjaw is Medium's #1 newsletter f...,By Slackjaw Yapjaw is Medium's #1 newsletter f...


### Saving to a csv

In [25]:
#including todays date
today = date.today().strftime("%b-%d-%Y")

In [26]:
#DD: Daily Digest
df.to_csv('DD_'+ today +'.csv')