# Medium Daily Digest Summarizer

Gets all of your daily digest emails from medium and summarizing each article within them! Have all of your articles summarized while you fix youself a cup of coffee :^)

In [134]:
import imaplib
import email
import smtplib

import sys

import pandas as pd
import numpy as np
import random

import requests
from newspaper import Article, ArticleException, news_pool

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

import heapq

import re
from datetime import date

### Scraping emails

In [2]:
#user should be your email address in the form "----@gmail.com"
user = 'david.eric.lopez@gmail.com'

#password is ideally an app password to maintain account security
#instructions: https://support.google.com/accounts/answer/185833?hl=en
password = 'vtbx sthy zzsm yodf'

#imap url for gmail
imap_url = 'imap.gmail.com'

In [3]:
def get_body(msg):
    
    
    #if nested, apply function until you get to the content
    if msg.is_multipart():
        return get_body(msg.get_payload(0))
    
    #return content
    else:
        return msg.get_payload(None, True)

In [4]:
def search(key, value, con):
    
    #search for key value pairs
    result, data = con.search(None, key, '"{}"'.format(value))
    
    return data

In [5]:
def get_emails(result_bytes):
    
    #get list of emails under a particular label
    #in this case, the inbox
    #stored inside a list
    msgs = []
    
    for num in result_bytes[0].split():
        
        typ, data = con.fetch(num, '(RFC822)')
        msgs.append(data)
    
    return msgs

In [6]:
#logging in with credentials and accessing emails in the inbox
#note: this will include ALL emails in the inbox, not just those seen in the section labeled "Primary"
#(you might have more emails in your inbox than you think)
con = imaplib.IMAP4_SSL(imap_url, 993)
con.login(user, password)
con.select('Inbox')

('OK', [b'531'])

In [7]:
#getting emails from medium
msgs =  get_emails(search('FROM', 'noreply@medium.com', con))

In [8]:
#extracting information contained between parentheses
#aka article links
p1 = []
for msg in msgs[::-1]:
    for sent in msg:
        p1.append(re.findall('\(([^)]+)', str(sent)))

### Getting article links

In [9]:
#filtering out unwanted links
check = []

for lst in p1:
    
    for string in lst:
        match_lst = re.findall('.*(?:\/.*){4}', str(string))
        
        for val in match_lst:
            
            if (len(val) > 1) and ('https://medium.com/' in val):
                check.append(val)

In [10]:
links = []

for val in check:
     
    #remove everything after '?'
    #remove '=\\r\\n' from links       
    val = val.replace ('=\\r\\n', '')
    val = re.sub('[?].*','', val)
    
    #removing special cases
    if (len(val) > 45) and ('E2=80=A6' not in val) and ('api/requests/' not in val):
    
        #link stored in a new list
        links.append(val)

In [32]:
articles = [Article(link, fetch_images = False) for link in links]

news_pool.set(articles, threads_per_source = 6)

news_pool.join()

for i in range(0, len(articles)):
    articles[i].parse()

In [49]:
articles[1].parse()
articles[1].title

'K-Means clustering from Scratch'

### Scraping articles

In [51]:
title = []
author = []
published = []
body = []

#downloading articles
#multi-threading to be nicer to medium
articles = [Article(link, fetch_images = False) for link in links]
news_pool.set(articles, threads_per_source = 6)
news_pool.join()

#getting title, author, publish date, and text body for each article
for i in range(0, len(articles)):
    
    try:
        articles[i].parse()
    
    except ArticleException:
        pass
    
    #appending each to the corresponding list
    title.append(articles[i].title)
    author.append(articles[i].authors)
    published.append(articles[i].publish_date)
    body.append(articles[i].text)

In [52]:
#putting together the dataframe
df = pd.DataFrame({'Link': links, 'Author':author, 'Title':title, 'Published':published, 'Body':body})

### Cleaning text

In [53]:
def body_wash(string, punct = False):
    
    #removing line breaks, digits, and empty space
    string = string.replace('\n\n', ' ')
    string = re.sub(r'\[[0-9]*\]', ' ', string)
    string = re.sub(r'\s+', ' ', string)
    
    if punct:
        
        #removes punctuation
        string = re.sub(r'[^a-zA-Z]', ' ', string)
        
        return string
    
    else:

        return string

In [57]:
#cleaning the body of test
df['Body'] = df['Body'].apply(body_wash)

In [58]:
sent_lst = []

#each article represented as lists of its sentences
for body in df['Body']:
    sent_lst.append(sent_tokenize(body))

In [59]:
#body of text cleaned, with puntuation removed
formatted = list(df['Body'].apply(body_wash, punct = True))

### Summarizing articles

In [60]:
stop = stopwords.words('english')

freqs = []

#getting word frequencies for each article
for txt in formatted:
    
    #every article will get its own dictionary, containing the articles word frequencies
    word_freq = {}
    
    for word in word_tokenize(txt):
        
        if word not in stop:
            
            #adds word to the dictionary if doesnt already exists
            if word not in word_freq.keys():
                word_freq[word] = 1
                
            #otherwise just adds it to the existing count
            else:
                word_freq[word] += 1
                
    #adding each dictionary to the list           
    freqs.append(word_freq)

In [61]:
#getting the relative frequency of each word
for word_freq in freqs:
    
    #max word frequency
    max_freq = max(word_freq.values())

    for word in word_freq.keys():
        
        #dividing each word frequency by the max frequency
        word_freq[word] = (word_freq[word]/max_freq)

In [62]:
scores = []

#getting each sentences score, according to its word frequencies
for i, lst in enumerate(sent_lst):
    
    sent_scores = {}
    
    #looping through every sentence in the article
    for sent in lst:
        
        #looping through every word in the sentence
        for word in word_tokenize(sent.lower()):
            
            #if the word is a key in the word frequency dictionary corresponding to its article
            if word in freqs[i].keys():
                
                #less than 30 words in the sentence
                if len(sent.split(' ')) < 30:
                    
                    #if the sentence isnt already scored
                    if sent not in sent_scores.keys():
                        sent_scores[sent] = freqs[i][word]
                    
                    #if its already there, add the value
                    else:
                        sent_scores[sent] += freqs[i][word]
                        
    scores.append(sent_scores)
                    
                    

In [226]:
#empty list holding every summary
sums = []

#looping through each article
for sent_score in scores:
    
    #getting the 7 highest scoring sentences for each article
    summary_sent = heapq.nlargest(7, sent_score, key = sent_score.get)
    
    #joining each summary into a single string
    summary = ' '.join(summary_sent)
    
    #appending the summary
    sums.append(summary)

In [227]:
df['Summary'] = sums

#### Full Article

In [188]:
df.iloc[5,4]

'Microsoft has one of the largest C/C++ codebases in the world. All of its core products from Windows and Office to the Azure cloud run on it. Unsurprisingly, since C++ is not a memory-safe language, a lot of memory bugs popup in their codebase, and a lot of time has to be spent fixing them. Last year, Microsoft began looking at alternative programming languages that could help fix their memory safety issues. As a result of these pursuits, Microsoft has begun experimenting, and in some cases integrating, Rust into their codebase. Rust is a relatively new programming language that promises the same low-level performance of C and C++ with a feature set expected from a modern programming language. Microsoft thinks Rust has potential, and here is how they are integrating it into their products. Rewriting Windows in Rust Ok, they aren’t rewriting all of Windows in Rust just yet; but, they are interested in seeing how Rust will fit into its ecosystem. Microsoft determined that 70% of securit

#### Summarized Article

In [233]:
df.iloc[5,5]

'Rust is a relatively new programming language that promises the same low-level performance of C and C++ with a feature set expected from a modern programming language. A new programming language A little while ago, Microsoft had investigated creating their programming language inspired by Rust. The engineers were also impressed by Rust’s compiler, which they claim would’ve caught security flaws found in another one of their projects that Go’s compiler failed to find. Unsurprisingly, since C++ is not a memory-safe language, a lot of memory bugs popup in their codebase, and a lot of time has to be spent fixing them. Last year, Microsoft began looking at alternative programming languages that could help fix their memory safety issues. They do claim that the language is already “completely memory safe.” However, Microsoft remains in favor of using Rust instead. This language, dubbed Project Verona, is aimed to address memory-related bugs in systems programming.'

In [190]:
df

Unnamed: 0,Link,Author,Title,Published,Body,Summary
0,https://medium.com/@actsusanli/multi-label-tex...,[Susan Li],Multi Label Text Classification with Scikit-Learn,2018-04-23 13:09:10.199000+00:00,Multi-class classification means a classificat...,print('Number of missing comments in comment t...
1,https://medium.com/@theobotella/k-means-cluste...,[Theo Botella],K-Means clustering from Scratch,2020-05-03 23:01:47.282000+00:00,Understand one of the most powerful clustering...,"Example of a good clustering Here, clusters ar..."
2,https://medium.com/@seanjkernan/why-is-linkedi...,[Sean Kernan],Why Is LinkedIn Such a Cringefest?,2020-08-20 17:51:55.273000+00:00,Why Do Such Trash Posts Go Viral? There’s a co...,This went super-viral: Source: author via link...
3,https://medium.com/@akshayakn95/logistic-regre...,[Akshay Patel],Logistic Regression,2020-06-09 15:50:38.676000+00:00,Training and Cost Function Now we know how a l...,Image by author The cost function over the who...
4,https://medium.com/@paresh.khandelwal26/interp...,[Paresh Khandelwal],Interpretation of Box Plots,2020-06-18 07:04:17.050000+00:00,A Box plot diagram What is Boxplot? We have re...,Thumb rule to calculate outliers Outliers are ...
5,https://medium.com/@tinocaer/how-microsoft-is-...,[Tino Caer],How Microsoft Is Adopting Rust,2020-08-06 20:00:17.627000+00:00,Microsoft has one of the largest C/C++ codebas...,Rust is a relatively new programming language ...
6,https://medium.com/@andrewahutch/90s-kids-vs-k...,[Andrew Hutchinson],90s Kids vs. Kids Today,2020-08-17 16:56:01.501000+00:00,By Slackjaw Yapjaw is Medium's #1 newsletter f...,By Slackjaw Yapjaw is Medium's #1 newsletter f...
7,https://medium.com/@theonlyblackguy/why-im-no-...,[The Only Black Guy In The Office],I’m No Longer Using My White Voice at Work,2020-08-11 12:58:59.193000+00:00,I’m No Longer Using My White Voice at Work Sor...,"Lately, though, I’ve been feeling like it’s ti..."
8,https://medium.com/@davegershgorn/a-year-after...,[Dave Gershgorn],"A Year After an HR Crisis, Microsoft Employees...",2020-07-30 22:37:16.849000+00:00,"A Year After an HR Crisis, Microsoft Employees...",Two of the former employees left the company d...
9,https://medium.com/@BrianJBarth/how-a-band-of-...,[Brian J Barth],"What Killed Quayside, Sidewalk Labs' Ambitious...",2020-08-13 16:27:13.765000+00:00,The first wave of pushback against Sidewalk La...,A longtime defender of the Canadian tech indus...


### Saving to a csv

In [191]:
#including todays date
today = date.today().strftime("%b-%d-%Y")

In [192]:
#DD: Daily Digest
df.to_csv('DD_'+ today +'.csv')