# Extractive Summarisation

In [1]:
# import libraries
from collections import Counter
from itertools import combinations
from math import sqrt
import matplotlib.pyplot as plt
import networkx as nx
from nltk import word_tokenize, sent_tokenize, FreqDist,pos_tag
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import RegexpTokenizer
from operator import itemgetter
import re
%matplotlib inline

In [2]:
# Convergence threshold is the maximum error in score convergence of TextRank
CONVERGENCE_THRESHOLD = 0.0001

In [3]:
# set of all nouns
NOUNS = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}

In [4]:
class Document():
    '''
    The master class for our Document Summerization module.
    Incorporates all features related to Document
    '''
    
    def __init__(self, document):
        self.document = document
        self.sents = sent_tokenize(self.document)
        self.word_freq = FreqDist(clean(self.document))
        self.graph = None
        self.params = { 'thresh': 0.0
            
        }
        
                
    def __str__(self):
        return self.document
    
    
    def statistical_sim(self, sent1, sent2):
        '''
        Statistical similarity between sentences
        based on the cosine method
        Returns: float (the cosine similarity b/w sent1 and sent2)
        '''
        sent_token1 = Counter(sent1)
        sent_token2 = Counter(sent2)
        
        intxn = set(sent_token1) & set(sent_token2)
        numerator = sum([sent_token1[x] * sent_token2[x] for x in intxn])
        
        mod1 = sum([sent_token1[x]**2 for x in sent_token1.keys()])
        mod2 = sum([sent_token2[x]**2 for x in sent_token2.keys()])
        denominator = sqrt(mod1)*sqrt(mod2)
        
        if not denominator:
            return 0.0

        return float(numerator)/denominator
    
    
    def semantic_sim(self, sent1, sent2):
        '''
        A semantic similarity score between two sentences
        based on WordNet
        Returns: float (the semantic similarity measure)
        '''
        score = 0
        sent1 = [word for word in sent1 if word in NOUNS]
        sent2 = [word for word in sent2 if word in NOUNS]
        for t1 in sent1:
            for t2 in sent2:
                score += semantic_score(t1,t2)
        try:
            return score/(len(sent1 + sent2))  
        except:
            return 10000
    
    
    def construct_graph(self):
        '''
        Constructs the word similarity graph
        '''
        connected = []
        for pair in combinations(self.sents, 2):
            cpair = clean(pair[0]), clean(pair[1])
            weight = self.statistical_sim(*cpair) + \
                     self.semantic_sim(*cpair)
            connected.append((pair[0], pair[1], weight))
        self.graph = draw_graph(connected, self.params['thresh'])    

In [5]:
# Utility functions
def clean(sent):
    '''
    A utility function that returns a a list of words in a sentence
    after cleaning it. Gets rid off uppper-case, punctuations, 
    stop words, etc.
    Returns: list (a list of cleaned words in sentence)
    '''
    words =  sent.lower() 
    words = re.findall(r'\w+', words,flags = re.UNICODE | re.LOCALE) 
    imp_words = filter(lambda x: x not in stopwords.words('english'), words)
    return imp_words
        
def semantic_score(word1, word2):
    '''
    Semantic score between two words based on WordNet
    Returns: float (the semantic score between word1 and word2)
    '''
    try:
        w1 = wn.synset('%s.n.01'%(word1))
        w2 = wn.synset('%s.n.01'%(word2))
        return wn.path_similarity(w1,w2,simulate_root = False)
    except:
        return 0
    
def draw_graph(connected, thresh):
    '''
    Draws graph as per weights and puts edges if 
    weight exceed the given thresh
    Returns: networkx Graph (nodes are sentences and edges
             are statistical and semantic relationships)
    '''
    nodes = set([n1 for n1, n2, n3 in connected] + \
                [n2 for n1, n2, n3 in connected])
    G=nx.Graph()
    for node in nodes:
        G.add_node(node)
    for edge in connected:
        if edge[2] > thresh:
            G.add_edge(edge[0], edge[1],weight = edge[2])
    plt.figure(figsize=(8,8))
    pos = nx.spring_layout(G)
    #nx.draw(G,node_color='#A0CBE2', edge_color='orange',width=1,with_labels=False)
    #plt.show()
    return G
    
def textrank_weighted(graph, initial_value=None, damping=0.85):
    '''
    Calculates PageRank for an undirected graph
    Returns: A list of tuples representing sentences and respective
    scores in descending order
    '''
    if initial_value == None: initial_value = 1.0 / len(graph.nodes())
    scores = dict.fromkeys(graph.nodes(), initial_value)

    iteration_quantity = 0
    for iteration_number in xrange(100):
        iteration_quantity += 1
        convergence_achieved = 0
        for i in graph.nodes():
            rank = 1 - damping
            for j in graph.neighbors(i):
                neighbors_sum = sum([graph.get_edge_data(j, k)['weight'] for k in graph.neighbors(j)])
                rank += damping * scores[j] * graph.get_edge_data(j, i)['weight'] / neighbors_sum

            if abs(scores[i] - rank) <= CONVERGENCE_THRESHOLD:
                convergence_achieved += 1

            scores[i] = rank

        if convergence_achieved == len(graph.nodes()):
            break
    return sorted(scores.items(), key=itemgetter(1), reverse=True)

# Twitter OAuth

In [6]:
import tweepy
from tweepy import OAuthHandler
import json
from tweepy import Stream
from tweepy.streaming import StreamListener

In [7]:
# Najeeb Khan Credentials
consumer_key = '3cFJ5hLMswDJOwXfaJGS4eNyS'
consumer_secret = 'GRn1itnKbrCdAvHhbpjFgzWXMHePNuhDYz5scjnyhD8fC3Bnqg'
access_token = '277893116-tcAWnc62SVdSBLUIqF5R5h92qm2Y0epfQUQJNa4l'
access_secret = 'S47f26iaI7L0Z5AyT2NQqtsmitrMn70nZG2H5n5dyqM4C'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

In [9]:
api = tweepy.API(auth)
trend = api.trends_place(23424977)

In [11]:
count = 0
hashtags = []
for i in trend[0]['trends']:
    hashtags.append(i['name'])
    count += 1
print count

50


In [12]:
tweets = {}
for tag in hashtags[0:10]:
    text = list()
    Tweets = tweepy.Cursor(api.search, q=tag).items(10)
    for tweet in Tweets:
        txt = (tweet.text).encode('ascii','ignore')
        text.append(txt)
    tweets[tag] = text

In [19]:
## THIS IS JUST AN EXAMPLE
tweets = {u'U.K.': ['RT @StreetEtiquette: China is looking at the U.K like : https://t.co/f5m4bePTNF', 'RT @markets: U.S. stock markets fall sharply at the opening bell https://t.co/JxC86DGgyT https://t.co/u6qZWRT3UN', 'RT @marcusbrig: The British Press have betrayed the people of the U.K. They lie to us day after day &amp; fill this country with hate &amp; fear. T', 'RT @WSJ: What the #Brexit vote means for Indian companies with ties to the U.K. https://t.co/h8sJt1AzyL via @WSJIndia https://t.co/gh53McxU', 'AKB', 'RT @TIME: The U.K. is googling what the E.U. is hours after it voted to leave https://t.co/iObpp1tmWt', 'RT @CronkiteSays: BREAKING NEWS \nThe U.K. Voted #Brexit to take their country back now it is time for the USA to do the same by voting the', 'RT @rami_kiwan: Will a #Brexit tear the U.K. or the E.U. apart first?\n#EUref #Bremain #UKreferendum #UKDecides', 'RT @thefader: #StormzyForPrimeMinister is a U.K. political campaign you can actually believe in. https://t.co/D93xKiRqfl https://t.co/KfLUR', 'RT @chimenedutoit: @K_myprecious_G Only a pleasure, thank u 4 following me to. I luv chatting 2 all the lovelies around the world who love'], u'#TakeYourDogToWorkDay': ['RT @MLBGIFs: 26th man. #TakeYourDogToWorkDay https://t.co/C573N4OaEQ', 'RT @MLBGIFs: 26th man. #TakeYourDogToWorkDay https://t.co/C573N4OaEQ', 'RT @MLBGIFs: 26th man. #TakeYourDogToWorkDay https://t.co/C573N4OaEQ', 'Is your four legged friend with you today?!  #TakeYourDogToWorkDay https://t.co/WVayvMmlCN', 'RT @DanAndShay: Dogs building some sick beats on #TakeYourDogToWorkDay... https://t.co/hBppUHNxud', 'RT @MLBGIFs: 26th man. #TakeYourDogToWorkDay https://t.co/C573N4OaEQ', 'RT @DanAndShay: Dogs building some sick beats on #TakeYourDogToWorkDay... https://t.co/hBppUHNxud', 'RT @TheUPSStore3909: Happy #TakeYourDogToWorkDay! Retweet this if you have your four-legged friend at the office today! https://t.co/Y7TfIk', '#TakeYourDogToWorkDay https://t.co/0MyEmhLphe', 'RT @MLBGIFs: 26th man. #TakeYourDogToWorkDay https://t.co/C573N4OaEQ'], u'Hillary Clinton in November': ['RT @Fusion: BREAKING Bernie Sanders says he will vote for Hillary Clinton in November. https://t.co/Q4jjZA0UBK', 'RT @WorldForBernie: We will NEVER SUPPORT Hillary Clinton in November even if @BernieSanders asked us to! @TheDemocrats have been warned ht', 'RT @trumpy17: Hillary Clinton in November,\nNope, Will Not Vote For Her!! NOPE, NOPE, NOPE!!!!!!!! #NeverHillary https://t.co/8jt2sEBxio', "RT @JamilSmith: Bernie Sanders, during an interview this morning on @MSNBC, says that he'll vote for @HillaryClinton in November. https://t", 'RT @Fusion: BREAKING Bernie Sanders says he will vote for Hillary Clinton in November. https://t.co/Q4jjZA0UBK', 'RT @WorldForBernie: We will NEVER SUPPORT Hillary Clinton in November even if @BernieSanders asked us to! @TheDemocrats have been warned ht', "RT @SOMEXlCAN: Bernie Sanders says he will vote for Democratic rival Hillary Clinton in November's presidential election https://t.co/Fv0Qb", 'RT @WorldForBernie: We will NEVER SUPPORT Hillary Clinton in November even if @BernieSanders asked us to! @TheDemocrats have been warned ht', 'RT @thehill: BREAKING: Bernie Sanders says he will vote for Hillary Clinton in November https://t.co/nEF03fACyL https://t.co/rEWHOXBGmN', "RT @PoliticsGhost: #BernieSanders says he's voting for Hillary Clinton in November! Do u #Bernie supporters #FeelTheBern now? Betrayed by #"], u'Alan Greenspan': ["RT @TeflonGeek: Alan Greenspan on market reax #Brexit: 'This has a corrosive effect that will not go away,' says WORSE than 1987, 2008 cris", "RT @TeflonGeek: Alan Greenspan on market reax #Brexit: 'This has a corrosive effect that will not go away,' says WORSE than 1987, 2008 cris", "OMG!! - Alan Greenspan says British break from EU 'is just the tip of the iceberg' https://t.co/26AL9VHznq", 'RT @NotAllBhas: embarrassing to live in a country where Henry Kissinger &amp; Alan Greenspan are not in prison but lauded as experts https://t.', "Alan Greenspan says British break from EU 'is just the tip of the iceberg' https://t.co/0gnyb1xjjl", "Alan Greenspan says British break from EU 'is just the tip of the iceberg' https://t.co/sCdN6azltr", "RT @BlanchardGold: Alan #Greenspan says British break from EU 'is just the tip of the iceberg' https://t.co/HPnc7u21He #Brexit", "Alan Greenspan Calls #Brexit 'the Tip of the Iceberg,' Says Euro is Failing https://t.co/Rgzk3P2esj via @nbcnews", 'Former Chairman of the Federal Reserve Alan Greenspan says this is just the beginning.', "Alan #Greenspan says British break from EU 'is just the tip of the iceberg' https://t.co/HPnc7u21He #Brexit"], u'#fridayreads': ['RT @DuncanWhitehead: "With over 700 5* reviews, it\'s already a classic!"- The Georgia Enquirer https://t.co/tNEecWJu1m #bookreview #KindleD', 'First book of the summer is THE GIRL ON THE TRAIN by Paula Hawkins. #fridayreads', "RT @BarefootBooks: How will you ignite your child's love of reading this weekend? #fridayreads #literacy #kidlit https://t.co/H1DbCTpOuI", 'RT @DuncanWhitehead: "With over 700 5* reviews, it\'s already a classic!"- The Georgia Enquirer https://t.co/tNEecWJu1m #bookreview #KindleD', 'Sometimes it takes reading a different perspective for some idea inspiration: https://t.co/05HZMZPG6p #fridayreads https://t.co/LPjoNzQ9d7', "RT @GeneNash: OMG. It's a real book. https://t.co/x5U0BXFlAU #FridayReads https://t.co/mrgR7xWcYC", 'RT @TwBookClub: BOOK OF THE DAY June 24\n\nOnly 99 cents: https://t.co/kc1Z1rpZ30\n\n#FridayReads #Adventure #Action #BOTD\n\n@Foxwarepub https:/', "RT @GeneNash: OMG. It's a real book. https://t.co/x5U0BXFlAU #FridayReads https://t.co/mrgR7xWcYC", '#fridayreads https://t.co/blwkHmgUKK', "RT @GeneNash: OMG. It's a real book. https://t.co/x5U0BXFlAU #FridayReads https://t.co/mrgR7xWcYC"], u'David Cameron': ['RT @TimUrbanMusic: So England voted to leave EU and David Cameron is resigning. I guess we are all just waiting to see how that will effect', 'RT @starbex: Holy shit, they replaced David Cameron with a cat. https://t.co/0r7KouIHW1', 'RT @AlbertoRT51: Ministro britnico David Cameron anuncia su renuncia y abandonar el cargo en octubre  https://t.co/y8JMs32B0v https://t.c', 'RT @NottsFacts: David Cameron is now the bookies favourite to become the new #NFFC manager #nottsfacts #euref', 'RT @TheEconomist: David Cameron quits Downing Street with a ruined legacy https://t.co/eBhcKF4Ap0 https://t.co/4r4IpfFeKa', 'RT @PHammondMP: Looking forward to attending @NATO Summit in #Warsaw w/ @David_Cameron in July. UK will continue to take a leading role in', "RT @jacklang: You know it's bad when you're not celebrating David Cameron resigning", '@Rodpac @David_Cameron  Es  Democracia !', "RT @dannyjpg: you know things are fucked when you're gutted David Cameron resigned", "never thought I'd say this but i'm gonna miss you @David_Cameron God help us if Boris becomes pm"], u'#MusicIn5Words': ['It helped me stay alive. #MusicIn5Words ', "#MusicIn5Words Can't Beat The 60s Music! ", 'which gives me total peace  #MusicIn5Words https://t.co/EI4nSI0rp8', 'A song for any occasion #MusicIn5Words', 'RT @Joyannah73: Hello from the other side! #MusicIn5Words\n@ZenRand  https://t.co/xwnxnvYJEM', "RT @musicaltrees: #MusicIn5Words \nLife's never nightmare with Music.", 'Undemanding contact in helpless solitude  #MusicIn5Words', 'It soothes your pain deeply.\n #MusicIn5Words', 'What key is this in? #MusicIn5Words @fifthlawband', 'Perfect for all my emotions. #MusicIn5Words'], u'#BrexitVote': ['RT @aftrevino93: Thoughts on #BrexitVote https://t.co/BInWPr4ITe', '#EUreferendum #EURefResults #Brexit #BrexitVote #IndependenceDay # https://t.co/gwz6K8cwtA', 'RT @Phil_Lewis_: Never seen a tweet that sums up humanity so well. #BrexitVote https://t.co/1S5nVlbmlT', 'RT @HillaryClinton: "This time of uncertainty only underscores the need for calm, steady, experienced leadership in the White House." Hill', 'RT @alitasizuka: Jajajajaja #BrexitVote https://t.co/mx5t6IUSWt', 'RT @postsecret: .#BrexitVote \n75% of British students voted to remain.\n61% of British seniors voted to leave. https://t.co/xhFP9BjmEJ', 'RT @HillaryClinton: "This time of uncertainty only underscores the need for calm, steady, experienced leadership in the White House." Hill', 'RT @BBCBreaking: UK Prime Minister David Cameron says he will step down by October https://t.co/jo2sWrbZxk #EUref #BrexitVote', 'RT @MarkQuinlin: A country that once colonized half of the world is complaining about immigration. #BrexitVote', 'RT @elisabethlehem:  #BrexitVote https://t.co/FbltLG5BwJ'], u'Shaun Suisham': ['RT @MarkKaboly_Trib: Shaun Suisham statement on his release provided to the Trib https://t.co/4X5RRaOf3S', 'Steelers release veteran kicker Shaun Suisham https://t.co/j4TGxwLC1g via @PittsburghPG', 'RT @AdamSchefter: Steelers released kicker Shaun Suisham with the failed physical designation.', 'RT @NFL: The @steelers are going in another direction at kicker: https://t.co/ulzK33MWdr https://t.co/zqjdromJBj', 'RT @RapSheet: The #Steelers released kicker Shaun Suisham with a failed physical designation. He suffered a setback in his recovery from a', 'RT @caplannfl: #Steelers released K Shaun Suisham (failed physical). Team said he had a setback in recovery from knee surgery.', 'RT @NFL: The @steelers are going in another direction at kicker: https://t.co/ulzK33MWdr https://t.co/zqjdromJBj', 'RT @Steelersdepot: #Steelers GM Kevin Colbert on the release of kicker Shaun Suisham: https://t.co/lXTn27BMQ4', 'The #Steelers have released K Shaun Suisham; made 124 of his 141 FG attempts (87.9%) w/ Pittsburgh #SteelersNation', 'RT @MarkKaboly_Trib: Shaun Suisham statement on his release provided to the Trib https://t.co/4X5RRaOf3S'], u'#EURefResults': ['RT @bilexuality: The UK Today: A Summary \n\n#EURefResults https://t.co/PA5bk3YNbU', 'RT @Number10gov: PM: I would reassure markets and investors that Britains economy is fundamentally strong. #EURefResults https://t.co/MhFv', 'RT @GoogleTrends: "What is the EU?" is the second top UK question on the EU since the #EURefResults were officially announced https://t.co/', 'RT @5carab: I guess British pensioners hate foreigners more than they love their grand children. #EURefResults', 'The U.K. Needs younger politicians who believe and act for the greatness of the country now more than ever #EURefResults #LabourInForBritain', '#Brexit #EURefResults https://t.co/FlYIJh1auw', 'RT @Charles_HRH: United Kingdom and the European Union just went from married to divorced, and unfriended. Awkward. #EURefResults https', 'RT @bilexuality: The UK Today: A Summary \n\n#EURefResults https://t.co/PA5bk3YNbU', 'RT @HogwartsLogic: The Ministry has fallen. Cameron has resigned. They are coming. #EURefResults', 'RT @GoogleTrends: "What is the EU?" is the second top UK question on the EU since the #EURefResults were officially announced https://t.co/']}

In [20]:
for tag in tweets:
    print tag
    print '----------------------------------------------------------'
    for tweet in tweets[tag]:
        print tweet
    print '==========================================================='

#fridayreads
----------------------------------------------------------
RT @DuncanWhitehead: "With over 700 5* reviews, it's already a classic!"- The Georgia Enquirer https://t.co/tNEecWJu1m #bookreview #KindleD
First book of the summer is THE GIRL ON THE TRAIN by Paula Hawkins. #fridayreads
RT @BarefootBooks: How will you ignite your child's love of reading this weekend? #fridayreads #literacy #kidlit https://t.co/H1DbCTpOuI
RT @DuncanWhitehead: "With over 700 5* reviews, it's already a classic!"- The Georgia Enquirer https://t.co/tNEecWJu1m #bookreview #KindleD
Sometimes it takes reading a different perspective for some idea inspiration: https://t.co/05HZMZPG6p #fridayreads https://t.co/LPjoNzQ9d7
RT @GeneNash: OMG. It's a real book. https://t.co/x5U0BXFlAU #FridayReads https://t.co/mrgR7xWcYC
RT @TwBookClub: BOOK OF THE DAY June 24

Only 99 cents: https://t.co/kc1Z1rpZ30

#FridayReads #Adventure #Action #BOTD

@Foxwarepub https:/
RT @GeneNash: OMG. It's a real book. https://t.co/x5U

In [21]:
extract_tweets = {}

In [30]:
for tag in tweets:
    string = '\n'.join(tweets[tag])
    a = Document(string)
    a.construct_graph()
    x = textrank_weighted(a.graph)
    text = []
    for i in x[:3]:
        text.append(i[0])
    extract_tweets[tag] = text

<matplotlib.figure.Figure at 0x7f4e9a37c5d0>

<matplotlib.figure.Figure at 0x7f4e9cc33990>

<matplotlib.figure.Figure at 0x7f4e9cc08b50>

<matplotlib.figure.Figure at 0x7f4e9a3bcb90>

<matplotlib.figure.Figure at 0x7f4e9a3bcfd0>

<matplotlib.figure.Figure at 0x7f4e9a3bcd90>

<matplotlib.figure.Figure at 0x7f4e9a3ca050>

<matplotlib.figure.Figure at 0x7f4e9a3cac10>

<matplotlib.figure.Figure at 0x7f4e9a3caf90>

<matplotlib.figure.Figure at 0x7f4e9a3dd850>

In [31]:
for tag in extract_tweets:
    print tag
    print len(extract_tweets[tag])
    for tweet in extract_tweets[tag]:
        print tweet
        print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
    print '--------------------------------------------------------------'

U.K.
3
T
RT @WSJ: What the #Brexit vote means for Indian companies with ties to the U.K. https://t.co/h8sJt1AzyL via @WSJIndia https://t.co/gh53McxU
AKB
RT @TIME: The U.K. is googling what the E.U.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RT @StreetEtiquette: China is looking at the U.K like : https://t.co/f5m4bePTNF
RT @markets: U.S. stock markets fall sharply at the opening bell https://t.co/JxC86DGgyT https://t.co/u6qZWRT3UN
RT @marcusbrig: The British Press have betrayed the people of the U.K.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
is hours after it voted to leave https://t.co/iObpp1tmWt
RT @CronkiteSays: BREAKING NEWS 
The U.K. Voted #Brexit to take their country back now it is time for the USA to do the same by voting the
RT @rami_kiwan: Will a #Brexit tear the U.K. or the E.U.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--------------------------------------------------------------
#TakeYourDogToWorkDay
3
#TakeYourDogToWorkDay ht