# Extractive Summarisation

In [76]:
# import libraries
from collections import Counter
from itertools import combinations
from math import sqrt
import matplotlib.pyplot as plt
import networkx as nx
from nltk import word_tokenize, sent_tokenize, FreqDist,pos_tag
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import RegexpTokenizer
from operator import itemgetter
import re
%matplotlib inline

In [77]:
# Convergence threshold is the maximum error in score convergence of TextRank
CONVERGENCE_THRESHOLD = 0.0001

In [78]:
# set of all nouns
NOUNS = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}

In [217]:
class Document():
    '''
    The master class for our Document Summerization module.
    Incorporates all features related to Document
    '''
    
    def __init__(self, document):
        self.document = document
        #self.sents = sent_tokenize(self.document)
        self.sents = self.document.split('\n')
        self.word_freq = FreqDist(clean(self.document))
        self.graph = None
        self.params = { 'thresh': 0.0 }
                
    def __str__(self):
        return self.document
    
    
    def statistical_sim(self, sent1, sent2):
        '''
        Statistical similarity between sentences
        based on the cosine method
        Returns: float (the cosine similarity b/w sent1 and sent2)
        '''
        sent_token1 = Counter(sent1)
        sent_token2 = Counter(sent2)
        
        intxn = set(sent_token1) & set(sent_token2)
        numerator = sum([sent_token1[x] * sent_token2[x] for x in intxn])
        
        mod1 = sum([sent_token1[x]**2 for x in sent_token1.keys()])
        mod2 = sum([sent_token2[x]**2 for x in sent_token2.keys()])
        denominator = sqrt(mod1)*sqrt(mod2)
        
        if not denominator:
            return 0.0

        return float(numerator)/denominator
    
    
    def semantic_sim(self, sent1, sent2):
        '''
        A semantic similarity score between two sentences
        based on WordNet
        Returns: float (the semantic similarity measure)
        '''
        score = 0
        sent1 = [word for word in sent1 if word in NOUNS]
        sent2 = [word for word in sent2 if word in NOUNS]
        for t1 in sent1:
            for t2 in sent2:
                score += semantic_score(t1,t2)
        try:
            return score/(len(sent1 + sent2))  
        except:
            return 10000
    
    
    def construct_graph(self):
        '''
        Constructs the word similarity graph
        '''
        connected = []
        for pair in combinations(self.sents, 2):
            cpair = clean(pair[0]), clean(pair[1])
            weight = self.statistical_sim(*cpair) + \
                     self.semantic_sim(*cpair)
            connected.append((pair[0], pair[1], weight))
        self.graph = draw_graph(connected, self.params['thresh'])    

In [218]:
# Utility functions
def clean(sent):
    '''
    A utility function that returns a a list of words in a sentence
    after cleaning it. Gets rid off uppper-case, punctuations, 
    stop words, etc.
    Returns: list (a list of cleaned words in sentence)
    '''
    words =  sent.lower() 
    words = re.findall(r'\w+', words,flags = re.UNICODE | re.LOCALE) 
    imp_words = filter(lambda x: x not in stopwords.words('english'), words)
    return imp_words
        
def semantic_score(word1, word2):
    '''
    Semantic score between two words based on WordNet
    Returns: float (the semantic score between word1 and word2)
    '''
    try:
        w1 = wn.synset('%s.n.01'%(word1))
        w2 = wn.synset('%s.n.01'%(word2))
        return wn.path_similarity(w1,w2,simulate_root = False)
    except:
        return 0
    
def draw_graph(connected, thresh):
    '''
    Draws graph as per weights and puts edges if 
    weight exceed the given thresh
    Returns: networkx Graph (nodes are sentences and edges
             are statistical and semantic relationships)
    '''
    nodes = set([n1 for n1, n2, n3 in connected] + \
                [n2 for n1, n2, n3 in connected])
    G=nx.Graph()
    for node in nodes:
        G.add_node(node)
    for edge in connected:
        if edge[2] > thresh:
            G.add_edge(edge[0], edge[1],weight = edge[2])
    #plt.figure(figsize=(8,8))
    #pos = nx.spring_layout(G)
    #nx.draw(G,node_color='#A0CBE2', edge_color='orange',width=1,with_labels=False)
    #plt.show()
    return G
    
def textrank_weighted(graph, initial_value=None, damping=0.85):
    '''
    Calculates PageRank for an undirected graph
    Returns: A list of tuples representing sentences and respective
    scores in descending order
    '''
    if initial_value == None:
        initial_value = 1.0 / len(graph.nodes())
    scores = dict.fromkeys(graph.nodes(), initial_value)

    iteration_quantity = 0
    for iteration_number in xrange(100):
        iteration_quantity += 1
        convergence_achieved = 0
        for i in graph.nodes():
            rank = 1 - damping
            for j in graph.neighbors(i):
                neighbors_sum = sum([graph.get_edge_data(j, k)['weight'] for k in graph.neighbors(j)])
                rank += damping * scores[j] * graph.get_edge_data(j, i)['weight'] / neighbors_sum

            if abs(scores[i] - rank) <= CONVERGENCE_THRESHOLD:
                convergence_achieved += 1

            scores[i] = rank

        if convergence_achieved == len(graph.nodes()):
            break
    return sorted(scores.items(), key=itemgetter(1), reverse=True)

# Twitter OAuth

In [12]:
import tweepy
from tweepy import OAuthHandler
import json
from tweepy import Stream
from tweepy.streaming import StreamListener
import re

In [13]:
def tweet_cleaner(tweet):
    return ' '.join(re.sub("(RT @[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|@[A-Za-z0-9]+"," ",tweet).split())

In [204]:
def find_id(tweets,tag,tweet):
    for t in tweets[tag]:
        if tweet == (t[0]):
            return t[1]
    return None

In [2]:
# Najeeb Khan Credentials
consumer_key = '3cFJ5hLMswDJOwXfaJGS4eNyS'
consumer_secret = 'GRn1itnKbrCdAvHhbpjFgzWXMHePNuhDYz5scjnyhD8fC3Bnqg'
access_token = '277893116-tcAWnc62SVdSBLUIqF5R5h92qm2Y0epfQUQJNa4l'
access_secret = 'S47f26iaI7L0Z5AyT2NQqtsmitrMn70nZG2H5n5dyqM4C'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

In [3]:
api = tweepy.API(auth)
trend = api.trends_place(23424977)

In [163]:
def get_trends(trend):
    return [(i['name']) for i in trend[0]['trends']]

In [168]:
hashtags = get_trends(trend)
print hashtags

[u'#GrowingUpWithLenientParents', u'#dotheboogaloo', u'#HeterosexualPrideDay', u'#BeingAsian', u'#BadMonsterMovies', u'Scotty Moore', u'Tyler Miller', u'Joe Maddon', u'Ryan Madson', u'Candace Parker', u'Brandon Crawford', u'Doolittle', u'4 Levels To Party', u'Javier Lopez', u'Project X', u'Bochy', u'Tim Lincecum', u'Eddie Butler', u'Arizona 5-4', u'Julio Urias', u'Michy Batshuayi', u'Melvin Upton', u'Nelson Cruz', u'Toronto-area', u'#lovesickmemories', u'#ThingsIWillAlwaysBe', u'#EthansHair', u'#MakeAmericaHornyAgain', u'#MTVScream', u'#GrowingUpHispanic', u'#WednesdayWisdom', u'#WhyIDontGetInvitedAnymore', u'#CheerGirlsBreakTheInternet', u'#pettyhour', u'#DeleteBeforeSunrise', u'#ThankYouLaurenFor', u'#BayBridgeSeries', u'#electionin6words', u'#SocialSoiree', u'#ZackAttack', u'#ZooCBS', u'#1lineWed', u'#ALDUBIYAMin14Days', u'#FinFabulousBET', u'#NoEraPenal', u'#ToTellTheTruth', u'#weirdbaseball', u'#MiercolesDeGanarSeguidores', u'#BLACKPINK', u'#TourGroup']


In [190]:
def get_tweets(hashtags = [],nb_tweets = 10):
    tweets = {}
    for tag in hashtags[0:10]:
        Tweets = tweepy.Cursor(api.search, q=tag).items(10)
        text = [(tweet_cleaner((tweet.text).encode('ascii','ignore')) + '.',tweet.id) for tweet in Tweets]
        tweets[tag] = text
    return tweets

In [191]:
tweets = get_tweets(hashtags,10)

In [192]:
for tag in tweets:
    print tag
    print '----------------------------------------------------------'
    for tweet in tweets[tag]:
        print tweet
    print '==========================================================='

Tyler Miller
----------------------------------------------------------
('TYLER FREAKIN MILLER Enjoy the penalty shootout in its entirety.', 748106487571025921)
('Tyler Miller tymiller01 is now trending in Seattle.', 748104468776902657)
('TYLER FREAKIN MILLER Enjoy the penalty shootout in its entirety.', 748103576942051330)
('TYLER FREAKIN MILLER Enjoy the penalty shootout in its entirety.', 748101111995830272)
('TYLER FREAKIN MILLER Enjoy the penalty shootout in its entirety.', 748096606126301184)
('TYLER FREAKIN MILLER Enjoy the penalty shootout in its entirety.', 748095242478424064)
('TYLER FREAKIN MILLER Enjoy the penalty shootout in its entirety.', 748092009164922881)
('TYLER FREAKIN MILLER Enjoy the penalty shootout in its entirety.', 748078928082804736)
('TYLER FREAKIN MILLER Enjoy the penalty shootout in its entirety.', 748078651946610688)
('TYLER FREAKIN MILLER Enjoy the penalty shootout in its entirety.', 748078148168744961)
Candace Parker
------------------------------------

In [219]:
def get_top_tweets(tweets = {},top = 3):
    extract_tweets = {}
    for tag in tweets:
        temp = [tweet[0] for tweet in tweets[tag]]
        string = '\n'.join(temp)
        a = Document(string)
        a.construct_graph()
        x = textrank_weighted(a.graph)
        extract_tweets[tag] = [(i[0],find_id(tweets,tag,i[0])) for i in x[:top]]
    return extract_tweets

In [220]:
extract = get_top_tweets(tweets,3)

In [221]:
for tag in extract:
    print tag
    for tweet in extract[tag]:
        print tweet

Tyler Miller
('TYLER FREAKIN MILLER Enjoy the penalty shootout in its entirety.', 748106487571025921)
('Tyler Miller tymiller01 is now trending in Seattle.', 748104468776902657)
Candace Parker
('Candace Parker s postgame interview with Holly Rowe about Pat Summitt.', 748110742516203520)
('Watch Candace Parker give a very emotional post game interview about Pat Summitt.', 748110754902073344)
('Awesome emotional interview between Holly Rowe and Candace Parker speaking on Pat Summitt.', 748110757817163776)
#HeterosexualPrideDay
('HeterosexualPrideDay.', 748110675109679104)
('I fucking hate y all HeterosexualPrideDay.', 748110674375618561)
('En rouge les pays o l htrosexualit est un crime HeterosexualPrideDay.', 748110673981353984)
Joe Maddon
('Joe Maddon living out Joe Maddon fanfic IRL tonight.', 748107256261009408)
('joe maddon is the best.', 748107062886858752)
('Joe maddon is a sorcerer.', 748107399853015040)
Scotty Moore
('RIP Scotty Moore Elvis guitarist who has passed at 84.', 7481

In [222]:
final = get_top_tweets(extract,1)

In [223]:
for tag in final:
    for tweet in final[tag]:
        print tweet

('Tyler Miller tymiller01 is now trending in Seattle.', 748104468776902657)
('Awesome emotional interview between Holly Rowe and Candace Parker speaking on Pat Summitt.', 748110757817163776)
('HeterosexualPrideDay.', 748110675109679104)
('Joe Maddon living out Joe Maddon fanfic IRL tonight.', 748107256261009408)
('Scotty Moore Rock Pioneer and Elvis Presleys Guitarist Dies at 84.', 748110640481329152)
('Ryan Madson Bumpy last innings in SF finally tip to A s MLB.', 748096128869105664)
('Here s an original song for you guys Enjoy dotheboogaloo.', 748110714590531584)
('Not Feratu BadMonsterMovies.', 748110633367810048)
('GrowingUpWithLenientParents when your friends told you they got grounded but you couldn t relate.', 748110706483036160)
('beingasian is having ppl tell u ur food smells bad ur language sounds funny but saying ur culture is awesome when it fit.', 748110733477445633)
