In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import nltk
import re

In [2]:
from gensim.summarization import summarize, keywords
from nltk.corpus import stopwords
from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Sample news text used as input by both summarisation approaches below.
# Paragraphs are separated by blank lines; the cleaning cell splits on '\n'.
toy_text = """
A year after 2018’s teacher walkout, ripples are still being made in Oklahoma, influencing legislation and inspiring members of the OU community.  

What began with a passionate meeting held by Oklahoma teachers on March 2, 2018, transformed into a statewide walkout within a month, when on April 2, 2018 teachers collectively protested low pay and less-than-adequate education funding by “walking out” and refusing to return to their jobs.

Thousands of teachers from all across Oklahoma flooded the Oklahoma Capitol every day of the 10-day protest, holding rallies in the rotunda and raising their voices in support of funding education.

Prior to the walkout, a bill was passed — the first since the 1990 strike — that raised teacher salaries by $6,000, allocated $50 million for education funding, and raised salaries of support staff by $1,250. But for some teachers, the bill ignored demands from the Oklahoma Education Association, and on April 2, teachers flooded the Capitol voicing their dissatisfaction.
"""

In [4]:
# Baseline: gensim's built-in extractive summariser.
# NOTE(review): `gensim.summarization` was removed in gensim 4.0, so this cell
# needs gensim < 4 — confirm the pinned version.
toy_sentences = sent_tokenize(toy_text)
# One sentence per line before handing the text to the summariser.
toy_text_by_line = '\n'.join(toy_sentences)
# split=True returns a list of sentences; ratio=0.5 is the requested fraction.
summary = summarize(toy_text_by_line, split=True, ratio=0.5)

In [5]:
# Show the sentences selected by the gensim summariser (a list, since split=True).
summary


['Thousands of teachers from all across Oklahoma flooded the Oklahoma Capitol every day of the 10-day protest, holding rallies in the rotunda and raising their voices in support of funding education.',
 'But for some teachers, the bill ignored demands from the Oklahoma Education Association, and on April 2, teachers flooded the Capitol voicing their dissatisfaction.']

In [6]:
# Split the text into lines and replace every non-letter character with a space.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the text completely uncleaned.
clean_sentences = pd.Series(toy_text.split('\n')).str.replace("[^a-zA-Z]", " ", regex=True)
# Lowercasing is currently disabled, so the original casing is preserved.
# clean_sentences = [s.lower() for s in clean_sentences]
# Drop empty AND whitespace-only lines: a line with zero tokens would make the
# later per-word averaging divide by zero.
clean_sentences = [s for s in clean_sentences if s.strip()]

In [7]:
# English stop-word list from NLTK (requires the NLTK 'stopwords' corpus to be
# available locally); read by remove_stopwords below.
stops = stopwords.words('english')

In [8]:
def remove_stopwords(sen, stop_words=None):
    """Return the tokens of ``sen`` joined by spaces, with stop words removed.

    Matching is case-insensitive, so capitalised stop words such as "The" or
    "But" are removed as well even when the stop-word list is all lowercase.

    Parameters
    ----------
    sen : iterable of str
        Tokens of one sentence (e.g. the result of ``sentence.split()``).
    stop_words : iterable of str, optional
        Words to drop. Defaults to the notebook-level ``stops`` list.

    Returns
    -------
    str
        The remaining tokens, in their original order and casing, joined by a
        single space. An empty input yields an empty string.
    """
    if stop_words is None:
        # Resolved at call time so the function follows the notebook's `stops`.
        stop_words = stops
    # Set lookup is O(1) per token; lowercasing both sides makes it case-insensitive.
    blocked = {word.lower() for word in stop_words}
    sen_new = " ".join(token for token in sen if token.lower() not in blocked)
    return sen_new
# clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

In [9]:
# Inspect the cleaned sentences (non-letter characters replaced by spaces).
clean_sentences

['A year after      s teacher walkout  ripples are still being made in Oklahoma  influencing legislation and inspiring members of the OU community   ',
 'What began with a passionate meeting held by Oklahoma teachers on March          transformed into a statewide walkout within a month  when on April         teachers collectively protested low pay and less than adequate education funding by  walking out  and refusing to return to their jobs ',
 'Thousands of teachers from all across Oklahoma flooded the Oklahoma Capitol every day of the    day protest  holding rallies in the rotunda and raising their voices in support of funding education ',
 'Prior to the walkout  a bill was passed   the first since the      strike   that raised teacher salaries by         allocated     million for education funding  and raised salaries of support staff by         But for some teachers  the bill ignored demands from the Oklahoma Education Association  and on April    teachers flooded the Capitol voici

In [10]:
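# Load the pretrained word2vec vectors. This line is commented out, but the cells
# below require `keyedModel`: uncomment it (and point the path at a local copy of
# the vectors file) before running them on a fresh kernel.
# keyedModel = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary = True)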

In [None]:
# Build one embedding per cleaned sentence: the mean of the word vectors of its
# lemmatised tokens. Requires `keyedModel` (pretrained KeyedVectors) to be loaded.
lemma = WordNetLemmatizer()
sent_vector = []
# `v` is kept as a notebook-level name (last sentence vector) because later
# cells may still read it; it starts as a zero vector of the model's dimension.
v = np.zeros(keyedModel.vector_size)
for i in clean_sentences:
    lemmas = (lemma.lemmatize(w) for w in i.split())
    # Keep only tokens the model knows: indexing an out-of-vocabulary token
    # raises KeyError and would abort the whole loop.
    word_vectors = [keyedModel[token] for token in lemmas if token in keyedModel]
    if word_vectors:
        v = np.mean(word_vectors, axis=0)
    else:
        # No usable tokens (empty / whitespace-only / all out-of-vocabulary):
        # use an explicit zero vector instead of silently re-appending the
        # previous sentence's vector or dividing by zero.
        v = np.zeros(keyedModel.vector_size)
    sent_vector.append(v)


In [None]:
# Number of cleaned sentences; this is also the side length of the similarity matrix built below.
len(clean_sentences)

In [None]:
# Pairwise cosine similarity between sentence vectors, with a zero diagonal so
# that no sentence is linked to itself in the graph built from this matrix.
n_sentences = len(clean_sentences)
sim_mat = np.zeros([n_sentences, n_sentences])
if n_sentences > 1:
    # Stack the vectors into an (n_sentences, dim) matrix and compute every
    # pair in one call. The shape comes from the vectors themselves rather
    # than from a loop variable left over from another cell.
    sim_mat[:, :] = cosine_similarity(np.vstack(sent_vector))
    np.fill_diagonal(sim_mat, 0.0)

In [None]:
# Inspect the sentence-to-sentence similarity matrix.
sim_mat

In [None]:
# Build a graph from the similarity matrix and run PageRank over it.
# `scores` is looked up by sentence position (0..n-1) in the ranking cell below.
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [None]:
# Pair every cleaned sentence with its PageRank score, then order the pairs
# from highest to lowest (ties fall back to comparing the sentence text).
scored_pairs = [(scores[position], sentence) for position, sentence in enumerate(clean_sentences)]
scored_pairs.sort(reverse=True)
ranked_sent = scored_pairs

In [None]:
# Show the (score, sentence) pairs, highest-scoring sentence first.
ranked_sent