In [1]:
import nltk
import os
import re
import logging
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import urllib.request
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
from string import punctuation
import json
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

In [2]:
#get file from text doc from net(.txt)
def get_text(url):
    text=requests.get(url).text
    return text
    

#get text doc from webpage
def get_text_from_webpage(url):

    """
    return the text of the article at the 
    specified url
    """

    page=urlopen(url)
    soup=BeautifulSoup(page,"html.parser")
    text=' '.join(map(lambda p: p.text, soup.find_all('p')))
    return text


#get file from text doc
def get_text_from_file(fname):
    f=open(fname,'r')
    text=f.readlines()
    text=''.join(text) #converting the list to type str
    return text


def remove_string_special_characters(s):
    """
    This function removes special characters from within a string

    param: 
        s(str): single input string.

    return: 
        stripped(str): A string with special characters removed
    """

    # Replace special character with ' '
    stripped = re.sub('[^\w\s]', '', s)
    stripped = re.sub('_', '', stripped)

    # Change any whitespace to one space
    stripped = re.sub('\s+', ' ', stripped)

    # Remove start and end white spaces
    stripped = stripped.strip()

    return stripped

def text_init_cleaning(s):
    """
    This function applies an initial cleaning to a string
    
    param: 
        s(str): single input string.
      
    return: 
        stripped(str): A string with init stage cleaning
    """
    
    stripped = re.sub('[\'\"]',"'", s)
    stripped = re.sub('\n+',' ', stripped)
    stripped = re.sub('\s+', ' ', stripped)
    
    return stripped


def get_summary(text, stop_words, sum_ratio):

    sentences = sent_tokenize(text)
    sents = [remove_string_special_characters(s) for s in sentences]
    sentences_combi = " ".join(sents)
    words = word_tokenize(sentences_combi)

    #removing stopwords
    words = [word.lower() for word in words if word not in stop_words]

    #creating a frequency table listing each word's frequency 
    freqTable = dict()
    for word in words:
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1


    #assigning a score to every sentence
    sentence_info = []
    sum_values=0

    #adding the frequency of every non-stop word to give the value for
    #that sentence
    for i in range(1,len(sents)):

        words = word_tokenize(sents[i])
        words = [word.lower() for word in words]
        count = 0
        for word in words:
            if word in freqTable:
                count += freqTable[word]
        sum_values += count

        temp = {"sentence_id": i, "sentence_text":sentences[i], "sentence_count": count}
        sentence_info.append(temp)       

    avg=sum_values/(len(sentence_info)) 
    
    output=[]
    
    for sent in sentence_info:
        if sent['sentence_count'] >= sum_ratio*avg:
            output.append(sent['sentence_text'])
    
    return(output)
   
    

In [3]:
class Clean_Text(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        
        return None

    def fit(self, X=None, y=None):
        """ 
        param X: Dummy variable
        param y: Dummy Variable
        """
        
        return self
    
    def transform(self, X=None):
        
        text_clean = text_init_cleaning(X)
        
        return(text_clean)

In [4]:
class Summarise_Text(BaseEstimator, TransformerMixin):
    
    def __init__(self, sum_ratio):
        
        self.stop_words = set(stopwords.words('english') + 
                              list(punctuation))
        self.sum_ratio = sum_ratio
        

    def fit(self, X=None, y=None):
        """ 
        param X: Dummy variable
        param y: Dummy Variable
        """
        
        return self
    
    
    def transform(self, X=None):
        
        summary = get_summary(X, self.stop_words, self.sum_ratio)  
        summary = "\n".join(summary)
        return(summary)

In [5]:
text = get_text_from_file('england_vs_colombia.txt')

# Construct Pipeline
pipeline_list = list()
pipeline_list.append(("text_cleaning", Clean_Text()))
pipeline_list.append(("text_summary", Summarise_Text(1.3)))

# If Pipeline created, fit and transform
if len(pipeline_list) > 0:
    clf = Pipeline(pipeline_list)
    clf.fit(text)

# Execute Pipeline
text_summary = clf.transform(text)
print(text_summary)

Before the game, England had won only two World Cup knockout matches since 1990 and looked to be making it three when captain Harry Kane gave them the lead from the penalty spot.
But in the third minute of stoppage time at the Spartak Stadium, Colombia defender Yerry Mina headed home to take the game into extra time and then to penalties.
'We looked at technique, how we needed to be as a team, the goalkeeper's role.
In a tempestuous game, six Colombia players were shown yellow cards, including Wilmar Barrios for what looked like a headbutt in the Colombia penalty area on midfielder Henderson.
Colombia players also surrounded referee Mark Geiger for about three minutes after he awarded a penalty for Carlos Sanchez's foul on Kane.
Colombia manager Jose Pekerman said: 'People in England or others should not think of Colombia players like this.
'There were many, many fouls in the game and I do not think we committed anywhere near the number they did.
Kane leads the race for the Golden Boot