In [13]:
import bs4 as bs  
import urllib.request  
import re
import nltk

scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Toy')  
article = scraped_data.read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:  
    article_text += p.text

In [14]:
# Removing Square Brackets and Extra Spaces
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)  
article_text = re.sub(r'\s+', ' ', article_text) 

In [15]:
# Removing special characters and digits
formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text )  
formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)  

In [16]:
sentence_list = nltk.sent_tokenize(article_text)  

In [17]:
stopwords = nltk.corpus.stopwords.words('english')

word_frequencies = {}  
for word in nltk.word_tokenize(formatted_article_text):  
    if word not in stopwords:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

In [18]:
maximum_frequncy = max(word_frequencies.values())

for word in word_frequencies.keys():  
    word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)

In [19]:
sentence_scores = {}  
for sent in sentence_list:  
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies.keys():
            if len(sent.split(' ')) < 30:
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]

In [20]:
import heapq  
summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)

summary = ' '.join(summary_sentences)  
print(summary)

The materials that toys are made from have changed, what toys can do has changed, but the fact that children play with toys has not. Dinky Toys pioneered the manufacture of die-cast toys with the production of toy cars, trains and ships and model train sets became popular in the 1920s. Sometimes intended as decorations, keepsakes, or collectibles for older children and adults, most dolls are intended as toys for children, usually girls, to play with. Toy companies have often promoted the segregation by gender in toys because it enables them to customize the same toy for each gender, which ultimately doubles their revenue. One example of the dramatic ways that toys can influence child development involves clay sculpting toys such as Play-Doh and Silly Putty and their home-made counterparts. Children, especially very small ones, often put toys into their mouths, so the materials used to make a toy are regulated to prevent poisoning. In 2007, massive recalls of toys produced in China led 

In [18]:
import pandas as pd
import numpy as np

'''data=pd.read_excel('data.xlsx') #Include your data file instead of data.xlsx
idea=data.iloc[:,0:1] #Selecting the first column that has text.

#Converting the column of data from excel sheet into a list of documents, where each document corresponds to a group of sentences.
corpus=[]
for index,row in idea.iterrows():
    corpus.append(row['Idea'])
Or you could just comment out the above code and use this dummy corpus list instead if you don't have the data.'''
corpus=['She went to the airport to see him off.','I prefer reading to writing.','Los Angeles is in California. Its southeast of San Francisco.','I ate a burger then went to bed.','Compare your answer with Toms.','I had hardly left home when it began to rain heavily.','If he had asked me, I would have given it to him. ','I could have come by auto, but who would pay the fare? ','Whatever it may be, you should not have beaten him.','You should have told me yesterday','I should have joined this course last year.','Where are you going?','There are too many people here.','Everyone always asks me that.','I didnt think you were going to make it.','Be quiet while I am speaking.','I cant figure out why he said so.']
    
    
#Count Vectoriser then tidf transformer

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

#vectorizer.get_feature_names()

#print(X.toarray())     

from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X)
print(tfidf.shape )                        

from sklearn.cluster import KMeans

num_clusters = 5 #Change it according to your data.
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf)
clusters = km.labels_.tolist()

idea={'Idea':corpus, 'Cluster':clusters} #Creating dict having doc with the corresponding cluster number.
frame=pd.DataFrame(idea,index=[clusters], columns=['Idea','Cluster']) # Converting it into a dataframe.

print("\n")
print(frame) #Print the doc with the labeled cluster number.
print("\n")
print(frame['Cluster'].value_counts()) #Print the counts of doc belonging to each cluster.


(17, 94)


                                                Idea  Cluster
0            She went to the airport to see him off.        0
0                       I prefer reading to writing.        0
4  Los Angeles is in California. Its southeast of...        4
0                   I ate a burger then went to bed.        0
4                     Compare your answer with Toms.        4
1  I had hardly left home when it began to rain h...        1
1  If he had asked me, I would have given it to h...        1
1  I could have come by auto, but who would pay t...        1
3  Whatever it may be, you should not have beaten...        3
3                  You should have told me yesterday        3
3        I should have joined this course last year.        3
2                               Where are you going?        2
2                    There are too many people here.        2
1                      Everyone always asks me that.        1
2           I didnt think you were going to make it.       