# Solera Text Summarizer

In [1]:
# Import all the libraries required

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import heapq

import string
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer 

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Number of sentences kept in each generated summary. Change this value to
# make the summaries longer or shorter; it could also be taken as user input
# if this notebook is turned into an application.
NO_OF_SENTENCES = 7

# The workbook was downloaded locally; the path is relative to the working directory.
filename = 'TASK.xlsx'

rdata_orig = pd.read_excel(io=filename)
rdata_orig.head(5)

Unnamed: 0.1,Unnamed: 0,Intoduction,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,,Acnesol Gel is an antibiotic that fights bacte...,,,,,,,,
1,,Ambrodil Syrup is used for treating various re...,,,,,,,,
2,,Augmentin 625 Duo Tablet is a penicillin-type ...,,,,,,,,
3,,Azithral 500 Tablet is an antibiotic used to t...,,,,,,,,
4,,Alkasol Oral Solution is a medicine used in th...,,,,,,,,


In [3]:
# Keep only the text column; every other column in the sheet is unused here.
rdata = rdata_orig['Intoduction'].to_frame()


## Preprocessing

In [4]:
# English stop words ("and", "an", "the", ...) are excluded from the word
# frequency counts. They are stored as a set so that each membership test is
# O(1) instead of a linear scan of the list for every token.
# Note: this name rebinds the `stopwords` module imported in the first cell,
# which is why the corpus is reached through `nltk.corpus.stopwords` here.
stopwords = set(nltk.corpus.stopwords.words('english'))


In [5]:
def summarise(row_text, no_of_sentences=None):
    """Build an extractive summary of ``row_text`` from its highest-scoring sentences.

    The text is cleaned (bracketed numeric citations, digits and special
    characters are removed; only letters and full stops survive), split into
    sentences, and every sentence shorter than 30 words is scored by summing
    the normalised frequencies of the non-stop-words it contains.

    Parameters
    ----------
    row_text : str
        The text to summarise. Any non-string value (for example the NaN that
        pandas uses for an empty cell) yields an empty summary.
    no_of_sentences : int, optional
        Maximum number of sentences in the summary. Defaults to the
        module-level ``NO_OF_SENTENCES``.

    Returns
    -------
    str
        The selected sentences joined by single spaces, ordered from highest
        to lowest score. Empty string when there is nothing to summarise.
    """
    # Missing cells are not strings; there is nothing to summarise for them.
    if not isinstance(row_text, str):
        return ''

    # Drop bracketed numeric citations such as "[12]".
    row_text = re.sub(r'\[[0-9]*\]', ' ', row_text)
    # Keep only letters and full stops; everything else becomes a space.
    row_text = re.sub(r'[^a-zA-Z.]', ' ', row_text)
    # Guarantee a space after every full stop so the sentence tokenizer can
    # split on it, then collapse the whitespace runs created by the cleaning.
    row_text = re.sub(r'\.', '. ', row_text)
    row_text = re.sub(r'\s+', ' ', row_text).strip()
    if not row_text:
        return ''

    if no_of_sentences is None:
        no_of_sentences = NO_OF_SENTENCES

    sentence_list = nltk.sent_tokenize(row_text)
    stop_words = set(stopwords)

    # Count occurrences of each meaningful word. Tokens are lower-cased so the
    # keys match the lower-cased lookup used when scoring sentences, and so
    # capitalised stop words ("The", "It") are filtered too. Punctuation
    # tokens such as "." are skipped: they are not words and would otherwise
    # dominate the maximum frequency.
    word_frequencies = {}
    for token in nltk.word_tokenize(row_text):
        word = token.lower()
        if word.isalpha() and word not in stop_words:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # Text made up solely of stop words / punctuation has nothing to rank.
    if not word_frequencies:
        return ''

    # Weighted frequency: scale every count by the most frequent word's count.
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    # Score each sentence by adding the weighted frequencies of its words.
    sentence_scores = {}
    for sent in sentence_list:
        # Long sentences (30 words or more) are never candidates.
        if len(sent.split()) >= 30:
            continue
        score = sum(
            word_frequencies.get(word, 0)
            for word in nltk.word_tokenize(sent.lower())
        )
        # Only sentences containing at least one counted word are ranked;
        # repeated identical sentences accumulate into one entry.
        if score > 0:
            sentence_scores[sent] = sentence_scores.get(sent, 0) + score

    summary_sentences = heapq.nlargest(
        no_of_sentences, sentence_scores, key=sentence_scores.get
    )

    return ' '.join(summary_sentences)

In [11]:
# Result frame: a copy of the input text plus an empty 'Summary' column
# that is filled in by a later cell.
rdata_summarised = rdata.assign(Summary='')
rdata_summarised

Unnamed: 0,Intoduction,Summary
0,Acnesol Gel is an antibiotic that fights bacte...,
1,Ambrodil Syrup is used for treating various re...,
2,Augmentin 625 Duo Tablet is a penicillin-type ...,
3,Azithral 500 Tablet is an antibiotic used to t...,
4,Alkasol Oral Solution is a medicine used in th...,
...,...,...
995,Azapure Tablet belongs to a group of medicines...,
996,Arimidex 1mg Tablet is used alone or with oth...,
997,Arpimune ME 100mg Capsule is used to prevent y...,
998,Amlodac CH Tablet is a combination medicine us...,


In [12]:
# Apply the summariser to every row of the dataset.
# The whole column is assigned in one step: the previous element-by-element
# write through rdata_summarised['Summary'][i] is chained assignment, which
# raises SettingWithCopyWarning and does not update the frame at all when
# pandas Copy-on-Write is enabled.
rdata_summarised['Summary'] = rdata['Intoduction'].apply(summarise)

In [14]:
# Write the summaries to an Excel workbook in the current working directory;
# change the path below to save it somewhere else.
summary_filename = "Summary.xlsx"
rdata_summarised.to_excel(summary_filename)