In [1]:
file = open("data.txt","r")
text = file.read()
file.close()
#print(text)

# Tokenization

In [2]:
import nltk
nltk.download('punkt')   # one time execution
from nltk.tokenize import sent_tokenize
sentence = sent_tokenize(text)
#print(sentence)

[nltk_data] Downloading package punkt to /home/riya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Cleaning

In [3]:
import re
nltk.download('stopwords')  # one time execution
from nltk.corpus import stopwords
corpus = []
for i in range(len(sentence)):
    sen = re.sub('[^a-zA-Z]', " ", sentence[i])  
    sen = sen.lower()                            
    sen = sen.split()                         
    sen = ' '.join([i for i in sen if i not in stopwords.words('english')])   
    corpus.append(sen)
#print(corpus)

[nltk_data] Downloading package stopwords to /home/riya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Vector Representation of Sentences

In [4]:
from gensim.models import Word2Vec
all_words = [i.split() for i in corpus]
model = Word2Vec(all_words, min_count=1)
print(model.wv)

<gensim.models.keyedvectors.KeyedVectors object at 0x7fab8635f760>




In [5]:
sent_vector=[]
for i in corpus:
    if len(i) != 0:
        plus = 0
        for j in i.split():
            plus+= model.wv[j]
        plus = plus/len(i.split())
    sent_vector.append(plus)
#print(sent_vector)

# Clustering

In [6]:
import numpy as np
from sklearn.cluster import KMeans
n_clusters = 10
kmeans = KMeans(n_clusters, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(sent_vector)

In [7]:
from scipy.spatial import distance
my_list=[]
summaryK = ""
listK = []
for i in range(n_clusters):
    my_dict={}
    
    for j in range(len(y_kmeans)):
        
        if y_kmeans[j]==i:
            my_dict[j] =  distance.euclidean(kmeans.cluster_centers_[i],sent_vector[j])
    min_distance = min(my_dict.values())
    my_list.append(min(my_dict, key=my_dict.get))
                          
for i in sorted(my_list):
    summaryK += sentence[i]
    listK.append(sentence[i])

print(summaryK)

Thus, in addition to the previously characterized clavulanic acid-inhibited extended-spectrum beta-lactamase CGA-1 of Ambler class A, C. gleum produces a very likely chromosome-borne class B beta-lactamase.Bacterial strains.Cloning and analysis of recombinant plasmids.Southern hybridization was performed as previously described  with whole-cell DNA of C. gleum CIP 103039 by using the enhanced chemiluminescence nonradioactive labeling and detection kit (Amersham Pharmacia Biotech) with a 628-bp PCR-obtained probe with primers internal to blaCGB-1 (primer 1: 5'-GCAAACGCCCGGATACAACAG-3'; primer 2; 5'-TTCCATTCATCATGTCCGGG-3')  beta-Lactamase purification.DNA sequencing and protein analysis.| The nucleotide sequence and deduced beta-lactamase amino acid sequence reported in this work have been assigned to the GenBank and EMBL databases under the accession no. .
Cloning experiments.| No plasmid was detected in C. gleum CIP 103039, and direct conjugation experiments failed to transfer any bet

# Cosine Matrix

In [8]:
sim_mat = np.zeros([len(sentence), len(sentence)])
from sklearn.metrics.pairwise import cosine_similarity
for i in range(len(sentence)):
  for j in range(len(sentence)):
    if i != j:
      sim_mat[i][j] = cosine_similarity(sent_vector[i].reshape(1,100), sent_vector[j].reshape(1,100))[0,0]

# TextRank

In [9]:
import networkx as nx
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [10]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentence)), reverse=True)

In [11]:
# Extract top 10 sentences as the summary
summaryT = ""
listT = []
for i in range(10):
    summaryT += ranked_sentences[i][1]
    listT.append(ranked_sentences[i][1])

print(summaryT)

Thus, in addition to the previously characterized clavulanic acid-inhibited extended-spectrum beta-lactamase CGA-1 of Ambler class A, C. gleum produces a very likely chromosome-borne class B beta-lactamase.The mature protein (named CGB-1 for C. gleum class B beta-lactamase) expressed in E. coli DH10B had a relative molecular mass determined experimentally to be ca.The relative molecular mass of the beta-lactamase expressed by a culture of E. coli DH10B(pCGB-1) was estimated by SDS-PAGE analysis, as previously described .| No plasmid was detected in C. gleum CIP 103039, and direct conjugation experiments failed to transfer any beta-lactam resistance marker from C. gleum CIP 103039 to nalidixic acid-resistant E. coli JM109.| Comparison of the amino acid sequence of beta-lactamase CGB-1 with those of IND-1, IND-2, IND-3, and IND-4 from C. indologenes Comparison of the amino acid sequence of beta-lactamase CGB-1 with those of IND-1, IND-2, IND-3, and IND-4 from C. indologenes  and BlaB fro

In [16]:
file = open("long.txt","r")
longsum = file.read()
file.close()
import nltk
nltk.download('punkt')   # one time execution
from nltk.tokenize import sent_tokenize
long_summary = sent_tokenize(longsum)
print(long_summary)

In [24]:
from rouge import Rouge
rouge = Rouge()
print("TextRanking Scores : ")
scores1 = rouge.get_scores(summaryT,longsum)
print(scores1)
print("\n")
print("KMeans Scores : ")
scores2 = rouge.get_scores(summaryK,longsum)
print(scores2)


TextRanking Scores : 
[{'rouge-1': {'r': 0.6197183098591549, 'p': 0.3120567375886525, 'f': 0.41509433516776434}, 'rouge-2': {'r': 0.4, 'p': 0.14107883817427386, 'f': 0.2085889532001581}, 'rouge-l': {'r': 0.6056338028169014, 'p': 0.3049645390070922, 'f': 0.4056603729036134}}]


KMeans Scores : 
[{'rouge-1': {'r': 0.4788732394366197, 'p': 0.3008849557522124, 'f': 0.3695652126518195}, 'rouge-2': {'r': 0.3411764705882353, 'p': 0.19863013698630136, 'f': 0.25108224643091404}, 'rouge-l': {'r': 0.4788732394366197, 'p': 0.3008849557522124, 'f': 0.3695652126518195}}]
