In [16]:
# Sentence segmentation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

text = "Dhawan scored 109 runs off 105 balls, and shared a 158-run stand with skipper Virat Kohli who scored 75. The match was interrupted by rain and inclement weather on more than one occasion which saw the target being reduced to 202 runs off 28 overs for South Africa. The Indian bowlers dominated South African batsmen in the initial stages."

sentences = sent_tokenize(text)
print(sentences)

['Dhawan scored 109 runs off 105 balls, and shared a 158-run stand with skipper Virat Kohli who scored 75.', 'The match was interrupted by rain and inclement weather on more than one occasion which saw the target being reduced to 202 runs off 28 overs for South Africa.', 'The Indian bowlers dominated South African batsmen in the initial stages.']


[nltk_data] Downloading package punkt to C:\Users\Sahil
[nltk_data]     Raj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
import numpy as np
emptyarray= np.empty((len(sentences),1,3),dtype=object)
# print(emptyarray)
for s in range(len(sentences)):
    emptyarray[s][0][0] = sentences[s]
    emptyarray[s][0][1] = s
emptyarray

array([[['Dhawan scored 109 runs off 105 balls, and shared a 158-run stand with skipper Virat Kohli who scored 75.',
         0, None]],

       [['The match was interrupted by rain and inclement weather on more than one occasion which saw the target being reduced to 202 runs off 28 overs for South Africa.',
         1, None]],

       [['The Indian bowlers dominated South African batsmen in the initial stages.',
         2, None]]], dtype=object)

In [18]:
# Tokenization

from string import punctuation
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

In [19]:
# Bi-grams

bi_token=[]
bi_token_length=[]
tri_token_length=[]

for u in range(len(sentences)):
    sent_split1=[w.lower() for w in sentences[u].split(" ")]
    sent_split=[w for w in sent_split1 if w not in stop_words and w not in punctuation and not w.isdigit()]
#     print(sent_split)
    bigrams_list = [bigram for bigram in nltk.bigrams(sent_split)]
#     print(bigrams_list)
    bi_token.append(bigrams_list)
    bi_token_length.append(len(bi_token[u]))

bi_tokens = [(int(o) / max(bi_token_length))*100 for o in bi_token_length]
print(bi_tokens)
print("bitokens feature vector:",(bi_token_length))

[84.61538461538461, 100.0, 53.84615384615385]
bitokens feature vector: [11, 13, 7]


In [20]:
# Tri-grams

tri_token=[]
for u in range(len(sentences)):
    sent_split2=[w.lower() for w in sentences[u].split(" ")]
    sent_split3=[w for w in sent_split2 if w not in stop_words and w not in punctuation and not w.isdigit()]
    trigrams_list = [trigram for trigram in nltk.trigrams(sent_split3)]
    tri_token.append(trigrams_list)
    tri_token_length.append(len(tri_token[u]))
tri_tokens = [(int(m) / max(tri_token_length))*100 for m in tri_token_length]
print(tri_tokens)
print("tritokens feature vector:",tri_token_length)

[83.33333333333334, 100.0, 50.0]
tritokens feature vector: [10, 12, 6]


In [21]:
# Sentence Position Feature

import math
def position(l):
    return [index for index, value in enumerate(sentences)]

sent_position= (position(sentences))
print("sentence position:",sent_position)
num_sent=len(sent_position)
print("Total number of sentences:",num_sent)

sentence position: [0, 1, 2]
Total number of sentences: 3


In [22]:
position = []
position_rbm = []

# sentence postion feature of first sentence
sent_pos1_rbm = 1
sent_pos1 = 100
position.append(sent_pos1)
position_rbm.append(sent_pos1_rbm)

# for all sentences except first and last
for x in range(1,num_sent-1):
    s_p= ((num_sent-x)/num_sent)*100
    position.append(s_p)
    s_p_rbm = (num_sent-x)/num_sent
    position_rbm.append(s_p_rbm)
    
# sentence postion feature of last sentence
sent_pos2_rbm = 1
sent_pos2 = 100
position.append(sent_pos2)
position_rbm.append(sent_pos2_rbm)

print("Sentence position feature vector:",position_rbm)

Sentence position feature vector: [1, 0.6666666666666666, 1]


In [23]:
# Converting Sentences to Vectors

def convertToVSM(sentences):
    vocabulary = []
    for sents in sentences:
        vocabulary.extend(sents)
    vocabulary = list(set(vocabulary))
    vectors = []
    for sents in sentences:
        vector = []
        for tokenss in vocabulary:
            vector.append(sents.count(tokenss))
        vectors.append(vector)
    return vectors
VSM=convertToVSM(sentences)
# print(sentences)
print("SentenceVectors:",VSM)

SentenceVectors: [[0, 5, 1, 8, 1, 1, 2, 3, 1, 1, 2, 1, 0, 0, 4, 2, 5, 3, 0, 0, 3, 5, 3, 2, 1, 2, 0, 5, 3, 1, 0, 7, 1, 18, 0, 1, 0, 7, 1, 4], [0, 8, 1, 10, 0, 0, 4, 4, 0, 2, 1, 0, 2, 1, 15, 4, 11, 0, 3, 3, 1, 10, 11, 1, 0, 7, 1, 4, 0, 0, 1, 11, 1, 28, 1, 0, 1, 5, 0, 7], [1, 3, 0, 6, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 6, 1, 7, 0, 2, 0, 2, 3, 6, 0, 0, 1, 1, 3, 0, 0, 0, 2, 1, 10, 1, 0, 0, 4, 0, 7]]


In [24]:
# TF-ISF feature and Centroid Calculation

sentencelength=len(sentences)
def calcMeanTF_ISF(VSM, index):
    vocab_len = len(VSM[index])
    sentences_len = len(VSM)
    count = 0
    tfisf = 0
    for i in range(vocab_len):
        tf = VSM[index][i]
        if(tf>0):
            count += 1
            sent_freq = 0
            for j in range(sentences_len):
                if(VSM[j][i]>0): sent_freq += 1
            tfisf += (tf)*(1.0/sent_freq)
    if(count > 0):
        mean_tfisf = tfisf/count
    else:
        mean_tfisf = 0
    return tf, (1.0/sent_freq), mean_tfisf
tfvec=[]
isfvec=[]
tfisfvec=[]
tfisfvec_rbm=[]
for i in range(sentencelength):
    x,y,z=calcMeanTF_ISF(VSM,i)
    tfvec.append(x)
    isfvec.append(y)
    tfisfvec.append(z*100)
    tfisfvec_rbm.append(z)
print("TF-ISF vector:",tfisfvec_rbm)
maxtf_isf=max(tfisfvec_rbm)
centroid=[]
centroid.append(maxtf_isf)
print("Max TF-ISF:",centroid)
centroid=(max(VSM))
print("Centroid:",centroid)

TF-ISF vector: [1.4462365591397852, 2.005747126436782, 1.0833333333333333]
Max TF-ISF: [2.005747126436782]
Centroid: [1, 3, 0, 6, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 6, 1, 7, 0, 2, 0, 2, 3, 6, 0, 0, 1, 1, 3, 0, 0, 0, 2, 1, 10, 1, 0, 0, 4, 0, 7]


In [25]:
# Cosine Similarity between Centroid and Sentences

from numpy import dot
from numpy.linalg import norm
cosine_similarity=[]
cosine_similarity_rbm=[]
for z in range(sentencelength):
    cos_simi = ((dot(centroid, VSM[z])/(norm(centroid)*norm(VSM[z])))*100)
    cosine_similarity.append(cos_simi)
    cos_simi_rbm = (dot(centroid, VSM[z])/(norm(centroid)*norm(VSM[z])))
    cosine_similarity_rbm.append(cos_simi_rbm)
print("Cosine Similarity Vector:",cosine_similarity_rbm)

Cosine Similarity Vector: [0.8585759941625211, 0.9186287541105971, 1.0]


In [26]:
# Sentence length feature

sent_word=[]
for u in range(len(sentences)):
    sent_split1=[w.lower() for w in sentences[u].split(" ")]
    sent_split=[w for w in sent_split1 if w not in stop_words and w not in punctuation and not w.isdigit()]
    a=(len(sent_split))
    sent_word.append(a)
# print(sent_word)

# LENGTH OF SENTENCE/ LONGEST SENTENCE
longest_sent=max(sent_word)
sent_length=[]
sent_length_rbm=[]
for x in sent_word:
    sent_length.append((x/longest_sent)*100)
    sent_length_rbm.append(x/longest_sent)
#print(sent_length)

print("Sentence length feature vector:",sent_length_rbm)

Sentence length feature vector: [0.8571428571428571, 1.0, 0.5714285714285714]


In [27]:
# Numeric token Feature

import re
num_word=[]
numeric_token=[]
numeric_token_rbm=[]
for u in range(len(sentences)):
    sent_split4=sentences[u].split(" ")
    e=re.findall("\d+",sentences[u])
    noofwords=(len(e))
    num_word.append(noofwords)
    numeric_token.append((num_word[u]/sent_word[u])*100)
    numeric_token_rbm.append(num_word[u]/sent_word[u])
print("Numeric token feature vector:",numeric_token_rbm)

Numeric token feature vector: [0.3333333333333333, 0.14285714285714285, 0.0]
