## GERMAN TEXT 

## Import needed modules

In [2]:
from nltk.corpus import stopwords #you can remove stop words for speed
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

## Opening file and split into sentences

In [3]:
file = open("C:\\Users\\sowky\\Downloads\\german text.txt", "r")
#This file contains one paragraph of multiple sentences
filedata = file.readlines()
article = filedata[0].split(". ") #Just do the first paragraph
sentences = []
for sentence in article:
    print(sentence)
    sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))

Das Buch liegt auf dem Tisch
Der alte Mann geht langsam durch den Park
Die Katze sitzt auf dem Fensterbrett und schaut nach draußen
Es regnet stark draußen, und die Straßen sind nass
In der Ferne hört man das Rauschen des Flusses.


In [4]:
print("Sentences are ", sentences)

Sentences are  [['Das', 'Buch', 'liegt', 'auf', 'dem', 'Tisch'], ['Der', 'alte', 'Mann', 'geht', 'langsam', 'durch', 'den', 'Park'], ['Die', 'Katze', 'sitzt', 'auf', 'dem', 'Fensterbrett', 'und', 'schaut', 'nach', 'draußen'], ['Es', 'regnet', 'stark', 'draußen,', 'und', 'die', 'Straßen', 'sind', 'nass'], ['In', 'der', 'Ferne', 'hört', 'man', 'das', 'Rauschen', 'des', 'Flusses.']]


## Function to calculate similarity

In [5]:
#counting the number of common words
def sentence_similarity(sent1, sent2 ):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)

## Create the similarity matrix (slide 17)

In [6]:
similarity_matrix = np.zeros((len(sentences), len(sentences)))
for idx1 in range(len(sentences)):
    for idx2 in range(len(sentences)):
        if idx1 == idx2: #ignore if both are same sentences
         continue
        similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1],
sentences[idx2])
print("Smilarity matrix \n", similarity_matrix)

Smilarity matrix 
 [[0.         0.         0.25819889 0.         0.13608276]
 [0.         0.         0.         0.         0.11785113]
 [0.25819889 0.         0.         0.21081851 0.        ]
 [0.         0.         0.21081851 0.         0.        ]
 [0.13608276 0.11785113 0.         0.         0.        ]]


## Get the pagerank scores

In [7]:
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 0.257723537554717, 1: 0.10781184801501686, 2: 0.2946431369514488, 3: 0.14257377595942577, 4: 0.19724770151939175}


In [8]:
#Sort sentences by pagerank
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in
enumerate(sentences)), reverse=True)
print("Indexes of top ranked_sentence order are \n\n",
ranked_sentence)

Indexes of top ranked_sentence order are 

 [(0.2946431369514488, ['Die', 'Katze', 'sitzt', 'auf', 'dem', 'Fensterbrett', 'und', 'schaut', 'nach', 'draußen']), (0.257723537554717, ['Das', 'Buch', 'liegt', 'auf', 'dem', 'Tisch']), (0.19724770151939175, ['In', 'der', 'Ferne', 'hört', 'man', 'das', 'Rauschen', 'des', 'Flusses.']), (0.14257377595942577, ['Es', 'regnet', 'stark', 'draußen,', 'und', 'die', 'Straßen', 'sind', 'nass']), (0.10781184801501686, ['Der', 'alte', 'Mann', 'geht', 'langsam', 'durch', 'den', 'Park'])]


## Pick the top “n” sentences

In [10]:
#Step 5 - How many sentences to pick
n = int(input("How many sentences do you want in the summary? "))
#n=2
summarize_text = []
for i in range(n):
    summarize_text.append(" ".join(ranked_sentence[i][1]))

How many sentences do you want in the summary?  3


## Finish off by printing summary

In [11]:
# Step 6 - Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
 Die Katze sitzt auf dem Fensterbrett und schaut nach draußen. Das Buch liegt auf dem Tisch. In der Ferne hört man das Rauschen des Flusses.


## KOREAN TEXT 

## Import needed modules


In [13]:
from nltk.corpus import stopwords #you can remove stop words for speed
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

## Opening file and split into sentences

In [28]:
file = open("C:\\Users\\sowky\\Downloads\\korean text.txt", "r", encoding="utf-8")
filedata = file.read()
sentences = filedata.split(". ")  # Split by periods to separate sentences
article = filedata[0].split(". ") #Just do the first paragraph
# Print and append each sentence to the list
for sentence in sentences:
    print(sentence)


안녕하세요
반갑습니다
오늘은 날씨가 좋네요
학교에 가는 길이 멀어요
저는 커피를 좋아해요
배고파서 밥을 먹고 싶어요
책을 읽는 것이 취미입니다
운동을 해서 몸이 좋아졌어요
친구들과 함께 시간을 보내는 게 좋아요
가을에 단풍이 아름답습니다
여름에는 바다로 여행을 가고 싶어요
겨울에 눈이 내리면 눈사람을 만들어요
사랑은 따뜻한 감정입니다
일을 열심히 하면 좋은 결과가 있을 거예요
음악을 듣는 것이 마음을 편안하게 해줍니다
저는 한국 음식을 좋아합니다
여행을 하면 새로운 경험을 할 수 있어요
화이팅! 무엇이든 해낼 수 있어요.


In [20]:
print("Sentences are ", sentences)

Sentences are  ['안녕하세요', '반갑습니다', '오늘은 날씨가 좋네요', '학교에 가는 길이 멀어요', '저는 커피를 좋아해요', '배고파서 밥을 먹고 싶어요', '책을 읽는 것이 취미입니다', '운동을 해서 몸이 좋아졌어요', '친구들과 함께 시간을 보내는 게 좋아요', '가을에 단풍이 아름답습니다', '여름에는 바다로 여행을 가고 싶어요', '겨울에 눈이 내리면 눈사람을 만들어요', '사랑은 따뜻한 감정입니다', '일을 열심히 하면 좋은 결과가 있을 거예요', '음악을 듣는 것이 마음을 편안하게 해줍니다', '저는 한국 음식을 좋아합니다', '여행을 하면 새로운 경험을 할 수 있어요', '화이팅! 무엇이든 해낼 수 있어요.']


## Function to calculate similarity

In [21]:
#counting the number of common words
def sentence_similarity(sent1, sent2 ):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)

## Create the similarity matrix (slide 17)¶

In [22]:
similarity_matrix = np.zeros((len(sentences), len(sentences)))
for idx1 in range(len(sentences)):
    for idx2 in range(len(sentences)):
        if idx1 == idx2: #ignore if both are same sentences
         continue
        similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1],
sentences[idx2])
print("Smilarity matrix \n", similarity_matrix)

Smilarity matrix 
 [[0.         0.         0.12403473 0.10259784 0.12403473 0.09534626
  0.         0.09759001 0.06984303 0.         0.07784989 0.0766965
  0.         0.12060454 0.13046561 0.         0.12171612 0.07784989]
 [0.         0.         0.         0.         0.         0.
  0.2        0.         0.         0.3354102  0.07784989 0.
  0.23094011 0.         0.13046561 0.19518001 0.         0.        ]
 [0.12403473 0.         0.         0.50902781 0.46153846 0.41391868
  0.3721042  0.48418203 0.5197777  0.34668762 0.48280455 0.42808634
  0.35805744 0.59836607 0.4045567  0.42365927 0.49065338 0.43452409]
 [0.10259784 0.         0.50902781 0.         0.50902781 0.53802759
  0.56428809 0.60075141 0.60908821 0.5161854  0.6789146  0.62951158
  0.35540933 0.61868822 0.56888331 0.50062617 0.62439054 0.63897845]
 [0.12403473 0.         0.46153846 0.50902781 0.         0.41391868
  0.43412157 0.60522753 0.60640731 0.34668762 0.48280455 0.42808634
  0.28644595 0.52357031 0.48546804 0.60522

## Get the pagerank scores


In [23]:
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 0.015883654390382813, 1: 0.01576381440435151, 2: 0.051495184457707927, 3: 0.062132092943485234, 4: 0.05418849799194232, 5: 0.05714965413001391, 6: 0.0622101774637092, 7: 0.06311624066649558, 8: 0.06514762299611589, 9: 0.05586524510806801, 10: 0.06485938665397875, 11: 0.06058016701507174, 12: 0.04806597468938474, 13: 0.06771318864915736, 14: 0.06616863876673865, 15: 0.06242211260260011, 16: 0.06774627770331171, 17: 0.059492069367484746}


In [24]:
#Sort sentences by pagerank
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in
enumerate(sentences)), reverse=True)
print("Indexes of top ranked_sentence order are \n\n",
ranked_sentence)

Indexes of top ranked_sentence order are 

 [(0.06774627770331171, '여행을 하면 새로운 경험을 할 수 있어요'), (0.06771318864915736, '일을 열심히 하면 좋은 결과가 있을 거예요'), (0.06616863876673865, '음악을 듣는 것이 마음을 편안하게 해줍니다'), (0.06514762299611589, '친구들과 함께 시간을 보내는 게 좋아요'), (0.06485938665397875, '여름에는 바다로 여행을 가고 싶어요'), (0.06311624066649558, '운동을 해서 몸이 좋아졌어요'), (0.06242211260260011, '저는 한국 음식을 좋아합니다'), (0.0622101774637092, '책을 읽는 것이 취미입니다'), (0.062132092943485234, '학교에 가는 길이 멀어요'), (0.06058016701507174, '겨울에 눈이 내리면 눈사람을 만들어요'), (0.059492069367484746, '화이팅! 무엇이든 해낼 수 있어요.'), (0.05714965413001391, '배고파서 밥을 먹고 싶어요'), (0.05586524510806801, '가을에 단풍이 아름답습니다'), (0.05418849799194232, '저는 커피를 좋아해요'), (0.051495184457707927, '오늘은 날씨가 좋네요'), (0.04806597468938474, '사랑은 따뜻한 감정입니다'), (0.015883654390382813, '안녕하세요'), (0.01576381440435151, '반갑습니다')]


## Pick the top “n” sentences

In [25]:
#Step 5 - How many sentences to pick
n = int(input("How many sentences do you want in the summary? "))
#n=2
summarize_text = []
for i in range(n):
    summarize_text.append(" ".join(ranked_sentence[i][1]))

How many sentences do you want in the summary?  14


## Finish off by printing summary

In [26]:
# Step 6 - Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
 여 행 을   하 면   새 로 운   경 험 을   할   수   있 어 요. 일 을   열 심 히   하 면   좋 은   결 과 가   있 을   거 예 요. 음 악 을   듣 는   것 이   마 음 을   편 안 하 게   해 줍 니 다. 친 구 들 과   함 께   시 간 을   보 내 는   게   좋 아 요. 여 름 에 는   바 다 로   여 행 을   가 고   싶 어 요. 운 동 을   해 서   몸 이   좋 아 졌 어 요. 저 는   한 국   음 식 을   좋 아 합 니 다. 책 을   읽 는   것 이   취 미 입 니 다. 학 교 에   가 는   길 이   멀 어 요. 겨 울 에   눈 이   내 리 면   눈 사 람 을   만 들 어 요. 화 이 팅 !   무 엇 이 든   해 낼   수   있 어 요 .. 배 고 파 서   밥 을   먹 고   싶 어 요. 가 을 에   단 풍 이   아 름 답 습 니 다. 저 는   커 피 를   좋 아 해 요


## SWEDISH TEXT 

## Import needed modules


In [14]:
from nltk.corpus import stopwords #you can remove stop words for speed
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

## Opening file and split into sentences


In [30]:
file = open("C:\\Users\\sowky\\Downloads\\swedish text.txt", "r")
#This file contains one paragraph of multiple sentences
filedata = file.readlines()
article = filedata[0].split(". ") #Just do the first paragraph
sentences = []
for sentence in article:
    print(sentence)
    sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))

Hej! Jag hoppas att du har en bra dag
Solen skiner och fÃ¥glarna sjunger
Jag Ã¤lskar att gÃ¥ pÃ¥ promenader i parken
Det Ã¤r sÃ¥ vackert nÃ¤r blommorna blommar
Jag dricker gÃ¤rna en kopp kaffe pÃ¥ morgonen
Sedan gÃ¥r jag till jobbet och mÃ¶ter mina kollegor
PÃ¥ lunchrasten Ã¤ter jag en god sallad
PÃ¥ kvÃ¤llen lagar jag middag och tittar pÃ¥ TV
Ibland trÃ¤ffar jag mina vÃ¤nner och vi gÃ¥r pÃ¥ bio
Jag tycker om att resa och utforska nya platser
Det Ã¤r sÃ¥ spÃ¤nnande att upptÃ¤cka olika kulturer
NÃ¤r jag Ã¤r hemma Ã¤lskar jag att lÃ¤sa bÃ¶cker
Det Ã¤r sÃ¥ avkopplande att fÃ¶rlora sig i en bra historia
Jag fÃ¶redrar att spendera tid utomhus nÃ¤r vÃ¤dret Ã¤r fint
Jag Ã¤lskar att bada i havet pÃ¥ sommaren
PÃ¥ vintern njuter jag av att Ã¥ka skidor i fjÃ¤llen
Livet Ã¤r fullt av smÃ¥ glÃ¤djeÃ¤mnen att upptÃ¤cka varje dag
Jag Ã¤r tacksam fÃ¶r allt det vackra i livet
Att vara omgiven av mÃ¤nniskor jag Ã¤lskar Ã¤r det bÃ¤sta jag vet
Jag kÃ¤nner mig lyckligt lottad att ha sÃ¥dana fina vÃ¤nner
Jag 

In [31]:
print("Sentences are ", sentences)

Sentences are  [['Hej!', 'Jag', 'hoppas', 'att', 'du', 'har', 'en', 'bra', 'dag'], ['Solen', 'skiner', 'och', 'fÃ¥glarna', 'sjunger'], ['Jag', 'Ã¤lskar', 'att', 'gÃ¥', 'pÃ¥', 'promenader', 'i', 'parken'], ['Det', 'Ã¤r', 'sÃ¥', 'vackert', 'nÃ¤r', 'blommorna', 'blommar'], ['Jag', 'dricker', 'gÃ¤rna', 'en', 'kopp', 'kaffe', 'pÃ¥', 'morgonen'], ['Sedan', 'gÃ¥r', 'jag', 'till', 'jobbet', 'och', 'mÃ¶ter', 'mina', 'kollegor'], ['PÃ¥', 'lunchrasten', 'Ã¤ter', 'jag', 'en', 'god', 'sallad'], ['PÃ¥', 'kvÃ¤llen', 'lagar', 'jag', 'middag', 'och', 'tittar', 'pÃ¥', 'TV'], ['Ibland', 'trÃ¤ffar', 'jag', 'mina', 'vÃ¤nner', 'och', 'vi', 'gÃ¥r', 'pÃ¥', 'bio'], ['Jag', 'tycker', 'om', 'att', 'resa', 'och', 'utforska', 'nya', 'platser'], ['Det', 'Ã¤r', 'sÃ¥', 'spÃ¤nnande', 'att', 'upptÃ¤cka', 'olika', 'kulturer'], ['NÃ¤r', 'jag', 'Ã¤r', 'hemma', 'Ã¤lskar', 'jag', 'att', 'lÃ¤sa', 'bÃ¶cker'], ['Det', 'Ã¤r', 'sÃ¥', 'avkopplande', 'att', 'fÃ¶rlora', 'sig', 'i', 'en', 'bra', 'historia'], ['Jag', 'fÃ¶redrar', 'at

## Function to calculate similarity


In [32]:
#counting the number of common words
def sentence_similarity(sent1, sent2 ):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)

## Create the similarity matrix (slide 17)¶


In [33]:
similarity_matrix = np.zeros((len(sentences), len(sentences)))
for idx1 in range(len(sentences)):
    for idx2 in range(len(sentences)):
        if idx1 == idx2: #ignore if both are same sentences
         continue
        similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1],
sentences[idx2])
print("Smilarity matrix \n", similarity_matrix)

Smilarity matrix 
 [[0.         0.         0.23570226 0.         0.23570226 0.11111111
  0.25197632 0.10050378 0.10540926 0.22222222 0.11785113 0.30151134
  0.30151134 0.21081851 0.23570226 0.21081851 0.21081851 0.11111111
  0.26726124 0.21081851 0.19245009 0.2773501 ]
 [0.         0.         0.         0.         0.         0.1490712
  0.         0.13483997 0.14142136 0.1490712  0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.12909944 0.        ]
 [0.23570226 0.         0.         0.         0.25       0.11785113
  0.26726124 0.31980107 0.2236068  0.23570226 0.125      0.42640143
  0.21320072 0.2236068  0.625      0.4472136  0.1118034  0.23570226
  0.37796447 0.2236068  0.10206207 0.09805807]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.40089186 0.22792115
  0.34188173 0.23904572 0.         0.         0.11952286 0.25197632
  0.20203051 0.         0.         0.        ]
 [

## Get the pagerank scores


In [34]:
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 0.05136709363849329, 1: 0.015105655407931119, 2: 0.06139632020618654, 3: 0.026969234724040985, 4: 0.04274620939019202, 5: 0.03821803919048399, 6: 0.044951914961592496, 7: 0.04616666248865532, 8: 0.045403655491956726, 9: 0.04905210958855436, 10: 0.04041698612036363, 11: 0.06869850204108985, 12: 0.04547566937387921, 13: 0.05063112543546136, 14: 0.06139632020618654, 15: 0.0550341354172794, 16: 0.03761961020691378, 17: 0.04812500008121058, 18: 0.06635011717852282, 19: 0.042067087952555086, 20: 0.03664497013911646, 21: 0.02616358075933443}


In [35]:
#Sort sentences by pagerank
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in
enumerate(sentences)), reverse=True)
print("Indexes of top ranked_sentence order are \n\n",
ranked_sentence)


Indexes of top ranked_sentence order are 

 [(0.06869850204108985, ['NÃ¤r', 'jag', 'Ã¤r', 'hemma', 'Ã¤lskar', 'jag', 'att', 'lÃ¤sa', 'bÃ¶cker']), (0.06635011717852282, ['Att', 'vara', 'omgiven', 'av', 'mÃ¤nniskor', 'jag', 'Ã¤lskar', 'Ã¤r', 'det', 'bÃ¤sta', 'jag', 'vet']), (0.06139632020618654, ['Jag', 'Ã¤lskar', 'att', 'gÃ¥', 'pÃ¥', 'promenader', 'i', 'parken']), (0.06139632020618654, ['Jag', 'Ã¤lskar', 'att', 'bada', 'i', 'havet', 'pÃ¥', 'sommaren']), (0.0550341354172794, ['PÃ¥', 'vintern', 'njuter', 'jag', 'av', 'att', 'Ã¥ka', 'skidor', 'i', 'fjÃ¤llen']), (0.05136709363849329, ['Hej!', 'Jag', 'hoppas', 'att', 'du', 'har', 'en', 'bra', 'dag']), (0.05063112543546136, ['Jag', 'fÃ¶redrar', 'att', 'spendera', 'tid', 'utomhus', 'nÃ¤r', 'vÃ¤dret', 'Ã¤r', 'fint']), (0.04905210958855436, ['Jag', 'tycker', 'om', 'att', 'resa', 'och', 'utforska', 'nya', 'platser']), (0.04812500008121058, ['Jag', 'Ã¤r', 'tacksam', 'fÃ¶r', 'allt', 'det', 'vackra', 'i', 'livet']), (0.04616666248865532, ['PÃ¥', 'kv

## Pick the top “n” sentences

In [36]:
#Step 5 - How many sentences to pick
n = int(input("How many sentences do you want in the summary? "))
#n=2
summarize_text = []
for i in range(n):
    summarize_text.append(" ".join(ranked_sentence[i][1]))


How many sentences do you want in the summary?  20


## Finish off by printing summary

In [37]:
# Step 6 - Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
 NÃ¤r jag Ã¤r hemma Ã¤lskar jag att lÃ¤sa bÃ¶cker. Att vara omgiven av mÃ¤nniskor jag Ã¤lskar Ã¤r det bÃ¤sta jag vet. Jag Ã¤lskar att gÃ¥ pÃ¥ promenader i parken. Jag Ã¤lskar att bada i havet pÃ¥ sommaren. PÃ¥ vintern njuter jag av att Ã¥ka skidor i fjÃ¤llen. Hej! Jag hoppas att du har en bra dag. Jag fÃ¶redrar att spendera tid utomhus nÃ¤r vÃ¤dret Ã¤r fint. Jag tycker om att resa och utforska nya platser. Jag Ã¤r tacksam fÃ¶r allt det vackra i livet. PÃ¥ kvÃ¤llen lagar jag middag och tittar pÃ¥ TV. Det Ã¤r sÃ¥ avkopplande att fÃ¶rlora sig i en bra historia. Ibland trÃ¤ffar jag mina vÃ¤nner och vi gÃ¥r pÃ¥ bio. PÃ¥ lunchrasten Ã¤ter jag en god sallad. Jag dricker gÃ¤rna en kopp kaffe pÃ¥ morgonen. Jag kÃ¤nner mig lyckligt lottad att ha sÃ¥dana fina vÃ¤nner. Det Ã¤r sÃ¥ spÃ¤nnande att upptÃ¤cka olika kulturer. Sedan gÃ¥r jag till jobbet och mÃ¶ter mina kollegor. Livet Ã¤r fullt av smÃ¥ glÃ¤djeÃ¤mnen att upptÃ¤cka varje dag. Jag fÃ¶rsÃ¶ker leva varje dag till fullo och u