In [28]:
import os.path
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
import nltk
from nltk.util import ngrams
from nltk.collocations import *
from snowballstemmer import stemmer
from nltk import word_tokenize

In [40]:
file = open('Mustafa_Kemal_Atatürk.txt',encoding="utf-8")
file=file.read()

tokenized = file.split()
tokenized = [token.lower() for token in tokenized if token.isalpha()]
tokenized

['mustafa',
 'kemal',
 'atatürk',
 'kasım',
 'türk',
 'devlet',
 'adamı',
 've',
 'türkiye',
 'mustafa',
 'kemal',
 'birinci',
 'dünya',
 'savaşı',
 'sırasında',
 'osmanlı',
 'ordusuna',
 'hizmet',
 'eden',
 'çanakkale',
 'sina',
 've',
 'filistin',
 'ise',
 'yıldırım',
 'orduları',
 'komutanlığına',
 'savaşın',
 'sonunda',
 'osmanlı',
 'yenilgisini',
 'takiben',
 'kurtuluş',
 'savaşı',
 'ile',
 'simgelenen',
 'türk',
 'ulusal',
 'önderlik',
 'türk',
 'kurtuluş',
 'savaşı',
 'sürecinde',
 'ankara',
 'türk',
 'orduları',
 'başkomutanı',
 'olarak',
 'sakarya',
 'meydan',
 'başarısından',
 'dolayı',
 'eylül',
 'tarihinde',
 'unvanını',
 'aldı',
 've',
 'mareşalliğe',
 'askerî',
 've',
 'siyasi',
 'eylemleriyle',
 'i̇tilaf',
 'devletleri',
 've',
 'onların',
 'iş',
 'birlikçilerine',
 'karşı',
 'zafer',
 'savaşın',
 'ardından',
 'cumhuriyet',
 'halk',
 'halk',
 'fırkası',
 'adıyla',
 'kurdu',
 've',
 'ilk',
 'genel',
 'başkanı',
 'ekim',
 'cumhuriyetin',
 'ilanının',
 'akabinde',
 'cumhurb

In [41]:
from nltk.probability import FreqDist
fdist=FreqDist(tokenized)
fdist.most_common(10)

[('mustafa', 635),
 ('ve', 598),
 ('kemal', 544),
 ('bir', 282),
 ('bu', 175),
 ('ile', 163),
 ('olarak', 107),
 ('için', 95),
 ('türk', 91),
 ('sonra', 91)]

In [42]:
bigram= nltk.collocations.BigramAssocMeasures()
finder2 = BigramCollocationFinder.from_words(tokenized)
finder2.apply_freq_filter(5)
print("Bigram :")

sorted(finder2.ngram_fd.items(),key=lambda t:(-t[1],t[0]))[:10]

Bigram :


[(('mustafa', 'kemal'), 517),
 (('daha', 'sonra'), 22),
 (('büyük', 'millet'), 19),
 (('kemal', 'mustafa'), 16),
 (('ali', 'rıza'), 15),
 (('gazi', 'mustafa'), 15),
 (('türkiye', 'büyük'), 15),
 (('olarak', 'mustafa'), 14),
 (('ve', 'mustafa'), 14),
 (('i̇ttihat', 've'), 13)]

In [43]:
trigram=nltk.collocations.TrigramAssocMeasures
finder3=TrigramCollocationFinder.from_words(tokenized)
finder3.apply_freq_filter(5) #eşik değeri
print("Trigram :")
sorted(finder3.ngram_fd.items(),key=lambda t:(-t[1],t[0]))[:10]

Trigram :


[(('türkiye', 'büyük', 'millet'), 15),
 (('olarak', 'mustafa', 'kemal'), 14),
 (('kemal', 'mustafa', 'kemal'), 12),
 (('mustafa', 'kemal', 'mustafa'), 12),
 (('mustafa', 'kemal', 'paşa'), 12),
 (('mustafa', 'kemal', 've'), 12),
 (('mustafa', 'kemal', 'atatürk'), 11),
 (('mustafa', 'kemal', 'bu'), 10),
 (('mustafa', 'kemal', 'kemal'), 10),
 (('ve', 'mustafa', 'kemal'), 10)]

In [44]:
def open_file(path,file_name):
    documents_list = []
    titles=[]
    with open( os.path.join(path, file_name) ,"r",encoding="utf-8") as fin:
        for line in fin.readlines():
            line = line.strip()
            documents_list.append(line)
    print("Total Number of Documents:",len(documents_list))
    titles.append( line[0:min(len(line),200)] )
    return documents_list,titles

In [45]:
def data_process(doc_set):
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = set(stopwords.words('turkish'))
    turkStem = stemmer('turkish')
    texts = []
    for i in doc_set:
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if not i in en_stop]
       # stemmed_tokens = [turkStem.stemWord(i) for i in stopped_tokens]
        texts.append(stopped_tokens)
    return texts

In [46]:
def prepare_corpus(doc_clean):
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    return dictionary,doc_term_matrix

In [47]:
def create_LSA_model(doc_clean,number_of_topics,words):
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel

In [48]:
document_list,titles=open_file("","Mustafa_Kemal_Atatürk.txt")
clean_text=data_process(document_list)
model=create_LSA_model(clean_text,1,2)

Total Number of Documents: 1416
[(0, '0.598*"kemal" + 0.557*"mustafa"')]
