In [None]:
from collections import Counter

import nltk
from nltk import FreqDist


#nltk.download()

# Read the whole article into one string; the tokenizing cells below work on `data`.
with open('2020.txt', mode='r', encoding="UTF-8") as article_file:
    data = article_file.read()

### Tokeniziranje teksta

In [None]:
from nltk.tokenize import word_tokenize

# Split the raw article text into tokens with NLTK's default word tokenizer.
tokens = word_tokenize(data)
print("Tokenizirani tekst\n")
print(tokens)
# Print the token count together with its label (the label alone carries no information).
print("Broj tokena", len(tokens))

### Tokeniziranje teksta na način da vrati samo riječi bez interpunkcijskih znakova

In [None]:
from nltk.tokenize import RegexpTokenizer

# The pattern r'\w+' keeps only runs of word characters, so punctuation never becomes a token.
# Note: this replaces the `tokens` list produced by the previous cell.
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(data)
print("Tokenizirani tekst bez interpunkcijskih znakova\n", tokens, sep="\n")

### Riječi s velikim slovima pretvaramo u mala slova kako bismo lakše maknuli stop wordove

In [None]:
from nltk.corpus import stopwords

# Lower-case every token so it can be compared against the stop-word list below.
tokens = [word.lower() for word in tokens]
fresh_tokens = list(tokens)

# Build the stop-word set once, outside the filter, instead of re-creating it for every token.
english_stop_words = set(stopwords.words("english"))
fresh_tokens_without_stop_words = [
    word for word in fresh_tokens if word not in english_stop_words
]

# Frequencies of the raw (not yet stemmed or lemmatized) content words.
word_count = Counter(fresh_tokens_without_stop_words)
print("Najfrekventnije riječi prije lematizacije i stematizacije")
print(word_count.most_common(30))

fd = FreqDist(fresh_tokens_without_stop_words)
fd.plot(30, title='30 najfrekventnijih riječi prije lematizacije i stematizacije')

### Stematizacija - PorterStemmer

In [None]:
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()
# Stem every entry of `tokens` (stop words included), preserving the original order so
# that stemmed_tokens[i] always corresponds to tokens[i].
stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]

print("Stematizirani tekst")
print(stemmed_tokens)

### Lematizacija

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# The lemmatizer runs on the Porter stems (not on the original words) and is called
# without a part-of-speech argument; order and length of the list are preserved.
lemmatized_tokens = [lemmatizer.lemmatize(stem) for stem in stemmed_tokens]

print("Lematizirani tekst")
print(lemmatized_tokens)

### Najfrekventnije riječi

In [None]:
# Frequency table over all lemmatized tokens; stop words have not been removed at this point.
word_count = Counter(lemmatized_tokens)
top_30_words = word_count.most_common(30)
print("Najfrekventnije riječi ", top_30_words)

# Same data as a FreqDist, used only for the frequency plot.
fd = FreqDist(lemmatized_tokens)
fd.plot(30, title='30 najfrekventnijih riječi')


### Najfrekventnije riječi bez stop word-ova

In [None]:
from nltk.corpus import stopwords

nltk_stop_words = set(stopwords.words("english"))
# lemmatized_tokens were produced by porter_stemmer followed by lemmatizer, so a stop word
# whose stem differs from its dictionary form (e.g. "this", "was") would not be found in
# the raw stop-word list. Run the stop words through the same two steps and filter on
# both the raw and the normalized forms.
normalized_stop_words = nltk_stop_words | {
    lemmatizer.lemmatize(porter_stemmer.stem(stop_word))
    for stop_word in nltk_stop_words
}
tokens_without_stop_words = [
    word for word in lemmatized_tokens if word not in normalized_stop_words
]

word_count_without_stop_words = Counter(tokens_without_stop_words)
print("Najfrekventije riječi bez stop word-ova")
print(word_count_without_stop_words.most_common(30))
# Build the plot data from the token list, like the other frequency cells do.
fd = FreqDist(tokens_without_stop_words)
fd.plot(30, title='30 najfrekventijih riječi bez stop word-ova')

### Concordance 10 najčešćih riječi u tekstu

In [None]:
text = nltk.Text(tokens)

ten_most_commont_words = word_count_without_stop_words.most_common(10)
print(ten_most_commont_words)

# The ranked entries are stemmed/lemmatized forms, whereas `text` is built from the
# unstemmed `tokens`, and concordance looks up whole tokens. A stem that differs from
# every surface word would therefore yield no matches. lemmatized_tokens[i] is derived
# from tokens[i], so collect, for each normalized form, how often each surface form occurs.
surface_forms = {}
for normalized, surface in zip(lemmatized_tokens, tokens):
    surface_forms.setdefault(normalized, Counter())[surface] += 1

for normalized, frequency in ten_most_commont_words:
    if normalized in surface_forms:
        # Look up the most frequent surface word behind this normalized form.
        lookup_word = surface_forms[normalized].most_common(1)[0][0]
    else:
        # Lists out of sync (cells run out of order): fall back to the normalized form.
        lookup_word = normalized
    print(normalized, '->', lookup_word)
    text.concordance(lookup_word)

### Kolokacije 10 najčešćih riječi

In [None]:
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder, TrigramAssocMeasures, TrigramCollocationFinder

bigram_measures = BigramAssocMeasures()
trigram_measures = TrigramAssocMeasures()

# --- Bigrams: rank by likelihood ratio, first unfiltered, then with a word filter. ---
bigram_finder = BigramCollocationFinder.from_words(tokens)
bigram_finder.apply_freq_filter(2)
print("Bigrami")
print(bigram_finder.nbest(bigram_measures.likelihood_ratio, 10))

# NOTE(review): the filter is a substring test, so 'it' and 'be' also drop words such as
# "with" or "between" — confirm that this is intended rather than whole-word matching.
bigram_fragments = ('it', 'be', 'didn', 'month', 'croatian', 'palma', 'alen', 'dinamo')
bigram_finder.apply_word_filter(
    lambda w: any(fragment in w for fragment in bigram_fragments)
)
print("Bigrami s filterom")
print(bigram_finder.nbest(bigram_measures.likelihood_ratio, 10))


# --- Trigrams: same procedure with a shorter fragment list. ---
trigram_finder = TrigramCollocationFinder.from_words(tokens)
trigram_finder.apply_freq_filter(2)
print("Trigrami")
print(trigram_finder.nbest(trigram_measures.likelihood_ratio, 10))

trigram_fragments = ('month', 'croatian')
trigram_finder.apply_word_filter(
    lambda w: any(fragment in w for fragment in trigram_fragments)
)
print("Trigrami s filterom")
print(trigram_finder.nbest(trigram_measures.likelihood_ratio, 10))


### Leksički diverzitet i postotak najčešće riječi u tekstu

In [None]:
def lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in ``text``.

    Args:
        text: A sized collection of hashable tokens, e.g. a list of strings.

    Returns:
        ``len(set(text)) / len(text)`` as a float, or 0.0 when ``text`` is empty
        (instead of raising ZeroDivisionError).
    """
    total = len(text)
    if total == 0:
        return 0.0
    return len(set(text)) / total

def percentage(count, total):
    """Return ``count`` expressed as a percentage of ``total``.

    Args:
        count: The part (number of occurrences).
        total: The whole (number of items overall).

    Returns:
        ``100 * count / total``, or 0.0 when ``total`` is zero (instead of raising
        ZeroDivisionError).
    """
    if total == 0:
        return 0.0
    return 100 * count / total

# Share of distinct forms among all lemmatized tokens.
diversity = lexical_diversity(lemmatized_tokens)
print('Leksicki diverzificitet ', diversity)

# Percentage of tokens equal to 'halilov', rounded to two decimals for display.
halilov_count = lemmatized_tokens.count('halilov')
halilov_share = percentage(halilov_count, len(lemmatized_tokens))
print('Postotak rijeci halilov u tekstu ', round(halilov_share, 2))

### Korpus teksta Brown

In [None]:
from nltk.corpus import brown

print('Brown kategorije:\n', brown.categories())
genres = brown.categories()
words = ['croatian', 'player', 'deal', 'loan', 'barça', 'career', 'debut', 'palmas',  'birmingham']

# Maps genre -> {word: count} for every target word that occurs in that genre.
# All matching words are kept per genre (a single [word, count] slot per genre would
# retain only the last match).
most_frequent_words_in_brown = {}
for genre in genres:
    # The target words are lower-case, so fold the corpus words to lower case before
    # counting; otherwise capitalized occurrences would never be matched.
    fdist = nltk.FreqDist(token.lower() for token in brown.words(categories = genre))
    found = {w: fdist[w] for w in words if fdist[w] > 0}
    if found:
        most_frequent_words_in_brown[genre] = found

print()
for key in most_frequent_words_in_brown:
    print(key, '->', most_frequent_words_in_brown[key])

In [None]:
from nltk.corpus import reuters

print('Reuters kategorije:\n', reuters.categories())
genres = reuters.categories()
words = ['croatian', 'player', 'deal', 'loan', 'barça', 'career', 'debut', 'palmas',  'birmingham']

# Maps category -> {word: count} for every target word that occurs in that category.
# All matching words are kept per category (a single [word, count] slot per category
# would retain only the last match).
most_frequent_words_in_reuters = {}
for genre in genres:
    # The target words are lower-case, so fold the corpus words to lower case before
    # counting; otherwise upper-case or capitalized occurrences would never be matched.
    fdist = nltk.FreqDist(token.lower() for token in reuters.words(categories = genre))
    found = {w: fdist[w] for w in words if fdist[w] > 0}
    if found:
        most_frequent_words_in_reuters[genre] = found

print()
for key in most_frequent_words_in_reuters:
    print(key, '->', most_frequent_words_in_reuters[key])

### Lingvističko stablo

In [None]:
# Sample sentence that the two chunking cells below tokenize, POS-tag and parse.
sentence = 'He set a string of records in the two seasons he spent in Dinamo’s first team'

In [None]:
# NP chunk rule: an optional determiner, an optional cardinal number, then exactly one
# singular or plural noun.
grammar_np = r"NP: {<DT>?<CD>?<NN|NNS>}"
chunk_parser1 = nltk.RegexpParser(grammar_np)

# Tokenize the sentence, then attach a POS tag to each token.
sentence_words1 = word_tokenize(sentence)
sent_tokens1 = nltk.pos_tag(sentence_words1)

# The last expression is the parse tree, which the notebook displays.
chunk_parser1.parse(sent_tokens1)

In [None]:
# VP chunk rule: optional pronoun, any number of verbs (VBP/VBN/VBD), any number of
# NP/NNP items, any number of prepositions, then an optional proper noun.
# NOTE(review): this grammar has no NP stage, so the `NP` alternative in <NP|NNP>
# presumably never matches here — confirm whether the NP rule was meant to be included.
grammar_vp = r"VP: {<PRP>?<VBP|VBN|VBD>*<NP|NNP>*<IN>*<NNP>?}"
chunk_parser2 = nltk.RegexpParser(grammar_vp)

# Tokenize the sentence, then attach a POS tag to each token.
sentence_words2 = word_tokenize(sentence)
sent_tokens2 = nltk.pos_tag(sentence_words2)

# The last expression is the parse tree, which the notebook displays.
chunk_parser2.parse(sent_tokens2)