# Beginner's exercises

In [1]:
import cltk

In [2]:
from cltk.corpus.utils.importer import CorpusImporter

In [3]:
from cltk.tokenize.indian_tokenizer import indian_punctuation_tokenize_regex as i_word

In [4]:
import os

# Lexical Diversity

### Set the path of the Hindi text to be used in this exercise

In [5]:
# Build the path from the current user's home directory instead of hard-coding
# one machine's absolute path, so the notebook runs for any user.
# NOTE(review): the 'hindi_text_ltrc' corpus must already exist under
# ~/cltk_data -- no cell shown here downloads it; confirm or add an import step.
hindi_text_path = os.path.join(
    os.path.expanduser('~'),
    'cltk_data', 'hindi_text_ltrc', 'miscellaneous', 'gandhi', 'main.txt',
)

### Open and read the Hindi text

In [6]:
# Read the whole file through a context manager so the handle is closed as
# soon as the read finishes, and decode explicitly as UTF-8 so the Devanagari
# text does not depend on the platform's default encoding.
with open(hindi_text_path, 'r', encoding='utf-8') as hindi_file:
    hindi_text = hindi_file.read()

In [7]:
# Preview the first 200 characters to confirm the file was read and decoded.
hindi_text[:200]

'10्र प्रति ा वापस नहीं ली जातीएक बार कस्तुरबा गांधी बहुत बीमार हो गईं। जलर्  चिकित्सा से उन्हें कोई लाभ नहीं हुआ। दूसरे उपचार किये गये। उनमे भी सफलता नहीं मिली। अंत में गांधीजी ने उन्हें नमक और दाल छो'

### Tokenize the Hindi text

In [8]:
# Split the raw text into tokens with the tokenizer imported above as `i_word`.
# NOTE(review): judging by the printed sample below, punctuation marks come
# back as separate tokens and some tokens keep a leading newline
# (e.g. '\nतुम'); both will influence the word counts computed later.
hindi_text_tokenize = i_word(hindi_text)

In [9]:
# Show the first 100 tokens as a sanity check of the tokenizer output.
print(hindi_text_tokenize[:100])

['10्र', 'प्रति', 'ा', 'वापस', 'नहीं', 'ली', 'जातीएक', 'बार', 'कस्तुरबा', 'गांधी', 'बहुत', 'बीमार', 'हो', 'गईं', '।', 'जलर्', 'चिकित्सा', 'से', 'उन्हें', 'कोई', 'लाभ', 'नहीं', 'हुआ', '।', 'दूसरे', 'उपचार', 'किये', 'गये', '।', 'उनमे', 'भी', 'सफलता', 'नहीं', 'मिली', '।', 'अंत', 'में', 'गांधीजी', 'ने', 'उन्हें', 'नमक', 'और', 'दाल', 'छोडने', 'की', 'सलाह', 'दी', '।', 'परन्तु', 'इसके', 'लिए', 'बा', 'तैयार', 'नहीं', 'हुईं', '।', 'गांधीजी', 'ने', 'बहुत', 'समझाया', '.', '\nपोथियों', 'से', 'प्रमाण', 'पढकर', 'सुनाये', '.', 'लेकर', 'सब', 'व्यर्थ', '।', 'बा', 'बोलीं', '.', '"', 'कोई', 'आपसे', 'कहे', 'कि', 'दाल', 'और', 'नमक', 'छोड', 'दो', 'तो', 'आप', 'भी', 'नहीं', 'छोडेंगे', '।', '"', 'गांधीजी', 'ने', 'तुरन्त', 'प्रसÙ', 'होकर', 'कहा', '.', '"', '\nतुम']


### Count the total number of words in the Hindi text

In [10]:
# Total token count: every item the tokenizer produced, repeats included.
total_num_of_words = len(hindi_text_tokenize)
print(total_num_of_words)

18445


### Count the number of unique words

In [11]:
# Collapse repeated tokens into a set, then count the distinct ones.
unique_words = set(hindi_text_tokenize)
unique_num_of_words = len(unique_words)
print(unique_num_of_words)

3101


### Compute the lexical diversity

In [12]:
# Lexical diversity = number of distinct tokens / total number of tokens.
# An empty text yields zero tokens, so report 0.0 in that case instead of
# letting the division raise ZeroDivisionError.
if total_num_of_words:
    lexical_diversity_of_text = unique_num_of_words / total_num_of_words
else:
    lexical_diversity_of_text = 0.0
print(lexical_diversity_of_text)

0.1681214421252372


# Word and sentence length

### Imports

In [13]:
from bs4 import BeautifulSoup
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer


### Import the Latin text

In [14]:
# Fetch the Latin resources this section depends on.
latin_corpus_importer = CorpusImporter('latin')
latin_corpus_importer.import_corpus('latin_models_cltk')
# The Bede file read below lives inside the 'latin_text_perseus' corpus, so
# import it as well; otherwise a fresh machine has no file at that path.
latin_corpus_importer.import_corpus('latin_text_perseus')

# Resolve the file relative to the current user's home directory rather than
# a hard-coded, machine-specific absolute path.
latin_text_path = os.path.join(
    os.path.expanduser('~'),
    'cltk_data', 'latin', 'text', 'latin_text_perseus',
    'Bede', 'opensource', 'bede.hega_lat.xml',
)

### Open and read the text

In [20]:
# Read the XML source through a context manager so the file handle is closed
# promptly, and keep the raw markup under its own name instead of rebinding
# `latin_text` from markup to plain text halfway through the cell.
# NOTE(review): UTF-8 is assumed for this file -- confirm against the corpus.
with open(latin_text_path, 'r', encoding='utf-8') as latin_file:
    latin_xml = latin_file.read()

# Strip the markup and keep only the text content.
# NOTE(review): the output shows DOCTYPE/entity remnants such as
# '%PersProse;' surviving this step; they end up in the later token counts.
soup = BeautifulSoup(latin_xml, "html5lib")
latin_text = soup.get_text()
print(latin_text[:200])

%PersProse;
]>




Historiam ecclesiasticam gentis Anglorum
The Venerable Bede
Charles Plummer
&responsibility;
&fund.NEH;

&Perseus.publish;



The Venerable Bede
Historiam ecclesiasticam gentis Angl


### Create the sentence tokenizer

In [16]:
# Sentence tokenizer configured for Latin.
sentence_tokenizer = TokenizeSentence("latin")

### Create a list of all the tokenized sentences

In [17]:
# Split the extracted plain text into sentences.
# NOTE(review): `latin_text` still carries the file's header and revision-log
# text (visible in the outputs), so the first "sentences" are not Latin prose.
sentence_list = sentence_tokenizer.tokenize_sentences(latin_text)

### Create a word tokenizer and count the number of words in each tokenized sentence

In [18]:
# Word tokenizer configured for Latin; reused by the cells that follow.
word_tokenizer = WordTokenizer('latin')

# Echo every sentence, then report how many word tokens it contains.
for sentence in sentence_list:
    print(sentence)
    words = list(word_tokenizer.tokenize(sentence))
    print("No. of words in this sentence: ", len(words))

%PersProse;
]>




Historiam ecclesiasticam gentis Anglorum
The Venerable Bede
Charles Plummer
&responsibility;
&fund.NEH;

&Perseus.publish;



The Venerable Bede
Historiam ecclesiasticam gentis Anglorum
Charles Plummer

Oxfordii
e typographeo Clarendoniano
1896


















Latin
English
Greek
Italian
French
German


     
    
      1 Jul 2004
        DMed.
No. of words in this sentence:  49
$Log:
No. of words in this sentence:  2
bede.hega_lat.xml,v $
Revision 1.2  2010-06-29 21:31:50  student
fixing place tags - AD

Revision 1.1  2009/10/08 19:14:06  rsingh04
began reorganizing texts module by collection.
No. of words in this sentence:  36
created separate work directory in texts module to keep hopper files separate from in progress files

Revision 1.5  2006/05/05 17:38:42  packel
fixed date tags

Revision 1.4  2006/02/10 20:50:46  packel
places/dates now actually tagged

Revision 1.3  2004/11/22 15:54:26  mimno
added div

Revision 1.2  2004/06/02 13:20:05  amahoney
correct C

### Tokenize the full text into words and print the number of characters in each word

In [19]:
# Tokenize the whole document in a single pass.
total_words = word_tokenizer.tokenize(latin_text)

# Emit the character length of every token, one value per line.
for word in total_words:
    word_length = len(word)
    print(word_length)

10
1
1
1
9
14
6
8
3
9
4
7
7
1
14
1
1
8
1
1
15
1
3
9
4
9
14
6
8
7
7
8
1
11
13
4
5
7
5
6
3
6
5
3
1
3
4
4
1
4
1
19
1
7
3
3
10
2
1
2
1
2
7
6
5
4
1
2
7
3
3
10
2
1
2
1
2
8
4
3
12
5
6
2
9
3
1
7
8
4
9
2
5
6
2
4
6
5
8
4
2
8
5
7
3
3
10
2
1
2
1
2
6
5
4
4
7
3
3
10
2
1
2
1
2
6
12
3
8
6
7
3
3
10
2
1
2
1
2
5
5
3
7
3
3
10
2
1
2
1
2
8
7
3
7
2
6
10
3
13
4
9
5
7
7
2
9
9
6
8
14
1
4
9
3
2
8
1
4
2
3
13
1
5
8
1
12
4
11
1
3
1
2
5
2
8
2
9
9
1
2
4
2
15
2
7
2
7
10
12
1
5
4
7
4
12
9
1
3
3
5
9
10
7
6
5
7
10
1
5
5
9
7
6
4
6
1
2
6
7
6
7
10
1
5
10
8
1
4
4
8
2
5
4
7
1
2
9
4
2
4
3
2
7
1
5
7
10
10
1
3
4
10
2
6
1
11
10
2
4
7
4
6
9
4
6
3
2
9
1
4
10
2
10
2
1
4
4
2
3
5
4
10
1
10
1
4
5
2
6
14
12
1
9
9
2
8
4
5
2
3
1
6
2
8
6
9
10
1
2
9
5
7
6
9
9
1
3
11
1
2
5
2
3
1
4
7
1
3
4
1
3
7
11
4
10
5
9
10
9
9
1
6
4
6
10
9
1
8
8
6
1
6
4
5
5
7
8
5
7
1
7
4
15
1
3
3
5
11
1
7
1
3
2
8
12
1
6
8
8
13
2
8
6
1
5
13
5
13
1
10
1
10
5
1
4
2
4
12
9
1
3
5
2
9
5
10
1
10
5
5
8
5
5
1
3
10
10
1
3
8
10
10
1
2
2
4
2
3
1
4
7
5
10
1
3
10
7
1
12
9
11
9
1
4
8
7
