In [35]:
# author: Sardar Jaf
#
# Generating statistics for a given corpus
# corpus structure can be based on directory structure as in:
# root
#   news
#     123.txt
#     ...
#   economic
#      123.txt
#      ...
#  other directories
# or based on file structure as in:
#  root
#    news.123.txt
#    ...
#    economics.456.txt
#    ....
#  OUTOUT
#sports:
    #Total documents: 1
    #Total sentences: 4
    #Total words: 170
    #Agv. sentence length: 42
    #Longest sentence: 24 (in document: sports)
    #Shortest sentence: 18 (in document: sports)

#economy:
    #Total documents: 1
    #Total sentences: 3
    #Total words: 88
    #Agv. sentence length: 29
    #Longest sentence: 33 (in document: economy)
    #Shortest sentence: 20 (in document: economy)
# ...

In [36]:
import os
import nltk
import codecs
import matplotlib.pyplot as plt

from nltk import sent_tokenize, word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/sardar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [37]:
stats = {}
all_words={}

In [11]:
# reading files from a directory list and compute various statistics about the corpus
# this function works if the corpus structure is directory based, where different files 
# are placed in different directories and each directory represents a topic 


# Setting the path for the corpus directory
PATH = os.getcwd()
dir_path = PATH + "/corpus_root_directory" # this should point to the data files
dir_list = os.listdir(dir_path)

for dirs in dir_list:  
    
    # store some stats in a dictionary
    stats[dirs] = {}
    stats[dirs]['total_documents'] = 0
    stats[dirs]['total_sentences'] = 0
    stats[dirs]['total_words'] = 0
    
    max_sent_len = 0
    min_sent_len = 1000 # there should be no sentence > 1000 words
    
    all_words[dirs] = [] 
    for files in os.listdir(dir_path+"/"+dirs):
        file_input = codecs.open(dir_path+"/"+dirs+"/"+files, encoding='utf-8')
        text = file_input.read()
        file_input.close()
        
        result = text.split('\n')
        for r in result:
            if len(r) > 2:
                stats[dirs]['total_sentences'] += 1 # update the total_sentences value in the dictionary for a dir (topic)
                #if dirs == 'poem':
                    #print(r,len(result))
                words_tokens = word_tokenize(r)                
                all_words[dirs] += words_tokens
                stats[dirs]['total_words'] += (len(words_tokens)) # update total_words value in the dictionary for a dir (topic)
                sent_length = len(words_tokens)
                
                if sent_length >= max_sent_len:
                    stats[dirs]['max_sent_len'] = sent_length
                    stats[dirs]['max_sent_len_doc'] = dirs+"/"+files
                
                if sent_length <= min_sent_len:
                    min_sent_len = sent_length
                    stats[dirs]['min_sent_len'] = min_sent_len
                    stats[dirs]['min_sent_len_doc'] = dirs+"/"+files
               
        stats[dirs]['total_documents'] += 1
        

news 2285.txt


In [40]:
# Setting the path for the corpus directory
PATH = os.getcwd()
dir_path = PATH + "/data" # this should point to the data files
dir_list = os.listdir(dir_path)

# reading files from a directory list and compute various statistics about the corpus
# this function work on a corpus structure that is based on file names
# i.e., the topic name for a file is in the file name e.g., news.123.txt, economy.123.txt
for dirs in dir_list:  
    
    dir_name, file_name = dirs.split('.')[0], dirs.split('.')[1]+'.txt'
    # store some stats in a dictionary
    stats[dir_name] = {}
    stats[dir_name]['total_documents'] = 0
    stats[dir_name]['total_sentences'] = 0
    stats[dir_name]['total_words'] = 0
    
    max_sent_len = 0
    min_sent_len = 1000 # there should be no sentence > 1000 words
        
    all_words[dir_name] = [] 
    #for files in os.listdir(dir_path+"/"+dirs):
    file_input = codecs.open(dir_path+"/"+dirs, encoding='utf-8')
    text = file_input.read()
    file_input.close()
       
    result = text.split('\n')
    for r in result:
        if len(r) > 2:
            stats[dir_name]['total_sentences'] += 1 # update the total_sentences value in the dictionary for a dir (topic)
            #if dirs == 'poem':
                #print(r,len(result))
            words_tokens = word_tokenize(r)                
            all_words[dir_name] += words_tokens
            stats[dir_name]['total_words'] += (len(words_tokens)) # update total_words value in the dictionary for a dir (topic)
            sent_length = len(words_tokens)
            
            if sent_length >= max_sent_len:
                stats[dir_name]['max_sent_len'] = sent_length
                stats[dir_name]['max_sent_len_doc'] = dir_name
               
            if sent_length <= min_sent_len:
                min_sent_len = sent_length
                stats[dir_name]['min_sent_len'] = min_sent_len
                stats[dir_name]['min_sent_len_doc'] = dir_name
               
    stats[dir_name]['total_documents'] += 1

    

In [39]:
total_docs = 0
total_words = 0
total_sents = 0
for key, values in stats.items():
    #if values['total_sentences']:
    #print(key, values['total_documents'], values['total_sentences'], values['total_words'], 'Avg. word/sentence', )
    print("%s:" %key)
    print("\tTotal documents: %d" %values['total_documents'])
    print("\tTotal sentences: %d" %values['total_sentences'])
    print("\tTotal words: %d" %values['total_words'])
    print("\tAgv. sentence length: %d" %(values['total_words'] / values['total_sentences']))
    print("\tLongest sentence: %d (in document: %s)"% (values['max_sent_len'], values['max_sent_len_doc']))
    print("\tShortest sentence: %d (in document: %s)\n"% (values['min_sent_len'], values['min_sent_len_doc']))
        



