## Counting Paragraphs

In [1]:
import re
import pandas as pd
import os
import time
from tqdm.notebook import tqdm
import unidecode
import numpy as np

from preprocessing_functions import *

In [2]:
def count_pars(article):
    """count number of paragraphs in an article"""
    count = 0 # initialise count at 0

    # split article into paragraphs (by using '\n' as end of paragraph)
    paragraphs = article.splitlines()
    for paragraph in paragraphs:
        # if paragraph empty skip
        if not paragraph:
            continue

        # check if paragraph > 2
        length = len(paragraph.split(' '))
        if length >= 2:
            count += 1
    
    return count

In [3]:
# language = 'fr' #'fr'
lang_list = ['en', 'fr']
indir = f'../../../../data/' # path to files that are not in directory

In [4]:
for lang in lang_list:     
    # set up df structure
    df = pd.DataFrame(columns = ['article_id', 'title', 'text'])
    
    # path to directory with article .csv files
    inputfp = os.path.join(indir, f'{lang}wiki/')
    
    # loop over csv files
    for file in os.listdir(inputfp):
        fp = os.path.join(inputfp, file)
        df_temp = pd.read_csv(fp)
        df = pd.concat([df, df_temp]) # add into one single df
        
    # loop over df to get article length
    word_counts = [len(text.split()) for text in df.text]
    print(f"The average article in {lang}wiki is {round(np.mean(word_counts))} words.")
    print(f"The {lang}wiki corpus has a total of {sum(word_counts)} words.")

    
    # loop over articles
    total_count = 0
    for article in tqdm(df.text, total = len(df.text), desc = "Articles processed"): 
        total_count += count_pars(article)
    
    print(f"{lang}wiki consists of {len(df)} articles. \nMaking up a total of {total_count} paragraphs where a collocation could occur.")

The average article in enwiki is 1142 words.
The enwiki corpus has a total of 580791582 words.


Articles processed:   0%|          | 0/508671 [00:00<?, ?it/s]

enwiki consists of 508671 articles. 
Making up a total of 10436996 paragraphs where a collocation could occur.
The average article in frwiki is 1042 words.
The frwiki corpus has a total of 286122298 words.


Articles processed:   0%|          | 0/274578 [00:00<?, ?it/s]

frwiki consists of 274578 articles. 
Making up a total of 6005355 paragraphs where a collocation could occur.


enwiki:  
Total number of paragraphs where a collocation could occur is: **10,436,996**  
Total number of articles: **508,671**  
Average article length: **1142**  
Total number of words: **580,791,582**  

frwiki:  
Total number of paragraphs where a collocation could occur is: **6,005,355**  
Total number of articles: **274,578**  
Average article length: **1042**  
Total number of words: **6005355**  