<a href="https://colab.research.google.com/github/souro/table_to_text/blob/main/wiki_honorifics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import json
import re
import os
import pandas as pd

In [4]:
def is_unicode_format(value):
    return bool(re.search(r'\\u[0-9A-Fa-f]{4}', value))

In [5]:
def convert_to_bengali(value):
    return value.encode().decode('unicode_escape')

In [6]:
def process_json_object(json_obj):
    for key, value in json_obj.items():
        if isinstance(value, str) and is_unicode_format(value):
            json_obj[key] = convert_to_bengali(value)
        elif isinstance(value, dict):
            process_json_object(value)
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    process_json_object(item)
    return json_obj

In [4]:
# input_file = 'sample_data/wiki_00'
# output_file = 'sample_data/wiki_bn1.json'

In [15]:
input_directory = 'sample_data/bn/'
output_directory = 'sample_data/bn_prcsd/'

In [8]:
pronouns = ["সে", "তারা", "তাঁরা", "ও", "ওরা", "তিনি", "তাহারা", "তার", "তাঁর", "তাদের", "তাঁদের", "ওর", "ওদের", "তাহার", "তাহাদের"]

def count_pronouns(text, pronouns):
    counts = {pronoun: text.count(pronoun) for pronoun in pronouns}
    return counts

In [16]:
data = []
for filename in os.listdir(input_directory):
    # if filename.endswith('.json'):
    filepath = os.path.join(input_directory, filename)
    input_file = filepath
    output_file = os.path.join(output_directory, filename)
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            json_obj = json.loads(line)

            title = json_obj.get('title', '')
            text = json_obj.get('text', '')

            pronoun_counts = count_pronouns(text, pronouns)
            pronoun_counts['title'] = title
            data.append(pronoun_counts)

            processed_obj = process_json_object(json_obj)
            json.dump(processed_obj, outfile, ensure_ascii=False)
            outfile.write('\n')

In [None]:
# !zip -r sample_data/bn_prcsd.zip sample_data/bn_prcsd/

In [22]:
print(data[:5])

[{'সে': 5, 'তারা': 0, 'তাঁরা': 0, 'ও': 7, 'ওরা': 0, 'তিনি': 10, 'তাহারা': 0, 'তার': 9, 'তাঁর': 0, 'তাদের': 0, 'তাঁদের': 0, 'ওর': 0, 'ওদের': 0, 'তাহার': 0, 'তাহাদের': 0, 'title': 'প্যাটসি ক্যাল্টন'}, {'সে': 0, 'তারা': 0, 'তাঁরা': 0, 'ও': 0, 'ওরা': 0, 'তিনি': 0, 'তাহারা': 0, 'তার': 0, 'তাঁর': 0, 'তাদের': 0, 'তাঁদের': 0, 'ওর': 0, 'ওদের': 0, 'তাহার': 0, 'তাহাদের': 0, 'title': 'চর গাজী ইউনিয়ন'}, {'সে': 2, 'তারা': 0, 'তাঁরা': 0, 'ও': 2, 'ওরা': 0, 'তিনি': 4, 'তাহারা': 0, 'তার': 1, 'তাঁর': 0, 'তাদের': 0, 'তাঁদের': 0, 'ওর': 0, 'ওদের': 0, 'তাহার': 0, 'তাহাদের': 0, 'title': 'স্টিফেন ডে (ব্রিটিশ রাজনীতিবিদ)'}, {'সে': 2, 'তারা': 0, 'তাঁরা': 0, 'ও': 2, 'ওরা': 0, 'তিনি': 1, 'তাহারা': 0, 'তার': 1, 'তাঁর': 0, 'তাদের': 0, 'তাঁদের': 0, 'ওর': 0, 'ওদের': 0, 'তাহার': 0, 'তাহাদের': 0, 'title': 'মার্ক হান্টার (রাজনীতিবিদ)'}, {'সে': 3, 'তারা': 1, 'তাঁরা': 0, 'ও': 5, 'ওরা': 0, 'তিনি': 0, 'তাহারা': 0, 'তার': 1, 'তাঁর': 0, 'তাদের': 0, 'তাঁদের': 0, 'ওর': 0, 'ওদের': 0, 'তাহার': 0, 'তাহাদের': 0, 'title': 'কুষ্টিয়া

In [20]:
freq_output_file = 'sample_data/output_bn.csv'

In [21]:
df = pd.DataFrame(data)

for pronoun in pronouns:
    if pronoun not in df.columns:
        df[pronoun] = 0

df = df[['title'] + pronouns]

df.to_csv(freq_output_file, index=False, encoding='utf-8')

In [19]:
df.head()

Unnamed: 0,title,সে,তারা,তাঁরা,ও,ওরা,তিনি,তাহারা,তার,তাঁর,তাদের,তাঁদের,ওর,ওদের,তাহার,তাহাদের
0,প্যাটসি ক্যাল্টন,5,0,0,7,0,10,0,9,0,0,0,0,0,0,0
1,চর গাজী ইউনিয়ন,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,স্টিফেন ডে (ব্রিটিশ রাজনীতিবিদ),2,0,0,2,0,4,0,1,0,0,0,0,0,0,0
3,মার্ক হান্টার (রাজনীতিবিদ),2,0,0,2,0,1,0,1,0,0,0,0,0,0,0
4,কুষ্টিয়া পৌর ভবন,3,1,0,5,0,0,0,1,0,0,0,0,0,0,0


In [5]:
!apt-get update # Update apt-get repository.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # Install Java.
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz # Download Apache Sparks.
!tar xf spark-3.1.1-bin-hadoop3.2.tgz # Unzip the tgz file.
!pip install -q findspark # Install findspark. Adds PySpark to the System path during runtime.

# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

!ls

# # Initialize findspark
# import findspark
# findspark.init()

# # Create a PySpark session
# from pyspark.sql import SparkSession
# spark = SparkSession.builder.master("local[*]").getOrCreate()
# spark

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.81)] [Connected to cloud.r-                                                                                                    Hit:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.81)] [Connected to cloud.r-                                                                                                    Hit:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.17.190.167)]                                                                                                    Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:6 http://s

In [1]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#Check this site for the latest download link https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Ign:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy Release
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
62 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/S

In [3]:
import os
import sys
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"


import findspark
findspark.init()
findspark.find()

import pyspark

from pyspark.sql import DataFrame, SparkSession

In [4]:
from pyspark import SparkContext
num_of_th = 48;
repartition_size = num_of_th*4;
chunk_size = 1000000
sc = SparkContext(master = "local[20]").getOrCreate()
spark = SparkSession(sc)

In [5]:
spark

In [13]:
!bunzip2 -d sample_data/bnwiki-latest-pages-articles.xml.bz2


bunzip2: Compressed file ends unexpectedly;
	perhaps it is corrupted?  *Possible* reason follows.
bunzip2: No such file or directory
	Input file = sample_data/bnwiki-latest-pages-articles.xml.bz2, output file = sample_data/bnwiki-latest-pages-articles.xml

It is possible that the compressed file(s) have become corrupted.
You can use the -tvv option to test integrity of such files.

You can use the `bzip2recover' program to attempt to recover
data from undamaged sections of corrupted files.

bunzip2: Deleting output file sample_data/bnwiki-latest-pages-articles.xml, if it exists.


In [14]:
file_rdd = spark.read.text("sample_data/bnwiki-latest-pages-articles.xml", wholetext=False)

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/content/sample_data/bnwiki-latest-pages-articles.xml.