### Objective
Convert the raw dataset (one text file per document, grouped into one folder per class) into a single CSV file for easier downstream use.

In [1]:
import os
import pandas as pd
import numpy  as np
from tqdm import tqdm

In [2]:
# Root of the raw corpus, resolved to an absolute path
DATA_DIR = os.path.abspath(os.path.join('.', 'dataset', '20_news'))
# Folder that receives the generated CSV files (kept relative to the working directory)
OUTPUT_DIR = './dataset/output'


### Converting raw data to csv format
Data Format: id, class, text

In [3]:
# File name for the CSV export
OUTPUT_FILENAME = 'data.csv'
# The raw documents are read straight from the dataset root
RAW_DATA_DIR = DATA_DIR

In [4]:
# Column layout of the resulting table
columns = ['id', 'class', 'text']
# Every entry directly under the raw data folder is treated as a candidate class name
classes = os.listdir(RAW_DATA_DIR)

In [5]:
# Column-oriented accumulator: one independent, initially empty list per output column
data_dict = {column_name: [] for column_name in ('id', 'class', 'text')}

In [6]:
# Function to try multiple encodings
def try_open_file(file_path):
    """Read a text file, trying several encodings, and return its stripped content.

    Parameters
    ----------
    file_path : str
        Path of the file to read.

    Returns
    -------
    str or None
        The decoded text with leading/trailing whitespace removed, or ``None``
        when the file could not be read (the error is printed, not raised).
    """
    # 'utf-8-sig' decodes ordinary UTF-8 and additionally drops a leading
    # byte-order mark; with plain 'utf-8' the BOM survives as U+FEFF, which
    # str.strip() does not remove, so it ends up at the start of the text.
    # ISO-8859-1 (alias latin-1) maps every possible byte, so it never raises
    # UnicodeDecodeError -- any encoding listed after it would be unreachable.
    encodings = ['utf-8-sig', 'ISO-8859-1']
    for encoding in encodings:
        try:
            with open(file_path, encoding=encoding) as content:
                return content.read().strip()
        except UnicodeDecodeError:
            # Wrong guess for this file: fall through to the next encoding
            continue
        except Exception as e:
            # Non-decoding failures (missing file, permissions, ...) will not
            # be fixed by another encoding, so report once and give up.
            print(f"Error reading {file_path} with encoding {encoding}: {e}")
            break
    return None

In [7]:
# Start from empty lists so that re-running this cell does not append every
# document a second time.
for collected in data_dict.values():
    collected.clear()

# Iterate over all classes (categories).
# os.listdir() returns entries in arbitrary order; sorting both the class
# folders and their files makes the row order reproducible between runs.
for label in sorted(classes):
    label_path = os.path.join(RAW_DATA_DIR, label)

    # Only directories are treated as classes; skip anything else
    if not os.path.isdir(label_path):
        continue

    for f in tqdm(sorted(os.listdir(label_path)), desc=label):
        # File name up to the first dot is used as the provisional document id
        id_ = f.split('.')[0]

        try:
            file_path = os.path.join(label_path, f)

            # Check if the file exists
            if not os.path.exists(file_path):
                print(f"File does not exist: {file_path}")
                continue

            # Attempt to read the file using multiple encodings
            text = try_open_file(file_path)

            # Skip the file if it couldn't be opened
            if text is None:
                print(f"Failed to decode {file_path}")
                continue

            # Add the data to the dictionary
            data_dict['id'].append(id_)
            data_dict['class'].append(label)
            data_dict['text'].append(text)

        except Exception as e:
            # Best effort: report the problem and keep loading the other files
            print(f"Failed to load {label}/{id_}: {e}")

100%|██████████| 100/100 [00:00<00:00, 118.93it/s]
100%|██████████| 95/95 [00:01<00:00, 91.37it/s] 
100%|██████████| 417/417 [00:07<00:00, 53.19it/s]
100%|██████████| 209/209 [00:03<00:00, 53.05it/s]
100%|██████████| 142/142 [00:02<00:00, 57.83it/s]
100%|██████████| 500/500 [00:05<00:00, 87.32it/s] 
100%|██████████| 85/85 [00:00<00:00, 91.72it/s] 
100%|██████████| 154/154 [00:01<00:00, 78.55it/s]
100%|██████████| 500/500 [00:05<00:00, 87.92it/s] 
100%|██████████| 31/31 [00:00<00:00, 48.90it/s]
100%|██████████| 229/229 [00:03<00:00, 60.67it/s]
100%|██████████| 102/102 [00:00<00:00, 135.00it/s]
100%|██████████| 111/111 [00:00<00:00, 137.76it/s]
100%|██████████| 500/500 [00:05<00:00, 94.69it/s] 
100%|██████████| 500/500 [00:05<00:00, 96.48it/s] 
100%|██████████| 253/253 [00:04<00:00, 53.72it/s]
100%|██████████| 500/500 [00:05<00:00, 83.62it/s]
100%|██████████| 100/100 [00:01<00:00, 86.54it/s]
100%|██████████| 150/150 [00:01<00:00, 96.93it/s] 
100%|██████████| 212/212 [00:01<00:00, 123.57i

In [8]:
# Number of documents that were loaded successfully
n_documents = len(data_dict['id'])
print(n_documents)

4890


In [9]:
# Turn the column lists into a table; every column is stored with object dtype
df = pd.DataFrame(
    data=data_dict,
    columns=columns,
    dtype=object,
)

In [10]:
# Mark empty strings as missing, then discard rows that have no text
nan_value = float("nan")
df = df.replace(to_replace="", value=nan_value)
df = df.dropna(subset=["text"])

In [11]:
# Renumber the rows 0..n-1 and overwrite 'id' with that new row number
df = df.reset_index(drop=True).assign(id=lambda frame: frame.index)
df.head()

Unnamed: 0,id,class,text
0,0,Agriculture,﻿काठमाडौंमा पहिलो पटक स्ट्रबेरीको व्यवसायिक खे...
1,1,Agriculture,"जिल्लाका किसानले लगाएको अदुवामा गानो कुहिने, ग..."
2,2,Agriculture,काभ्रेपलाञ्चोकमा कृषकले एसआरआई प्रविधिमा गरेको...
3,3,Agriculture,राजधानीमा यतिबेला तरकारीको मूल्य आकासिएको छ। क...
4,4,Agriculture,पाल पोल्ट्री तथा लाइभस्टक क्षेत्रको समग्र विका...


In [12]:
# Spot-check a single record: the row labelled 5, all columns
df.loc[5, :]

id                                                       5
class                                          Agriculture
text     माग अनुसारको आपूर्ति नभएपछि तरकारीको मुल्य दोब...
Name: 5, dtype: object

In [13]:
# Promote 'id' from a regular column to the row index
df = df.set_index(keys='id', drop=True)
df.head()

Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Agriculture,﻿काठमाडौंमा पहिलो पटक स्ट्रबेरीको व्यवसायिक खे...
1,Agriculture,"जिल्लाका किसानले लगाएको अदुवामा गानो कुहिने, ग..."
2,Agriculture,काभ्रेपलाञ्चोकमा कृषकले एसआरआई प्रविधिमा गरेको...
3,Agriculture,राजधानीमा यतिबेला तरकारीको मूल्य आकासिएको छ। क...
4,Agriculture,पाल पोल्ट्री तथा लाइभस्टक क्षेत्रको समग्र विका...


#### Merging Classes

In [14]:
# Merged class -> original classes that are folded into it
merged_groups = {
    'Economics': ['Bank', 'Business', 'Economy', 'Employment'],
    'Leisure': ['Sports', 'Entertainment'],
    'Politics & Society': ['Politics', 'Opinion', 'Society', 'Interview'],
    'World & Migration': ['World', 'Migration'],
    'Technology & Innovation': ['Technology', 'Automobiles', 'Agriculture', 'Tourism'],
    'Culture & Literature': ['Literature', 'Blog'],
    'Education': ['Education'],
    'Health': ['Health'],
}

# Invert the groups into the lookup used for relabelling: original class -> merged class
class_mapping = {
    original: merged
    for merged, originals in merged_groups.items()
    for original in originals
}

# Relabel every row; a class without an entry in the mapping keeps its original name
df['class'] = df['class'].map(lambda label: class_mapping.get(label, label))

In [15]:
# Save the dataframe with the merged classes to a new CSV file.

# Set the new output file name
MERGED_OUTPUT_FILENAME = 'merged_data.csv'

# to_csv does not create missing parent folders, so make sure the target exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# NOTE(review): index=False omits the index from the file. If 'id' has been
# moved into the index by this point, the CSV contains only 'class' and 'text',
# not the documented "id, class, text" layout -- confirm which is intended
# before changing it, since readers of the file may rely on the current columns.
df.to_csv(os.path.join(OUTPUT_DIR, MERGED_OUTPUT_FILENAME), index=False)



#### Check the distribution of the newly merged classes

In [16]:
# Count the documents per merged class and print the table under a heading
class_counts = df['class'].value_counts()
print("New class distribution after merging:", class_counts, sep="\n")

New class distribution after merging:
class
Politics & Society         1482
Economics                  1213
Leisure                    1000
Technology & Innovation     445
World & Migration           323
Culture & Literature        311
Education                    85
Health                       31
Name: count, dtype: int64


### Check for Duplicates and Missing Values

In [17]:
# Print the number of duplicated rows and missing values
print(f"Duplicated rows: {df.duplicated().sum()}")
print(f"Null values: {df.isnull().sum().sum()}")

Duplicated rows: 33
Null values: 0
