## Preprocess and combine the whole dataset

In [4]:
%pip install pandas

Collecting pandas
  Downloading pandas-2.2.0-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2023.4-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.0-cp312-cp312-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.5 MB 660.6 kB/s eta 0:00:18
    --------------------------------------- 0.2/11.5 MB 2.4 MB/s eta 0:00:05
   ----- ---------------------------------- 1.7/11.5 MB 11.9 MB/s eta 0:00:01
   ---------- ----------------------------- 2.9/11.5 MB 17.1 MB/s eta 0:00:01
   ------------ --------------------------- 3.7/11.5 MB 18.3 MB/s eta 0:00:01
   --------------- ------------------------ 4.5/11.5 MB 18.0 MB/s eta 0:00:01
   ---------------- ------------

In [1]:
import os
import numpy as np

# Folder that holds the training CSV files.
train_folder_path = './db/train/'

# Keep only regular files (sub-directories are skipped). os.listdir() returns
# entries in arbitrary order, so sort them: later cells select files by
# position and need a stable, reproducible ordering.
filenames = sorted(
    entry for entry in os.listdir(train_folder_path)
    if os.path.isfile(os.path.join(train_folder_path, entry))
)

# Convert the list of filenames to a NumPy array
filenames_array = np.array(filenames)

# Display the filenames array
print(filenames_array)

['Business Documents.csv' 'Creative Documents.csv'
 'Educational Documents.csv' 'Financial Documents.csv'
 'Govt Documents.csv' 'Legal Documents.csv' 'Medical Documents.csv'
 'News Documents.csv' 'Scientific Doc2.csv' 'Scientific Documents.csv'
 'Technical Documents.csv']


In [2]:
# Strip the extension from every file name; the result is used later as the
# category label of each file. os.path.splitext removes only the final
# extension, so a name containing extra dots is not cut at the first dot.
file_names_without_extension = [
    os.path.splitext(filename)[0] for filename in filenames_array]
file_names_without_extension

['Business Documents',
 'Creative Documents',
 'Educational Documents',
 'Financial Documents',
 'Govt Documents',
 'Legal Documents',
 'Medical Documents',
 'News Documents',
 'Scientific Doc2',
 'Scientific Documents',
 'Technical Documents']

In [6]:
import pandas as pd

# Load the first training file to inspect its structure. The path is built
# from train_folder_path so it stays in sync with the directory listing.
# ISO-8859-1 is used because the files are not read as UTF-8 here.
df = pd.read_csv(os.path.join(train_folder_path, filenames_array[0]),
                 encoding='ISO-8859-1')
df

Unnamed: 0,content,category
0,Message-ID: <24216240.1075855687451.JavaMail.e...,Business Correspondence
1,Message-ID: <25140503.1075855687800.JavaMail.e...,Business Correspondence
2,Message-ID: <19034252.1075855687825.JavaMail.e...,Business Correspondence
3,Message-ID: <719350.1075855687850.JavaMail.eva...,Business Correspondence
4,Message-ID: <10523086.1075855687873.JavaMail.e...,Business Correspondence
...,...,...
1013,Message-ID: <21543395.1075855374340.JavaMail.e...,Business Correspondence
1014,Message-ID: <25363451.1075855374674.JavaMail.e...,Business Correspondence
1015,"Subject: Inviting quotation\r\nDear Sir,\r\nWe...",Business Correspondence
1016,To whom it may concern:\nElon Musks takeover ...,Business Correspondence


In [7]:
from sklearn.model_selection import train_test_split

import math
import pickle
from nltk.stem.porter import PorterStemmer
import string
from nltk.corpus import stopwords
import pandas as pd
import nltk
# Fetch the NLTK resources needed below: the 'punkt' tokenizer models and
# the 'stopwords' corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Single Porter stemmer instance shared by the text-cleaning function.
ps = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pulki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pulki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def transform_text(text):
    """Normalise a document into a space-separated string of stemmed tokens.

    Steps: lowercase the text, tokenize it with ``nltk.word_tokenize``, keep
    only alphanumeric tokens that are not English stop words, and stem each
    remaining token with the module-level Porter stemmer ``ps``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string when no
        token survives the filters).
    """
    # Load the stop-word list once per call and keep it in a set; calling
    # stopwords.words() for every token re-reads the corpus each time and
    # makes the membership test linear.
    stop_words = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(text.lower())

    # A token that passes isalnum() contains no punctuation characters, so a
    # separate check against string.punctuation is not needed.
    stems = [
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]

    return " ".join(stems)

In [9]:
def preprocesse_df(df, min_content_length=1000):
    """Build the model-ready frame from a cleaned document frame.

    Rows containing NaN and exact duplicate rows are dropped, documents whose
    raw ``content`` is shorter than ``min_content_length`` characters are
    discarded, and the remaining documents are normalised with
    ``transform_text``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the string column ``content`` and a ``category``
        column.
    min_content_length : int, default 1000
        Minimum number of characters ``content`` must have to be kept.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``transformed_content`` and ``category``,
        with duplicate rows removed. The original row index is preserved.
    """
    deduplicated = df.dropna().drop_duplicates(keep='first')

    # Filter on length *before* transforming: the transform is the expensive
    # step and short documents are thrown away anyway.
    long_enough = deduplicated[
        deduplicated['content'].str.len() >= min_content_length]

    # assign() returns a new frame, so no column is written into a slice of
    # another frame (which would trigger SettingWithCopyWarning).
    processed_df = long_enough.assign(
        transformed_content=long_enough['content'].apply(transform_text)
    )[['transformed_content', 'category']]

    # Different raw documents can collapse to the same transformed text.
    return processed_df.drop_duplicates()

In [10]:
def clean_df(df, label):
    """Clean a raw document frame in place and tag every row with ``label``.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from a CSV file. Its text column may be named
        ``Content`` or ``content``; an existing ``Category``/``category``
        column is overwritten.
    label : str
        Category value assigned to every remaining row.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after cleaning.
    """
    # Remove columns that contain no values at all
    df.dropna(axis=1, how='all', inplace=True)

    # Remove rows with any NaN value
    df.dropna(axis=0, how='any', inplace=True)

    # Remove duplicate rows
    df.drop_duplicates(inplace=True)

    # Normalise the column names to lowercase *before* writing the label:
    # assigning 'category' first and renaming 'Category' afterwards would
    # leave two columns both called 'category'.
    df.rename(columns={'Content': 'content',
              'Category': 'category'}, inplace=True)

    # Set 'category' column with the specified label
    df['category'] = label

    return df

In [11]:
# Trial run of the cleaning + preprocessing pipeline on a single file
# (position 4 in the file listing).
sample_idx = 4
df = pd.read_csv('./db/train/' + filenames_array[sample_idx],
                 encoding='ISO-8859-1')
new_df = preprocesse_df(clean_df(df, file_names_without_extension[sample_idx]))

# Stack the cleaned raw rows on top of the processed rows for inspection;
# rows coming from `df` have no 'transformed_content' value.
new_df = pd.concat([df, new_df], ignore_index=True)
new_df

Unnamed: 0,content,category,transformed_content
0,List of Documents required as ID and Address p...,Govt Documents,
1,The department serves as the central hub for p...,Govt Documents,
2,Documents List of documents accepted\r\nProof ...,Govt Documents,
3,Interim Budget 2024-2025\nSpeech of\nNirmala S...,Govt Documents,
4,International Cooperation (Chapter 11)\n1.10 T...,Govt Documents,
5,"The Ministry of Personnel, Public Grievances a...",Govt Documents,
6,The Common Services Centers Scheme: Background...,Govt Documents,
7,Role of various Agencies\n4.1 Service Centre A...,Govt Documents,
8,Ministry of Education Demand No. 26\nDepartmen...,Govt Documents,
9,The Common Services Centers Scheme: Background...,Govt Documents,


In [12]:
import pandas as pd

# Clean and preprocess every file, then combine the results into full_df2.
# NOTE(review): files are read from './db/train2/' while the file names were
# listed from './db/train/' — confirm both folders hold the same file names.
processed_frames = []

for i, (filename, label) in enumerate(
        zip(filenames_array, file_names_without_extension)):
    print(i, label, 'started')
    df = pd.read_csv('./db/train2/' + filename, encoding='ISO-8859-1')
    new_df = preprocesse_df(clean_df(df, label))

    # Collect the per-file frames and concatenate once after the loop;
    # concatenating onto a growing frame inside the loop copies all earlier
    # rows on every iteration.
    processed_frames.append(new_df)
    print(i, label,
          'successful ------------------------------')

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when no file was found.
full_df2 = (pd.concat(processed_frames, ignore_index=True)
            if processed_frames else pd.DataFrame())

0 Business Documents started
0 Business Documents successful ------------------------------
1 Creative Documents started
1 Creative Documents successful ------------------------------
2 Educational Documents started
2 Educational Documents successful ------------------------------
3 Financial Documents started
3 Financial Documents successful ------------------------------
4 Govt Documents started
4 Govt Documents successful ------------------------------
5 Legal Documents started
5 Legal Documents successful ------------------------------
6 Medical Documents started
6 Medical Documents successful ------------------------------
7 News Documents started
7 News Documents successful ------------------------------
8 Scientific Doc2 started
8 Scientific Doc2 successful ------------------------------
9 Scientific Documents started
9 Scientific Documents successful ------------------------------
10 Technical Documents started
10 Technical Documents successful ------------------------------


In [13]:
# List the distinct category labels present in the combined frame.
# NOTE(review): the recorded output shows 9 labels although 11 files were
# processed ('Creative Documents' and 'Educational Documents' are absent) —
# presumably all of their rows were filtered out during preprocessing; verify.
unique_categories = full_df2['category'].unique()
print(unique_categories)

['Business Documents' 'Financial Documents' 'Govt Documents'
 'Legal Documents' 'Medical Documents' 'News Documents' 'Scientific Doc2'
 'Scientific Documents' 'Technical Documents']


In [14]:
# Count the rows per category (value_counts orders by count, descending).
# NOTE(review): the recorded counts are heavily imbalanced (3475 rows for
# 'News Documents' vs. 1 for 'Scientific Doc2') — keep this in mind for any
# model trained on this frame.
category_counts = full_df2['category'].value_counts()
print(category_counts)

category
News Documents          3475
Medical Documents        491
Business Documents       409
Legal Documents           88
Financial Documents       67
Govt Documents            19
Technical Documents       10
Scientific Documents       7
Scientific Doc2            1
Name: count, dtype: int64


In [17]:
# Persist the combined, preprocessed frame without the index column.
# The frame built by this notebook is `full_df2`; no `full_df` is defined,
# so referencing it would raise NameError on a fresh kernel.
full_df2.to_csv('full_df2.csv', index=False)