In [66]:
import csv
import pandas as pd
from tqdm import tqdm

# Placeholder for file paths
input_file = '../classi_bank/eobr_enrichhed.csv'  # Replace with your input file path
output_file = '../classi_bank/final_output.csv'  # Replace with your desired output file path

# Define column names for the structured data
columns = ['id', 'DATA', 'name_tu_type', 'date', 'name_ru_org']

# Function to map integers to column names
def get_field_by_input(user_input):
    field_mapping = {
        1: 'id',
        2: 'DATA',
        3: 'name_tu_type',
        4: 'date',
        5: 'name_ru_org'
    }
    return field_mapping.get(user_input, None)

# Initialize the output file with the header
with open(output_file, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, delimiter=';')
    writer.writerow(columns)  # Write header row

# Process the file line by line
records = []  # List to store 100 processed records
current_record = {col: '' for col in columns}  # Initialize an empty record

with open(input_file, 'r', encoding='utf-8') as file:
    total_lines = sum(1 for _ in file)  # Count total lines for progress bar
    file.seek(0)  # Reset file pointer to start

    # Skip the header row
    next(file)

    # Use tqdm to track progress
    for i, line in enumerate(tqdm(file, total=total_lines, desc="Processing lines")):
        line = line.strip()
        fields = line.split(';')  # Split the line by semicolon

        # Process each field in the line
        for field in fields:
            if field.strip():
                print(f"\nField: {field}")
                print("Which field does this belong to?")
                print("1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org")

                # Get user input
                try:
                    user_input = int(input("Enter your choice (1-5): ").strip())
                    field_name = get_field_by_input(user_input)

                    if field_name:
                        # Append the value to the corresponding field in the record
                        if field_name == 'id' and current_record['id']:
                            # If `id` is already filled, save the current record and start a new one
                            records.append(current_record)
                            if len(records) >= 100:
                                # Save to file if we have 100 records
                                df_chunk = pd.DataFrame(records, columns=columns)
                                df_chunk.to_csv(output_file, mode='a', index=False, header=False, sep=';', encoding='utf-8')
                                records = []  # Clear records after writing
                            current_record = {col: '' for col in columns}  # Start a new record

                        current_record[field_name] += (field if not current_record[field_name] else ' ' + field)
                    else:
                        print("Invalid input. Skipping this field.")
                except ValueError:
                    print("Invalid input. Please enter an integer between 1 and 5.")
                    continue

    # Save the last record
    if any(current_record.values()):
        records.append(current_record)

    # Save any remaining records
    if records:
        df_chunk = pd.DataFrame(records, columns=columns)
        df_chunk.to_csv(output_file, mode='a', index=False, header=False, sep=';', encoding='utf-8')

print("Processing complete. Data saved to:", output_file)


Processing lines:   0%|                                                                    | 0/15732667 [00:00<?, ?it/s]


Field: "23121902809485"
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  1



Field: Выписка с индивидуального пенсионного счета
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  2



Field: Запрос
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  3



Field: "2023/12/19 08:06:20.490209000"
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  4



Field: "Акционерное общество ""Единый накопительный пенсионный фонд"""
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  5


Processing lines:   0%|                                                     | 1/15732667 [00:40<177963:31:23, 40.72s/it]


Field: "23121902809482"
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  1



Field: HIDDEN первую очередь пустить автобус ,  оборудовать остановки
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  2



Field: Заявление
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  3



Field: "2023/12/19 08:03:11.131384000"
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  4



Field: "Коммунальное государственное учреждение ""Аппарат Акима города Алматы"""
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  5


Processing lines:   0%|                                                     | 2/15732667 [00:54<107565:25:45, 24.61s/it]


Field: "23121902809480"
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  1



Field: Заявление HIDDEN .  о предоставлении информации
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  2



Field: Заявление
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  3



Field: "2023/12/19 08:01:57.882288000"
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  4



Field: Костанайский областной филиал АО «ЕНПФ»
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  5


Processing lines:   0%|                                                      | 3/15732667 [01:00<71472:06:59, 16.35s/it]


Field: "23121902809479"
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  1



Field: ** л ауыз судың бұзылып , шірігені бойынша арыздана отырып ,  заңды түрде шара қолдануды сұрайды . 
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  2



Field: Заявление
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  3



Field: "2023/12/19 08:01:52.343114000"
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  4



Field: Атырауское городское управление санитарно-эпидемиологического контроля
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Enter your choice (1-5):  5


Processing lines:   0%|                                                      | 4/15732667 [01:10<59906:48:22, 13.71s/it]


Field: "23121902809478"
Which field does this belong to?
1 -> id, 2 -> DATA, 3 -> name_tu_type, 4 -> date, 5 -> name_ru_org


Processing lines:   0%|                                                   | 4/15732667 [19:23<1270676:55:21, 290.76s/it]

KeyboardInterrupt



In [7]:
import csv
import re
from tqdm import tqdm

# Placeholder for input and output file paths
input_file = "../classi_bank/eobr_enrichhed.csv"  # Replace with your input file path
output_file = "../classi_bank/final_output.csv"  # Replace with your desired output file path

# Define column names for the structured data
columns = ['id', 'DATA', 'name_tu_type', 'date', 'name_ru_org']

# Function to validate ID
def is_valid_id(field):
    return bool(re.match(r'^"\d{14}"$', field.strip()))

# Process the file one record at a time
with open(input_file, 'r', encoding='utf-8') as file, open(output_file, 'w', encoding='utf-8', newline='') as output:
    writer = csv.DictWriter(output, fieldnames=columns, delimiter=';')
    writer.writeheader()  # Write the header once

    current_record = []  # To hold the current record
    records_processed = 0  # Counter for processed records

    for line in tqdm(file, desc="Processing lines"):
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        # Check if the line starts with a valid ID
        if is_valid_id(line.split(';')[0]):
            # Process the previous record if it exists
            if current_record:
                full_record = ' '.join(current_record)  # Join all parts of the record
                fields = full_record.split(';')  # Split by semicolon

                # Map the fields to the DataFrame structure
                row = {
                    'id': fields[0].strip('"') if len(fields) > 0 else '',
                    'DATA': fields[1] if len(fields) > 1 else '',
                    'name_tu_type': fields[2] if len(fields) > 2 else '',
                    'date': fields[3] if len(fields) > 3 else '',
                    'name_ru_org': fields[4] if len(fields) > 4 else ''
                }
                writer.writerow(row)  # Write the row to the output file
                records_processed += 1

            # Reset for the new record
            current_record = [line]
        else:
            # Append non-ID lines to the current record
            current_record.append(line)

    # Process the last record after finishing the loop
    if current_record:
        full_record = ' '.join(current_record)
        fields = full_record.split(';')

        # Map the fields to the DataFrame structure
        row = {
            'id': fields[0].strip('"') if len(fields) > 0 else '',
            'DATA': fields[1] if len(fields) > 1 else '',
            'name_tu_type': fields[2] if len(fields) > 2 else '',
            'date': fields[3] if len(fields) > 3 else '',
            'name_ru_org': fields[4] if len(fields) > 4 else ''
        }
        writer.writerow(row)
        records_processed += 1

print(f"Processing complete. {records_processed} records saved to {output_file}.")


Processing lines: 15732667it [07:54, 33156.57it/s]


Processing complete. 3108676 records saved to ../classi_bank/final_output.csv.


In [10]:
che = pd.read_csv(output_file,nrows=100,on_bad_lines='skip')

In [11]:
che.head

<bound method NDFrame.head of                 id;DATA;name_tu_type;date;name_ru_org
0   id;"""DATA""";"""name_tu_type""";"""date_""";"...
1   23121902809485;Выписка с индивидуального пенси...
2   23121902809480;Заявление HIDDEN .  о предостав...
3   23121902809477;Просит дать разрешение на прове...
4   23121902809474;Заявление на выдачу подтвержден...
..                                                ...
95  23121902809149;На действия сотрудников Учрежде...
96  23121902809148;HIDDEN пәні мұғалімі HIDDEN БЛЖ...
97  23121902809146;Прошу вас предоставить квартиру...
98  23121902809143;пояснение по установке огражден...
99  23121902809141;Заявление о привлечении пластич...

[100 rows x 1 columns]>

In [9]:
import pandas as pd

# Placeholder for input and output file paths
input_file = "../classi_bank/sorted.csv"  # Replace with your input file path
output_file = "../classi_bank/final_output.csv"  # Replace with your desired output file path

# Read the file with Pandas
try:
    # Read the file into a DataFrame
    df = pd.read_csv(input_file, header=None,nrows = 100, encoding='utf-8', on_bad_lines='skip')

    # Split the first column into multiple columns using ';' as the separator
    df_split = df.iloc[:, 0].str.split(';', expand=True)

    # Use the first row as column headers (optional, depends on your data structure)
    df_split.columns = df_split.iloc[0]  # Set the first row as headers
    df_split = df_split[1:]  # Drop the header row from the data

    # Save the resulting DataFrame to the output CSV
    df_split.to_csv(output_file, index=False, sep=';', encoding='utf-8')
    print(f"Data successfully processed and saved to {output_file}.")

except Exception as e:
    print(f"Error reading or processing the file: {e}")


Data successfully processed and saved to ../classi_bank/final_output.csv.


In [11]:
df_split

Unnamed: 0,id,DATA,name_tu_type,date,name_ru_org
1,id,"""""""DATA""""""","""""""name_tu_type""""""","""""""date_""""""","""""""name_ru_org"""""""
2,23121902809485,Выписка с индивидуального пенсионного счета,Запрос,"""""""2023/12/19 08:06:20.490209000""""""","""""""Акционерное общество """"""""Единый накопительн..."
3,23121902809480,Заявление HIDDEN . о предоставлении информации,Заявление,"""""""2023/12/19 08:01:57.882288000""""""",Костанайский областной филиал АО «ЕНПФ»
4,23121902809477,Просит дать разрешение на проведение мероприят...,Заявление,"""""""2023/12/19 08:01:10.859904000""""""","""""""Государственное учреждение """"""""Аппарат аким..."
5,23121902809474,Заявление на выдачу подтверждения профессионал...,Заявление,"""""""2023/12/19 08:00:44.145921000""""""",Республиканское государственное учреждение «Мо...
...,...,...,...,...,...
95,23121902809150,HIDDEN үйдерінің құрылыс жұмыстарына қатысты,Заявление,"""""""2023/12/19 06:41:04.202629000""""""","""""""Государственное учреждение """"""""Аппарат аким..."
96,23121902809149,На действия сотрудников Учреждение № *,Жалоба,"""""""2023/12/19 06:40:55.606491000""""""","""""""республиканское государственное учреждение ..."
97,23121902809148,HIDDEN пәні мұғалімі HIDDEN БЛЖ - дан ** пайыз...,Запрос,"""""""2023/12/19 06:40:51.665512000""""""","""""""Государственное учреждение """"""""Министерство..."
98,23121902809146,Прошу вас предоставить квартиру в связи с подх...,Заявление,"""""""2023/12/19 06:40:45.700311000""""""","""""""Государственное учреждение """"""""Аппарат аким..."


In [14]:
import pandas as pd

In [34]:
import pandas as pd
from tqdm import tqdm

# Placeholder for input and output file paths
input_file = "../classi_bank/1sorted.csv"  # Replace with your input file path
output_file = "../classi_bank/zfinal_spread.csv"  # Replace with your desired output file path

# Chunk size for processing rows
chunk_size = 500

try:
    with open(input_file, 'r', encoding='utf-8') as file:
        header = None
        # total_lines = sum(1 for _ in fil)  # Count total lines for progress bar
        total_lines = 3108677
        file.seek(0)  # Reset file pointer to the beginning

        with tqdm(total=total_lines, desc="Processing rows") as pbar:
            for i, chunk in enumerate(pd.read_csv(file, header=None, encoding='utf-8', on_bad_lines='skip', chunksize=chunk_size)):
                # Split the first column into multiple columns using ';' as the separator
                df_split = chunk.iloc[:, 0].str.split(';', expand=True)

                # Use the first chunk to set column headers if not already set
                if header is None:
                    header = df_split.iloc[0]  # Save the header row
                    df_split.columns = header  # Set column names
                    df_split = df_split[1:]  # Remove header row from the first chunk
                    modew = 'w'  # Overwrite file for the first chunk
                else:
                    df_split.columns = header  # Use the previously set headers
                    modew = 'a'  # Append subsequent chunks

                # Save the resulting DataFrame to the output CSV
                df_split.to_csv(output_file, index=False,mode=modew)#, sep=';', encoding='utf-8', mode=modew, header=(mode == 'w'))
                # df_split.to_csv(output_file, index=False, sep=';', encoding='utf-8', mode=mode, header=(mode == 'w'), quoting=csv.QUOTE_NONNUMERIC)

                last = df_split
                # Drop variables to free up RAM
                df_split=df_split[0:0]

                pbar.update(len(chunk))  # Update progress bar

    print(f"Data successfully processed and saved to {output_file}.")

except Exception as e:
    print(f"Error reading or processing the file: {e}")


Processing rows:  55%|█████████████████████████████▉                        | 1723306/3108677 [04:46<03:50, 6016.45it/s]

Data successfully processed and saved to ../classi_bank/zfinal_spread.csv.





In [35]:
last

Unnamed: 0,id,DATA,name_tu_type,date,name_ru_org
1723000,21070200471932,земельный участок в частную собственность дл...,Заявление,"""""""2021/07/02 05:34:25.369208000""""""",Государственное учреждение «Аппарат акима горо...
1723001,21070200471930,"""""""HIDDEN . : по вопросу восстановления на ...",Заявление,"""""""2021/07/02 05:34:13.966703000""""""","""""""Государственное учреждение """"""""Аппарат аким..."
1723002,21070200471927,Добрый день ! Просим принять и рассмотреть пи...,Заявление,"""""""2021/07/02 05:34:10.019829000""""""","""""""Республиканское государственное учреждение ..."
1723003,21070200471926,"""""""HIDDEN ауылдық округіне қарасты """"""""HIDDEN""...",Заявление,"""""""2021/07/02 05:34:06.923330000""""""","""""""Государственное учреждение """"""""Аппарат аким..."
1723004,21070200471925,о неисполнении подрядчиком условий договора по...,Заявление,"""""""2021/07/02 05:33:49.498960000""""""","""""""Государственное учреждение """"""""Аппарат аким..."
...,...,...,...,...,...
1723301,21070100470985,\tЗапрос по HIDDENым (HIDDEN),Запрос,"""""""2021/07/01 16:26:00.700622000""""""",Управление документационного обеспечения и ана...
1723302,21070100470984,\t*** HIDDEN обл . г . HIDDENнда г . HIDDEN...,Запрос,"""""""2021/07/01 16:21:47.864126000""""""",Управление документационного обеспечения и ана...
1723303,21070100470983,\tПрошу назначите мне до родавой пасобия (HIDDEN),Заявление,"""""""2021/07/01 16:17:31.080508000""""""",Управление документационного обеспечения и ана...
1723304,21070100470980,\tо снятии ареста с имущества HIDDEN . (HIDDEN),Запрос,"""""""2021/07/01 16:14:27.697652000""""""",Управление документационного обеспечения и ана...


In [36]:
che=pd.read_csv(output_file,nrows=500)

In [37]:
che.info

<bound method DataFrame.info of                  id                                               DATA  \
0                id                                         """DATA"""   
1    23121902809485        Выписка с индивидуального пенсионного счета   
2    23121902809480    Заявление HIDDEN .  о предоставлении информации   
3    23121902809477  Просит дать разрешение на проведение мероприят...   
4    23121902809474  Заявление на выдачу подтверждения профессионал...   
..              ...                                                ...   
495  23121802807993  О принятии мер по факту предвзятого рассмотрен...   
496  23121802807990  О принятии мер по факту предвзятого рассмотрен...   
497  23121802807988  О принятии мер по обеспечению законности прове...   
498  23121802807978  завершение подключения к Интернету до ** декаб...   
499              id                                               DATA   

           name_tu_type                                 date  \
0    """name_tu

In [38]:
che.loc[1]

id                                                 23121902809485
DATA                  Выписка с индивидуального пенсионного счета
name_tu_type                                               Запрос
date                          """2023/12/19 08:06:20.490209000"""
name_ru_org     """Акционерное общество """"Единый накопительн...
Name: 1, dtype: object