##

## Train different state and test single state with the same model
## Data fetch part usiing a constant ratio

In [28]:
import os
from collections import Counter
import pandas as pd
from os.path import join, exists, basename, dirname, splitext, expanduser
from shutil import copy2
from collections import Counter, defaultdict




In [29]:
state_name ="bihar"
# Directory containing the .txt files
directory = f"../processed_data/{state_name}/labels"

# Initialize a Counter to track the first letters
first_letter_counts = Counter()

# Iterate through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):  # Process only .txt files
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as file:
            for line in file:
                if line.strip():  # Ignore empty lines
                    first_letter = line.strip()[0]  # Extract the first letter
                    first_letter_counts[first_letter] += 1

# Print the results
print("Unique first letters and their counts:", dict(first_letter_counts))
print("Number of unique first letters:", len(first_letter_counts))


Unique first letters and their counts: {'2': 5010, '1': 1831, '0': 48}
Number of unique first letters: 3


### fetch data using a constant ratio

```python

In [3]:
ratio=0.25
destination_directory = f"../data/region_performace/{state_name}_{ratio}/labels"
if not exists(destination_directory):
    os.makedirs(destination_directory)

class_counts = Counter()
class_lines = defaultdict(list)
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        file_path=os.path.join(directory,filename)
        with open(file_path,'r') as file:
            for line in file:
                if line.strip():
                    class_label=int(line.split()[0])
                    class_counts[class_label]+=1
                    class_lines[class_label].append((filename,line.strip()))


# Calculate the first 25% for each class and copy files
for class_label_,lines in class_lines.items():
    num_to_copy =max(1,int(class_counts[class_label_]*ratio))
    print(f"Processing class {class_label}: Copying {num_to_copy} lines.")
    lines_to_copy = lines[:num_to_copy]
    for filename,line in lines_to_copy:
        src_path=os.path.join(directory,filename)
        dest_path=os.path.join(destination_directory,filename)
        if os.path.exists(dest_path):
            with open(dest_path,'a') as dest_file:
                dest_file.write(line+'\n')
        else:
            with open(dest_path,'w') as dest_file:
                dest_file.write(line+'\n')

print(f"Lines successfully processed and copied to: {destination_directory}")


    


Processing class 1: Copying 1252 lines.
Processing class 1: Copying 457 lines.
Processing class 1: Copying 12 lines.
Lines successfully processed and copied to: ../data/region_performace/bihar_0.25/labels


In [24]:

directory = f"../data/region_performace/{state_name}_{ratio}/labels"

# Initialize a Counter to track the first letters
first_letter_counts = Counter()

# Iterate through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):  # Process only .txt files
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as file:
            for line in file:
                if line.strip():  # Ignore empty lines
                    first_letter = line.strip()[0]  # Extract the first letter
                    first_letter_counts[first_letter] += 1

# Print the results
print("Unique first letters and their counts:", dict(first_letter_counts))
print("Number of unique first letters:", len(first_letter_counts))

Unique first letters and their counts: {'2': 397, '1': 168}
Number of unique first letters: 2


In [25]:
import os
# Paths
label_dir = f"../data/region_performace/{state_name}_{ratio}/labels"
source_image_dir = f"/home/patel_zeel/kiln_compass_24/data/{state_name}/images"
destination_image_dir = f"../data/region_performace/{state_name}_{ratio}/images"

# Create destination directory if it doesn't exist
os.makedirs(destination_image_dir, exist_ok=True)

# Iterate over all .txt files in the label directory
for label_file in os.listdir(label_dir):
    if label_file.endswith(".txt"):
        # Extract the base filename (without extension)
        base_name = os.path.splitext(label_file)[0]
        # Construct the corresponding .tif filename
        tif_filename = f"{base_name}.tif"
        # Construct the source .tif path
        source_tif_path = os.path.join(source_image_dir, tif_filename)
        # Construct the destination symlink path
        destination_tif_path = os.path.join(destination_image_dir, tif_filename)

        # Check if the corresponding .tif exists in the source directory
        if os.path.exists(source_tif_path):
            # Create a symbolic link in the destination directory
            if not os.path.exists(destination_tif_path):  # Avoid overwriting existing symlinks
                os.symlink(source_tif_path, destination_tif_path)
                print(f"Linked {source_tif_path} -> {destination_tif_path}")
            else:
                print(f"Symlink already exists for {destination_tif_path}")
        else:
            print(f"File not found: {source_tif_path}")

print(f"All matching .tif files have been symlinked to {destination_image_dir}")


Linked /home/patel_zeel/kiln_compass_24/data/west_bengal/images/9839588_2672847.tif -> ../data/region_performace/west_bengal_0.25/images/9839588_2672847.tif
Linked /home/patel_zeel/kiln_compass_24/data/west_bengal/images/9685491_2725130.tif -> ../data/region_performace/west_bengal_0.25/images/9685491_2725130.tif
Linked /home/patel_zeel/kiln_compass_24/data/west_bengal/images/9795560_2934262.tif -> ../data/region_performace/west_bengal_0.25/images/9795560_2934262.tif
Linked /home/patel_zeel/kiln_compass_24/data/west_bengal/images/9696498_2711371.tif -> ../data/region_performace/west_bengal_0.25/images/9696498_2711371.tif
Linked /home/patel_zeel/kiln_compass_24/data/west_bengal/images/9850595_2733385.tif -> ../data/region_performace/west_bengal_0.25/images/9850595_2733385.tif
Linked /home/patel_zeel/kiln_compass_24/data/west_bengal/images/9677235_2642578.tif -> ../data/region_performace/west_bengal_0.25/images/9677235_2642578.tif
Linked /home/patel_zeel/kiln_compass_24/data/west_bengal/i

### fetch data using a constant class count
```python

In [30]:
# state_name = "bihar"
source_label_dir = f"../processed_data/{state_name}/labels"
destination_directory = f"../data/region_performace/{state_name}_same_class_count_10_120_1000/labels"
if not exists(destination_directory):
    os.makedirs(destination_directory)
    

class_fetch_counts={

    0:10,
    1:120,
    2:1000
}    
class_counts = Counter()
class_lines = defaultdict(list)

for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        file_path=os.path.join(directory,filename)
        with open(file_path,'r') as file:
            for line in file:
                if line.strip():
                    class_label=int(line.split()[0])
                    class_counts[class_label]+=1
                    class_lines[class_label].append((filename,line.strip()))

# display(class_lines)                    

In [31]:
for class_label_,lines in class_lines.items():
    num_to_copy=class_fetch_counts.get(class_label_,0)
    print(f"Processing class {class_label_}: Copying {num_to_copy} lines.")
    lines_to_copy = lines[:num_to_copy]
    for filename,line in lines_to_copy:
        src_path=os.path.join(directory,filename)
        dest_path=os.path.join(destination_directory,filename)
        if os.path.exists(dest_path):
            with open(dest_path,'a') as dest_file:
                dest_file.write(line+'\n')
        else:
            with open(dest_path,'w') as dest_file:
                dest_file.write(line+'\n')
print(f"Lines successfully processed and copied to: {destination_directory}")

Processing class 2: Copying 1000 lines.
Processing class 1: Copying 120 lines.
Processing class 0: Copying 10 lines.
Lines successfully processed and copied to: ../data/region_performace/bihar_same_class_count_10_120_1000/labels


In [32]:
directory = destination_directory
first_letter_counts = Counter()
for filename in os.listdir(directory):
    if filename.endswith(".txt"): 
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as file:
            for line in file:
                if line.strip():  
                    first_letter = line.strip()[0] 
                    first_letter_counts[first_letter] += 1

print("Unique first letters and their counts:", dict(first_letter_counts))
print("Number of unique first letters:", len(first_letter_counts))

Unique first letters and their counts: {'2': 1000, '1': 120, '0': 10}
Number of unique first letters: 3


##### fetch the images

In [33]:
import os
# Paths
label_dir = f"../data/region_performace/{state_name}_same_class_count_10_120_1000/labels"
source_image_dir = f"/home/patel_zeel/kiln_compass_24/data/{state_name}/images"
destination_image_dir = f"../data/region_performace/{state_name}_same_class_count_10_120_1000/images"

# Create destination directory if it doesn't exist
os.makedirs(destination_image_dir, exist_ok=True)

# Iterate over all .txt files in the label directory
for label_file in os.listdir(label_dir):
    if label_file.endswith(".txt"):
        # Extract the base filename (without extension)
        base_name = os.path.splitext(label_file)[0]
        # Construct the corresponding .tif filename
        tif_filename = f"{base_name}.tif"
        # Construct the source .tif path
        source_tif_path = os.path.join(source_image_dir, tif_filename)
        # Construct the destination symlink path
        destination_tif_path = os.path.join(destination_image_dir, tif_filename)

        # Check if the corresponding .tif exists in the source directory
        if os.path.exists(source_tif_path):
            # Create a symbolic link in the destination directory
            if not os.path.exists(destination_tif_path):  # Avoid overwriting existing symlinks
                os.symlink(source_tif_path, destination_tif_path)
                print(f"Linked {source_tif_path} -> {destination_tif_path}")
            else:
                print(f"Symlink already exists for {destination_tif_path}")
        else:
            print(f"File not found: {source_tif_path}")

print(f"All matching .tif files have been symlinked to {destination_image_dir}")


Linked /home/patel_zeel/kiln_compass_24/data/bihar/images/9518858_2962391.tif -> ../data/region_performace/bihar_same_class_count_10_120_1000/images/9518858_2962391.tif
Linked /home/patel_zeel/kiln_compass_24/data/bihar/images/9386775_2855073.tif -> ../data/region_performace/bihar_same_class_count_10_120_1000/images/9386775_2855073.tif
Linked /home/patel_zeel/kiln_compass_24/data/bihar/images/9351002_2844066.tif -> ../data/region_performace/bihar_same_class_count_10_120_1000/images/9351002_2844066.tif
Linked /home/patel_zeel/kiln_compass_24/data/bihar/images/9631679_3061453.tif -> ../data/region_performace/bihar_same_class_count_10_120_1000/images/9631679_3061453.tif
Linked /home/patel_zeel/kiln_compass_24/data/bihar/images/9339995_2934873.tif -> ../data/region_performace/bihar_same_class_count_10_120_1000/images/9339995_2934873.tif
Linked /home/patel_zeel/kiln_compass_24/data/bihar/images/9480334_3072460.tif -> ../data/region_performace/bihar_same_class_count_10_120_1000/images/948033

### test the data fetch
```python

In [None]:


state_name = "bihar"
directory = f"../processed_data/bihar/labels"
destination_directory = f"../data/region_performance/test_{state_name}_same_class_count_10_120_1000/labels"

if not os.path.exists(destination_directory):
    os.makedirs(destination_directory)

# Define the number of lines to fetch for each class
class_fetch_counts = {
    0: 10,   # Number of lines to fetch for class 0
    1: 120,  # Number of lines to fetch for class 1
    2: 1000  # Number of lines to fetch for class 2
}

# Initialize offsets for each class to start after the first batch
class_offsets = {
    0: 11,    # Start after the first 10 lines for class 0
    1: 121,   # Start after the first 120 lines for class 1
    2: 1001   # Start after the first 1000 lines for class 2
}

class_lines = defaultdict(list)

# Read the .txt files and group lines by class
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as file:
            for line in file:
                if line.strip():
                    class_label = int(line.split()[0])
                    class_lines[class_label].append((filename, line.strip()))

# Process each class and fetch the next set of lines starting from the offset
for class_label, lines in class_lines.items():
    offset = class_offsets[class_label]
    num_to_copy = class_fetch_counts.get(class_label, 0)
    
    print(f"Processing class {class_label}: Fetching {num_to_copy} lines starting from offset {offset}.")
    
    lines_to_copy = lines[offset:offset + num_to_copy]
    
    # Update the offset for the next fetch (if needed later)
    class_offsets[class_label] += num_to_copy
    
    # Copy the selected lines to the destination directory
    for filename, line in lines_to_copy:
        dest_path = os.path.join(destination_directory, filename)
        
        if os.path.exists(dest_path):
            with open(dest_path, 'a') as dest_file:
                dest_file.write(line + '\n')
        else:
            with open(dest_path, 'w') as dest_file:
                dest_file.write(line + '\n')

print(f"Next set of lines successfully processed and copied to: {destination_directory}")


Processing class 2: Fetching 1000 lines starting from offset 1001.
Processing class 1: Fetching 120 lines starting from offset 121.
Processing class 0: Fetching 10 lines starting from offset 11.
Next set of lines successfully processed and copied to: ../data/region_performance/test_bihar_same_class_count_10_120_1000/labels


In [34]:
state_name = "bihar"

directory = f"../processed_data/bihar/labels"
destination_directory = f"../data/region_performace/test_{state_name}_same_class_count_10_120_1000/labels"

if not os.path.exists(destination_directory):
    os.makedirs(destination_directory)

class_fetch_counts = {
    0: 10,   
    1: 120, 
    2: 1000  
}

class_offsets = defaultdict(int)
print(class_offsets)
class_lines = defaultdict(list)
print(class_lines)


defaultdict(<class 'int'>, {})
defaultdict(<class 'list'>, {})


In [35]:
# Read the .txt files and group lines by class
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as file:
            for line in file:
                if line.strip():
                    class_label = int(line.split()[0])
                    class_lines[class_label].append((filename, line.strip()))

#### Process each class based on the defined fetch counts and offsets


In [36]:



for class_label_, lines in class_lines.items():
    offset = class_offsets[class_label_]
    print(offset)
    num_to_copy = class_fetch_counts.get(class_label_, 0)  
    
    print(f"Processing class {class_label_}: Fetching {num_to_copy} lines starting from offset {offset}.")
    lines_to_copy = lines[offset:offset + num_to_copy]
    # print(lines_to_copy)
    
    # Update the offset for the next fetch
    class_offsets[class_label_] += num_to_copy
    # print(class_offsets)
    
    # Copy the selected lines to the destination directory
    for filename, line in lines_to_copy:
        dest_path = os.path.join(destination_directory, filename)
        
        if os.path.exists(dest_path):
            with open(dest_path, 'a') as dest_file:
                dest_file.write(line + '\n')
        else:
            with open(dest_path, 'w') as dest_file:
                dest_file.write(line + '\n')

print(f"Next set of lines successfully processed and copied to: {destination_directory}")


0
Processing class 2: Fetching 1000 lines starting from offset 0.
0
Processing class 1: Fetching 120 lines starting from offset 0.
0
Processing class 0: Fetching 10 lines starting from offset 0.
Next set of lines successfully processed and copied to: ../data/region_performace/test_bihar_same_class_count_10_120_1000/labels


In [45]:

previous_labels_directory = "../data/region_performance/bihar_same_class_count_10_120_1000/labels"
new_labels_directory = "../data/region_performance/test_bihar_same_class_count_10_120_1000/labels"

def read_labels(directory):
    """
    Reads all label files in a directory and returns a dictionary
    mapping filenames to sets of bounding box strings.
    """
    labels = {}
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r') as file:
                labels[filename] = set(line.strip() for line in file if line.strip())
                # print(labels[filename])
        # print(labels)           
    return labels

previous_labels = read_labels(previous_labels_directory)
new_labels = read_labels(new_labels_directory)

# Check for intersection between labels
is_disjoint = True
for filename, new_bboxes in new_labels.items():
    if filename in previous_labels:
        common_bboxes = previous_labels[filename] & new_bboxes  # Intersection of sets
        # print(common_bboxes)
        if common_bboxes:
            print(f"Overlap found in file {filename}:")
            print(common_bboxes)
            is_disjoint = False

if is_disjoint:
    print("The previous labels and new labels are disjoint.")
else:
    print("There is overlap between the previous and new labels.")


#count the number of overlaps
overlaps = 0
for filename, new_bboxes in new_labels.items():
    if filename in previous_labels:
        common_bboxes = previous_labels[filename] & new_bboxes  # Intersection of sets
        overlaps += len(common_bboxes)
print(f"Total number of overlapping bounding boxes: {overlaps}")            


The previous labels and new labels are disjoint.
Total number of overlapping bounding boxes: 0


In [48]:
state_name = "bihar"
label_dir = f"../data/region_performance/test_{state_name}_same_class_count_10_120_1000/labels"
source_image_dir = f"/home/patel_zeel/kiln_compass_24/data/{state_name}/images"
destination_image_dir = f"../data/region_performance/test_{state_name}_same_class_count_10_120_1000/images"

# Create destination directory if it doesn't exist
os.makedirs(destination_image_dir, exist_ok=True)

# Iterate over all .txt files in the label directory
for label_file in os.listdir(label_dir):
    if label_file.endswith(".txt"):
        # Extract the base filename (without extension)
        base_name = os.path.splitext(label_file)[0]
        # Construct the corresponding .tif filename
        tif_filename = f"{base_name}.tif"
        # Construct the source .tif path
        source_tif_path = os.path.join(source_image_dir, tif_filename)
        # Construct the destination symlink path
        destination_tif_path = os.path.join(destination_image_dir, tif_filename)

        # Check if the corresponding .tif exists in the source directory
        if os.path.exists(source_tif_path):
            # Create a symbolic link in the destination directory
            if not os.path.exists(destination_tif_path):  # Avoid overwriting existing symlinks
                os.symlink(source_tif_path, destination_tif_path)
                print(f"Linked {source_tif_path} -> {destination_tif_path}")
            else:
                print(f"Symlink already exists for {destination_tif_path}")
        else:
            print(f"File not found: {source_tif_path}")

print(f"All matching .tif files have been symlinked to {destination_image_dir}")


Linked /home/patel_zeel/kiln_compass_24/data/bihar/images/9730741_2910108.tif -> ../data/region_performance/test_bihar_same_class_count_10_120_1000/images/9730741_2910108.tif
Linked /home/patel_zeel/kiln_compass_24/data/bihar/images/9386775_2885342.tif -> ../data/region_performance/test_bihar_same_class_count_10_120_1000/images/9386775_2885342.tif
Linked /home/patel_zeel/kiln_compass_24/data/bihar/images/9417044_2811045.tif -> ../data/region_performance/test_bihar_same_class_count_10_120_1000/images/9417044_2811045.tif
Linked /home/patel_zeel/kiln_compass_24/data/bihar/images/9474830_3033936.tif -> ../data/region_performance/test_bihar_same_class_count_10_120_1000/images/9474830_3033936.tif
Linked /home/patel_zeel/kiln_compass_24/data/bihar/images/9306974_2877087.tif -> ../data/region_performance/test_bihar_same_class_count_10_120_1000/images/9306974_2877087.tif
Linked /home/patel_zeel/kiln_compass_24/data/bihar/images/9397782_2827556.tif -> ../data/region_performance/test_bihar_same_c