In [7]:
import pandas as pd

# Load the data
df = pd.read_csv('valid_data.csv')

# Step 1: Remove duplicate 'index' values, keeping the first occurrence
df_unique = df.drop_duplicates(subset='index', keep='first')

# Sort by the 'index' column
df_unique = df_unique.sort_values(by='index')

# Get the minimum and maximum of the 'index' column
min_index = df_unique['index'].min()
max_index = df_unique['index'].max()
expected_indexes = range(min_index, max_index + 1)  # Create a range from min to max 'index'

# Step 2: Find the missing 'index' values
missing_indexes = pd.Index(expected_indexes).difference(df_unique['index'])

# Step 3: Create a DataFrame for missing 'index' values
# If you want to keep the structure of the original DataFrame, use NaN for the missing rows
df_missing = pd.DataFrame({'index': missing_indexes})

# Add NaN columns to match the structure of the original DataFrame (excluding the 'index' column)
for col in df.columns.difference(['index']):
    df_missing[col] = pd.NA

# Concatenate the unique and missing DataFrames
df_result = pd.concat([df_unique, df_missing], ignore_index=True)

# Sort the result DataFrame by the 'index' column again
df_result = df_result.sort_values(by='index')

# Output results
print("DataFrame with unique 'index' values and missing indexes filled:")
print(df_result)


DataFrame with unique 'index' values and missing indexes filled:
       index                                         image_link group_id  \
0      39398  https://m.media-amazon.com/images/I/51KPYVZtFh...   373107   
1      39399  https://m.media-amazon.com/images/I/51KPaJLHzj...   417434   
2      39400  https://m.media-amazon.com/images/I/51KPbIUn2B...   916768   
3      39401  https://m.media-amazon.com/images/I/51KPg7dtyO...   931856   
4      39402  https://m.media-amazon.com/images/I/51KPg7dtyO...   931856   
...      ...                                                ...      ...   
13095  52514  https://m.media-amazon.com/images/I/51YjQTRlr0...   296366   
13096  52515  https://m.media-amazon.com/images/I/51YjTBpT98...   436746   
13097  52516  https://m.media-amazon.com/images/I/51YjTBpT98...   436746   
13098  52517  https://m.media-amazon.com/images/I/51YjWArvhz...   667819   
13099  52518  https://m.media-amazon.com/images/I/51YjX5pP6v...   866516   

      entity_name     

In [8]:
df_result.to_csv('uniqueSorted.csv')

In [9]:
df_missing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   index               21 non-null     int64 
 1   entity_name         0 non-null      object
 2   extracted_data      0 non-null      object
 3   group_id            0 non-null      object
 4   image_link          0 non-null      object
 5   pytesseract_output  0 non-null      object
 6   valid_data          0 non-null      object
dtypes: int64(1), object(6)
memory usage: 1.3+ KB
