In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the TeoChat kiln prediction results (test split) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs where it exists.
teochat_path = "/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/temporal-analysis/model_prediction_csv/teochat_kiln_results_test.csv"
df = pd.read_csv(filepath_or_buffer=teochat_path)

# Sanity check of what was loaded: dimensions first, then the column names.
print(f"Shape: {df.shape}", f"\nColumns: {list(df.columns)}", sep="\n")

Shape: (924, 10)

Columns: ['lat_lon', 'bbox', 'presence', 'shape_start', 'shape_end', 'demolished', 'appearance_year', 'shape_change_year', 'demolished_year', 'raw_response']


In [3]:
# Preview the first 10 rows of the loaded predictions; as the last expression
# of the cell, the resulting frame is rendered as the cell output.
df.head(10)

Unnamed: 0,lat_lon,bbox,presence,shape_start,shape_end,demolished,appearance_year,shape_change_year,demolished_year,raw_response
0,28.205600_77.105800,none,False,Oval,Rectangular,True,2014,2022,2022,"{\n""bbox"": ""none"",\n""presence"": false,\n""appea..."
1,28.205600_77.164500,none,True,Oval,Rectangular,False,2014,2018,0,"{\n""bbox"": ""none"",\n""presence"": true,\n""appear..."
2,28.205600_77.244400,none,False,Oval,Rectangular,False,2014,2020,0,"{\n""bbox"": ""none"",\n""presence"": false,\n""appea..."
3,28.205600_77.340900,none,False,Oval,Rectangular,False,2014,2018,0,"{\n""bbox"": ""none"",\n""presence"": false,\n""appea..."
4,28.205600_77.344400,none,False,Oval,Rectangular,True,2014,2020,2022,"{\n""bbox"": ""none"",\n""presence"": false,\n""appea..."
5,28.205600_77.364600,none,False,Oval,Rectangular,False,2014,2020,0,"{\n""bbox"": ""none"",\n""presence"": false,\n""appea..."
6,28.205600_77.373400,none,False,Oval,Rectangular,False,2014,2016,0,"{\n""bbox"": ""none"",\n""presence"": false,\n""appea..."
7,28.205600_77.382200,none,False,Oval,Rectangular,True,2014,2020,2022,"{\n""bbox"": ""none"",\n""presence"": false,\n""appea..."
8,28.205600_77.482200,none,False,Oval,Rectangular,False,2014,2018,0,"{\n""bbox"": ""none"",\n""presence"": false,\n""appea..."
9,28.205600_77.514600,none,False,Oval,Rectangular,True,2014,2018,2022,"{\n""bbox"": ""none"",\n""presence"": false,\n""appea..."


In [4]:
# Print a structural summary of the frame: index range, each column's
# non-null count and dtype, and the approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 924 entries, 0 to 923
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   lat_lon            924 non-null    object
 1   bbox               924 non-null    object
 2   presence           924 non-null    bool  
 3   shape_start        924 non-null    object
 4   shape_end          924 non-null    object
 5   demolished         924 non-null    bool  
 6   appearance_year    924 non-null    int64 
 7   shape_change_year  924 non-null    int64 
 8   demolished_year    924 non-null    int64 
 9   raw_response       923 non-null    object
dtypes: bool(2), int64(3), object(5)
memory usage: 59.7+ KB


In [5]:
# Per-column overview: dtype, how many entries are non-null, and the first
# three non-null values as plain Python objects.
for col in df.columns:
    print(
        f"\n{col}:",
        f"  Type: {df[col].dtype}",
        f"  Non-null: {df[col].count()} / {len(df)}",
        f"  Sample values: {df[col].dropna().head(3).tolist()}",
        sep="\n",
    )


lat_lon:
  Type: object
  Non-null: 924 / 924
  Sample values: ['28.205600_77.105800', '28.205600_77.164500', '28.205600_77.244400']

bbox:
  Type: object
  Non-null: 924 / 924
  Sample values: ['none', 'none', 'none']

presence:
  Type: bool
  Non-null: 924 / 924
  Sample values: [False, True, False]

shape_start:
  Type: object
  Non-null: 924 / 924
  Sample values: ['Oval', 'Oval', 'Oval']

shape_end:
  Type: object
  Non-null: 924 / 924
  Sample values: ['Rectangular', 'Rectangular', 'Rectangular']

demolished:
  Type: bool
  Non-null: 924 / 924
  Sample values: [True, False, False]

appearance_year:
  Type: int64
  Non-null: 924 / 924
  Sample values: [2014, 2014, 2014]

shape_change_year:
  Type: int64
  Non-null: 924 / 924
  Sample values: [2022, 2018, 2020]

demolished_year:
  Type: int64
  Non-null: 924 / 924
  Sample values: [2022, 0, 0]

raw_response:
  Type: object
  Non-null: 923 / 924
  Sample values: ['{\n"bbox": "none",\n"presence": false,\n"appearance_image_id": "Imag

In [6]:
import json

# Parse raw_response to extract appearance_image_id
def extract_appearance_image_id(raw_response):
    """Return the ``appearance_image_id`` field of a JSON-encoded response.

    Args:
        raw_response: The raw response text, expected to hold a JSON object.
            Non-string inputs (e.g. ``None`` or a float NaN for a missing
            cell) are tolerated.

    Returns:
        The value stored under the ``"appearance_image_id"`` key, or the
        string ``'none'`` when the input cannot be parsed as JSON, does not
        decode to a JSON object, or does not contain the key.
    """
    try:
        data = json.loads(raw_response)
    except (TypeError, ValueError):
        # TypeError: input is not str/bytes (e.g. NaN or None).
        # ValueError: covers json.JSONDecodeError for malformed text.
        # Anything else (KeyboardInterrupt, SystemExit, ...) is deliberately
        # allowed to propagate instead of being silently swallowed.
        return 'none'

    # Valid JSON that is not an object (list, number, string, null) has no keys to look up.
    if not isinstance(data, dict):
        return 'none'

    return data.get('appearance_image_id', 'none')

# Add a column holding the image id parsed out of every row's raw response.
df['appearance_image_id'] = df['raw_response'].map(extract_appearance_image_id)

# Report how often each extracted id occurs, ordered by id label.
print("Appearance Image ID extracted", "\nSample values:", sep="\n")
print(df['appearance_image_id'].value_counts().sort_index())

Appearance Image ID extracted

Sample values:
appearance_image_id
Image 1    579
Image 2     46
Image 3    209
Image 4     36
Image 5     44
none        10
Name: count, dtype: int64


In [7]:
# Presence breakdown: absolute counts per value, then each count as a
# percentage of all rows (rounded to two decimals).
print("=" * 60, "PRESENCE ANALYSIS", "=" * 60, sep="\n")

presence_counts = df['presence'].value_counts()
print("\nPresence Distribution:", presence_counts, sep="\n")
print("\nPercentage:", presence_counts.div(len(df)).mul(100).round(2), sep="\n")

PRESENCE ANALYSIS

Presence Distribution:
presence
True     701
False    223
Name: count, dtype: int64

Percentage:
presence
True     75.87
False    24.13
Name: count, dtype: float64


In [8]:
# Appearance image id breakdown: counts per id (sorted by id label), then the
# same counts as a percentage of all rows (rounded to two decimals).
print("=" * 60, "APPEARANCE IMAGE ID ANALYSIS", "=" * 60, sep="\n")

appearance_counts = df['appearance_image_id'].value_counts().sort_index()
print("\nAppearance Image ID Distribution:", appearance_counts, sep="\n")
print("\nPercentage:", appearance_counts.div(len(df)).mul(100).round(2), sep="\n")

APPEARANCE IMAGE ID ANALYSIS

Appearance Image ID Distribution:
appearance_image_id
Image 1    579
Image 2     46
Image 3    209
Image 4     36
Image 5     44
none        10
Name: count, dtype: int64

Percentage:
appearance_image_id
Image 1    62.66
Image 2     4.98
Image 3    22.62
Image 4     3.90
Image 5     4.76
none        1.08
Name: count, dtype: float64


In [9]:
# Contingency table of presence (rows) against appearance image id (columns),
# including the row/column totals in the "All" margins.
print("=" * 60, "CROSS-TABULATION: PRESENCE vs APPEARANCE IMAGE ID", "=" * 60, sep="\n")

crosstab = pd.crosstab(
    index=df['presence'],
    columns=df['appearance_image_id'],
    margins=True,
)
print(crosstab)

CROSS-TABULATION: PRESENCE vs APPEARANCE IMAGE ID
appearance_image_id  Image 1  Image 2  Image 3  Image 4  Image 5  none  All
presence                                                                   
False                    205        3        4        0        1    10  223
True                     374       43      205       36       43     0  701
All                      579       46      209       36       44    10  924


In [10]:
# Appearance year breakdown: counts per year value (sorted ascending), then
# the same counts as a percentage of all rows (rounded to two decimals).
print("=" * 60, "APPEARANCE YEAR ANALYSIS", "=" * 60, sep="\n")

appearance_year_counts = df['appearance_year'].value_counts().sort_index()
print("\nAppearance Year Distribution:", appearance_year_counts, sep="\n")
print("\nPercentage:", appearance_year_counts.div(len(df)).mul(100).round(2), sep="\n")

APPEARANCE YEAR ANALYSIS

Appearance Year Distribution:
appearance_year
0        10
2014    579
2016     46
2018    209
2020     36
2022     44
Name: count, dtype: int64

Percentage:
appearance_year
0        1.08
2014    62.66
2016     4.98
2018    22.62
2020     3.90
2022     4.76
Name: count, dtype: float64


In [11]:
# Translate each image id into the year it is assumed to correspond to.
# Assumption: Image 1 = 2014, Image 2 = 2016, Image 3 = 2018, Image 4 = 2020,
# Image 5 = 2022, Image 6 = 2024; the 'none' placeholder maps to 0.
image_to_year = dict(zip(
    ('Image 1', 'Image 2', 'Image 3', 'Image 4', 'Image 5', 'Image 6', 'none'),
    (2014, 2016, 2018, 2020, 2022, 2024, 0),
))

df['appearance_year_from_image'] = df['appearance_image_id'].map(image_to_year)

print("=" * 60, "COMPARISON: APPEARANCE YEAR vs APPEARANCE IMAGE ID (as year)", "=" * 60, sep="\n")

# Cross-tabulate the CSV's year column against the year derived from the image id.
comparison = pd.crosstab(
    index=df['appearance_year'],
    columns=df['appearance_year_from_image'],
    margins=True,
)
print(comparison)

# Rows where the two year values disagree.
mismatches = df.loc[df['appearance_year'].ne(df['appearance_year_from_image'])]
print(f"\n\nMismatches: {len(mismatches)} out of {len(df)} ({len(mismatches)/len(df)*100:.2f}%)")

COMPARISON: APPEARANCE YEAR vs APPEARANCE IMAGE ID (as year)
appearance_year_from_image   0  2014  2016  2018  2020  2022  All
appearance_year                                                  
0                           10     0     0     0     0     0   10
2014                         0   579     0     0     0     0  579
2016                         0     0    46     0     0     0   46
2018                         0     0     0   209     0     0  209
2020                         0     0     0     0    36     0   36
2022                         0     0     0     0     0    44   44
All                         10   579    46   209    36    44  924


Mismatches: 0 out of 924 (0.00%)


In [12]:
# Appearance year report: one line per distinct year value with its count and
# share of all rows, followed by the total, the distinct-value count and the
# range of real (non-zero) years.
print("="*60)
print("APPEARANCE YEAR COUNT (DIFFERENT YEARS)")
print("="*60)

year_counts = df['appearance_year'].value_counts().sort_index()

print("\nYear-wise Count:")
for year, count in year_counts.items():
    # Year 0 is the placeholder for "no appearance year"; tag it as (None).
    label = f"{year} (None):" if year == 0 else f"{year}:"
    # Pad every label to one fixed width so the counts line up with each
    # other and with the "Total:" row below (both use a 14-character prefix).
    print(f"  {label:<12}{count:>4} samples  ({count/len(df)*100:>5.2f}%)")

print(f"\nTotal:        {len(df):>4} samples")
# NOTE: nunique() runs on the whole column, so the 0 placeholder is counted as a distinct value.
print(f"\nUnique years: {df['appearance_year'].nunique()}")

# Drop the 0 placeholder once and reuse the result for both ends of the range.
valid_years = df.loc[df['appearance_year'] > 0, 'appearance_year']
print(f"Year range:   {valid_years.min()} - {valid_years.max()}")

APPEARANCE YEAR COUNT (DIFFERENT YEARS)

Year-wise Count:
  0 (None):    10 samples  ( 1.08%)
  2014:         579 samples  (62.66%)
  2016:          46 samples  ( 4.98%)
  2018:         209 samples  (22.62%)
  2020:          36 samples  ( 3.90%)
  2022:          44 samples  ( 4.76%)

Total:         924 samples

Unique years: 6
Year range:   2014 - 2022
