## Licensing Note

This project uses humanitarian datasets from [Tech for Palestine Data Hub](https://data.techforpalestine.org), licensed under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/). Proper attribution has been maintained.


In [10]:
import pandas as pd
import numpy as np
import json

casualties = pd.read_csv('/kaggle/input/gaza-and-west-bank-humanitarian-data-202324/casualties_daily.csv')
west_bank = pd.read_csv('/kaggle/input/gaza-and-west-bank-humanitarian-data-202324/west_bank_daily.csv')
press_killed = pd.read_csv('/kaggle/input/gaza-and-west-bank-humanitarian-data-202324/press_killed_in_gaza.csv')

with open('/kaggle/input/gaza-and-west-bank-humanitarian-data-202324/infrastructure-damaged.json') as f:
    infra_json = json.load(f)

In [11]:
casualties.head()
west_bank.head()
press_killed.head()
infra_json

[{'report_date': '2023-10-07',
  'civic_buildings': {'ext_destroyed': 5},
  'educational_buildings': {'ext_destroyed': 1, 'ext_damaged': 15},
  'places_of_worship': {'ext_mosques_destroyed': 2,
   'ext_mosques_damaged': 4,
   'ext_churches_destroyed': 0},
  'residential': {'ext_destroyed': 80}},
 {'report_date': '2023-10-08',
  'civic_buildings': {'ext_destroyed': 11},
  'educational_buildings': {'ext_destroyed': 1, 'ext_damaged': 30},
  'places_of_worship': {'ext_mosques_destroyed': 4,
   'ext_mosques_damaged': 8,
   'ext_churches_destroyed': 0},
  'residential': {'destroyed': 159, 'ext_destroyed': 159}},
 {'report_date': '2023-10-09',
  'civic_buildings': {'ext_destroyed': 16},
  'educational_buildings': {'ext_destroyed': 2, 'ext_damaged': 45},
  'places_of_worship': {'ext_mosques_destroyed': 6,
   'ext_mosques_damaged': 12,
   'ext_churches_destroyed': 0},
  'residential': {'destroyed': 790, 'ext_destroyed': 790}},
 {'report_date': '2023-10-10',
  'civic_buildings': {'ext_destroyed'

## Data Cleaning

### Casualties Daily Cleaning

In [15]:
# 1. Check initial structure
casualties.info()
casualties.head()

# 2. Convert report_date to datetime
casualties['report_date'] = pd.to_datetime(casualties['report_date'], errors='coerce')

# 3. Handle missing values
# Fill missing 'killed', 'killed_cum', 'injured_cum' carefully
casualties['killed'] = casualties['killed'].fillna(0)
casualties['killed_cum'] = casualties['killed_cum'].ffill()  # Forward fill cumulative counts
casualties['injured_cum'] = casualties['injured_cum'].ffill()

# Optional for 'med_killed_cum' and 'press_killed_cum' if they exist
for col in ['med_killed_cum', 'press_killed_cum']:
    if col in casualties.columns:
        casualties[col] = casualties[col].fillna(0)

# 4. Standardise data types
casualties['killed'] = casualties['killed'].astype(int)
casualties['killed_cum'] = casualties['killed_cum'].astype(int)
casualties['injured_cum'] = casualties['injured_cum'].astype(int)

# 5. (Optional) Drop redundant internal counts
# casualties = casualties.drop(columns=['killed_internal', 'injured_internal'], errors='ignore')

# 6. Save cleaned file
casualties.to_csv('/kaggle/working/cleaned_casualties_daily.csv', index=False)

print("Casualties Daily Cleaning Completed and Saved!")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554 entries, 0 to 553
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   report_date              554 non-null    datetime64[ns]
 1   report_source            554 non-null    object        
 2   report_period            554 non-null    int64         
 3   ext_massacres_cum        554 non-null    int64         
 4   killed                   554 non-null    int64         
 5   killed_cum               554 non-null    int64         
 6   ext_killed               554 non-null    int64         
 7   ext_killed_cum           554 non-null    int64         
 8   ext_killed_children_cum  554 non-null    int64         
 9   ext_killed_women_cum     554 non-null    int64         
 10  injured_cum              554 non-null    int64         
 11  ext_injured              554 non-null    int64         
 12  ext_injured_cum          554 non-nul

In [16]:
casualties.info()
casualties.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554 entries, 0 to 553
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   report_date              554 non-null    datetime64[ns]
 1   report_source            554 non-null    object        
 2   report_period            554 non-null    int64         
 3   ext_massacres_cum        554 non-null    int64         
 4   killed                   554 non-null    int64         
 5   killed_cum               554 non-null    int64         
 6   ext_killed               554 non-null    int64         
 7   ext_killed_cum           554 non-null    int64         
 8   ext_killed_children_cum  554 non-null    int64         
 9   ext_killed_women_cum     554 non-null    int64         
 10  injured_cum              554 non-null    int64         
 11  ext_injured              554 non-null    int64         
 12  ext_injured_cum          554 non-nul

Unnamed: 0,report_date,report_source,report_period,ext_massacres_cum,killed,killed_cum,ext_killed,ext_killed_cum,ext_killed_children_cum,ext_killed_women_cum,injured_cum,ext_injured,ext_injured_cum,ext_civdef_killed_cum,med_killed_cum,ext_med_killed_cum,press_killed_cum,ext_press_killed_cum
0,2023-10-07,mohtel,24,0,232,232,232,232,0,0,1610,1610,1610,0,6.0,6,1.0,1
1,2023-10-08,mohtel,24,0,138,370,138,370,78,41,1788,178,1788,0,0.0,6,1.0,1
2,2023-10-09,mohtel,24,8,190,560,190,560,91,61,2271,483,2271,0,6.0,6,3.0,3
3,2023-10-10,mohtel,24,8,340,900,340,900,260,230,4000,1729,4000,0,0.0,6,7.0,7
4,2023-10-11,gmotel,24,23,200,1100,200,1100,398,230,5184,1184,5184,0,10.0,10,0.0,7


### West Bank Daily Cleaning

In [17]:
# 1. Check initial structure
west_bank.info()
west_bank.head()

# 2. Convert report_date to datetime
west_bank['report_date'] = pd.to_datetime(west_bank['report_date'], errors='coerce')

# 3. Handle missing values
# Find all columns starting with 'verified'
verified_cols = [col for col in west_bank.columns if col.startswith('verified')]

# Fill missing values in verified columns with 0
west_bank[verified_cols] = west_bank[verified_cols].fillna(0)

# 4. Standardise data types
west_bank[verified_cols] = west_bank[verified_cols].astype(int)

# 5. (Optional) Focus only on verified data
# If you want to drop internal counts (non-verified fields), you can uncomment this:
# non_verified_cols = [col for col in west_bank.columns if col.startswith('internal')]
# west_bank = west_bank.drop(columns=non_verified_cols, errors='ignore')

# 6. Save cleaned file
west_bank.to_csv('/kaggle/working/cleaned_west_bank_daily.csv', index=False)

print("West Bank Daily Cleaning Completed and Saved!")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554 entries, 0 to 553
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   report_date                    554 non-null    object 
 1   verified.killed                403 non-null    float64
 2   verified.killed_cum            404 non-null    float64
 3   verified.injured               389 non-null    float64
 4   verified.injured_cum           391 non-null    float64
 5   verified.killed_children       403 non-null    float64
 6   verified.killed_children_cum   404 non-null    float64
 7   verified.injured_children      389 non-null    float64
 8   verified.injured_children_cum  391 non-null    float64
 9   killed_cum                     554 non-null    int64  
 10  killed_children_cum            554 non-null    int64  
 11  injured_cum                    554 non-null    int64  
 12  injured_children_cum           554 non-null    int

In [18]:
# Quick Verification

# 1. Check for any remaining missing values
missing_values = west_bank.isnull().sum()
print("Missing Values:\n", missing_values)

# 2. Check data types
print("\nData Types:\n", west_bank.dtypes)

# 3. Preview a few rows
west_bank.head()

# 4. Quick Date Range Check
print("\nDate Range:")
print(f"From {west_bank['report_date'].min()} to {west_bank['report_date'].max()}")


Missing Values:
 report_date                      0
verified.killed                  0
verified.killed_cum              0
verified.injured                 0
verified.injured_cum             0
verified.killed_children         0
verified.killed_children_cum     0
verified.injured_children        0
verified.injured_children_cum    0
killed_cum                       0
killed_children_cum              0
injured_cum                      0
injured_children_cum             0
settler_attacks_cum              0
flash_source                     0
dtype: int64

Data Types:
 report_date                      datetime64[ns]
verified.killed                           int64
verified.killed_cum                       int64
verified.injured                          int64
verified.injured_cum                      int64
verified.killed_children                  int64
verified.killed_children_cum              int64
verified.injured_children                 int64
verified.injured_children_cum             int64

### Press Killed Cleaning

In [19]:
# 1. Check initial structure
press_killed.info()
press_killed.head()

# 2. Handle missing values
# Fill missing 'notes' field with a standard placeholder
if 'notes' in press_killed.columns:
    press_killed['notes'] = press_killed['notes'].fillna('Details not available')

# 3. (Optional) Future analysis prep: Extract structured fields
# For now, we'll just clean — deeper extraction can be a bonus project

# 4. Save cleaned file
press_killed.to_csv('/kaggle/working/cleaned_press_killed_in_gaza.csv', index=False)

print("Press Killed Cleaning Completed and Saved!")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     209 non-null    object
 1   name_en  209 non-null    object
 2   notes    156 non-null    object
dtypes: object(3)
memory usage: 5.0+ KB
Press Killed Cleaning Completed and Saved!


In [20]:
# Quick Verification

# 1. Check for missing values
missing_values_press = press_killed.isnull().sum()
print("Missing Values:\n", missing_values_press)

# 2. Check data types
print("\nData Types:\n", press_killed.dtypes)

# 3. Preview a few rows
press_killed.head()

# 4. Quick Date Range Check (optional)
if 'date_of_death' in press_killed.columns:
    press_killed['date_of_death'] = pd.to_datetime(press_killed['date_of_death'], errors='coerce')
    print("\nDate Range:")
    print(f"From {press_killed['date_of_death'].min()} to {press_killed['date_of_death'].max()}")


Missing Values:
 name       0
name_en    0
notes      0
dtype: int64

Data Types:
 name       object
name_en    object
notes      object
dtype: object


### Infrastructure Damaged Cleaning

In [21]:
# 1. Flatten the JSON structure
infra_flat = pd.json_normalize(infra_json)

# 2. Standardise column names: lowercase + underscores
infra_flat.columns = infra_flat.columns.str.lower().str.replace(' ', '_').str.replace('.', '_')

# 3. Convert report_date to datetime
if 'report_date' in infra_flat.columns:
    infra_flat['report_date'] = pd.to_datetime(infra_flat['report_date'], errors='coerce')

# 4. (Optional) Quickly check structure
infra_flat.info()
infra_flat.head()

# 5. Save cleaned file
infra_flat.to_csv('/kaggle/working/cleaned_infrastructure_damaged.csv', index=False)

print("Infrastructure Damaged Cleaning Completed and Saved!")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 534 entries, 0 to 533
Data columns (total 15 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   report_date                               534 non-null    datetime64[ns]
 1   civic_buildings_ext_destroyed             534 non-null    int64         
 2   educational_buildings_ext_destroyed       534 non-null    int64         
 3   educational_buildings_ext_damaged         534 non-null    int64         
 4   places_of_worship_ext_mosques_destroyed   534 non-null    int64         
 5   places_of_worship_ext_mosques_damaged     360 non-null    float64       
 6   places_of_worship_ext_churches_destroyed  534 non-null    int64         
 7   residential_ext_destroyed                 534 non-null    int64         
 8   residential_destroyed                     106 non-null    float64       
 9   civic_buildings_destroyed       

In [22]:
# Quick Verification

# 1. Check for missing values
missing_values_infra = infra_flat.isnull().sum()
print("Missing Values:\n", missing_values_infra)

# 2. Check data types
print("\nData Types:\n", infra_flat.dtypes)

# 3. Preview a few rows
infra_flat.head()

# 4. Quick Date Range Check
if 'report_date' in infra_flat.columns:
    print("\nDate Range:")
    print(f"From {infra_flat['report_date'].min()} to {infra_flat['report_date'].max()}")


Missing Values:
 report_date                                   0
civic_buildings_ext_destroyed                 0
educational_buildings_ext_destroyed           0
educational_buildings_ext_damaged             0
places_of_worship_ext_mosques_destroyed       0
places_of_worship_ext_mosques_damaged       174
places_of_worship_ext_churches_destroyed      0
residential_ext_destroyed                     0
residential_destroyed                       428
civic_buildings_destroyed                   451
educational_buildings_destroyed             445
educational_buildings_damaged               444
places_of_worship_mosques_destroyed         453
places_of_worship_mosques_damaged           464
places_of_worship_churches_destroyed        451
dtype: int64

Data Types:
 report_date                                 datetime64[ns]
civic_buildings_ext_destroyed                        int64
educational_buildings_ext_destroyed                  int64
educational_buildings_ext_damaged                    int64


*End of data cleaning process