In [25]:
import pandas as pd
import numpy as np

# Set display options (optional, just for nicer viewing)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 120)

# ---------- FILE PATHS (EDIT THESE) ----------
GTD_PATH = "data/gtd_raw.csv"                 # full original GTD
WB_POP_PATH = "data/worldbank_population.csv"
WB_GDP_PATH = "data/worldbank_gdp.csv"

OUTPUT_GTD_CLEAN = "output/gtd_cleaned_main.csv"
OUTPUT_DICT = "output/data_dictionary.csv"


In [26]:
import pandas as pd

# Load GTD – sometimes it’s tab-delimited or ISO-8859-1
gtd = pd.read_csv('/content/globalterrorismdb_0718dist.csv', encoding='ISO-8859-1', engine= 'python', on_bad_lines= 'skip')

print("GTD shape:", gtd.shape)
print("GTD columns:", len(gtd.columns))
print(gtd[['iyear']].agg(['min', 'max']))

# Missing values summary (for screenshot)
gtd_missing = gtd.isna().mean().sort_values(ascending=False) * 100
print(gtd_missing.head(30))



GTD shape: (181691, 135)
GTD columns: 135
     iyear
min   1970
max   2017
gsubname3           99.988992
weapsubtype4        99.961473
weapsubtype4_txt    99.961473
weaptype4_txt       99.959822
weaptype4           99.959822
claimmode3          99.926799
claimmode3_txt      99.926799
gsubname2           99.911938
claim3              99.824978
guncertain3         99.823877
divert              99.821675
gname3              99.821675
attacktype3         99.764435
attacktype3_txt     99.764435
ransomnote          99.718203
ransompaidus        99.696187
ransomamtus         99.690133
claimmode2_txt      99.660963
claimmode2          99.660963
ransompaid          99.574002
corp3               99.435305
targsubtype3_txt    99.396228
targsubtype3        99.396228
natlty3_txt         99.368708
natlty3             99.368708
target3             99.353298
targtype3           99.352747
targtype3_txt       99.352747
ransomamt           99.256980
weapsubtype3_txt    99.068198
dtype: float64


In [27]:
# World Bank Population
wb_pop = pd.read_csv('/content/API_SP.POP.TOTL_DS2_en_csv_v2_246068.csv', skiprows=4, encoding= 'utf-8')
print("WB pop shape:", wb_pop.shape)
print(wb_pop.head())

# Missing values summary
wb_pop_missing = wb_pop.isna().mean() * 100
print(wb_pop_missing)

WB pop shape: (266, 70)
                  Country Name Country Code     Indicator Name Indicator Code         1960         1961         1962  \
0                        Aruba          ABW  Population, total    SP.POP.TOTL      54922.0      55578.0      56320.0   
1  Africa Eastern and Southern          AFE  Population, total    SP.POP.TOTL  130075728.0  133534923.0  137171659.0   
2                  Afghanistan          AFG  Population, total    SP.POP.TOTL    9035043.0    9214083.0    9404406.0   
3   Africa Western and Central          AFW  Population, total    SP.POP.TOTL   97630925.0   99706674.0  101854756.0   
4                       Angola          AGO  Population, total    SP.POP.TOTL    5231654.0    5301583.0    5354310.0   

          1963         1964         1965         1966         1967         1968         1969         1970  \
0      57002.0      57619.0      58190.0      58694.0      58990.0      59069.0      59052.0      58950.0   
1  140945536.0  144904094.0  14903347

In [28]:
# World Bank GDP
wb_gdp = pd.read_csv('/content/API_NY.GDP.PCAP.CD_DS2_en_csv_v2_267552.csv' , skiprows=4, encoding= 'utf-8')
print("WB gdp shape:", wb_gdp.shape)

wb_gdp_missing = wb_gdp.isna().mean() * 100
print(wb_gdp_missing)

WB gdp shape: (266, 70)
Country Name        0.000000
Country Code        0.000000
Indicator Name      0.000000
Indicator Code      0.000000
1960               43.233083
                     ...    
2021                3.383459
2022                3.759398
2023                6.766917
2024               13.157895
Unnamed: 69       100.000000
Length: 70, dtype: float64


In [29]:
# Keep original for documentation if you want
gtd_raw_rows = len(gtd)

# Filter for the actual available data range (1970-1991) based on the current dataset.
# If you intend to work with data from 2000-2020, you will need to load a different GTD dataset.
gtd = gtd[(gtd['iyear'] >= 1970) & (gtd['iyear'] <= 1991)].copy()
gtd_filtered_rows = len(gtd)

print(f"Started with {gtd_raw_rows} rows, after filtering 1970–1991: {gtd_filtered_rows} rows")

Started with 181691 rows, after filtering 1970–1991: 49644 rows


In [30]:
cols_keep = [
    'eventid',
    'iyear', 'imonth', 'iday',
    'country_txt', 'region_txt', 'city', 'latitude', 'longitude',
    'attacktype1_txt', 'targtype1_txt', 'weaptype1_txt',
    'gname',
    'nkill', 'nwound', 'success', 'property', 'propextent_txt',
    'summary'  # optional
]

gtd = gtd[cols_keep].copy()
print("Remaining columns:", gtd.columns.tolist())


Remaining columns: ['eventid', 'iyear', 'imonth', 'iday', 'country_txt', 'region_txt', 'city', 'latitude', 'longitude', 'attacktype1_txt', 'targtype1_txt', 'weaptype1_txt', 'gname', 'nkill', 'nwound', 'success', 'property', 'propextent_txt', 'summary']


In [31]:
# Casualties: nkill, nwound
for col in ['nkill', 'nwound']:
    # Replace -99 with NaN first
    gtd[col] = gtd[col].replace(-99, np.nan)
    # Create clean columns
    clean_col = col + '_clean'
    gtd[clean_col] = gtd[col].fillna(0)

# Document how many were missing
nkill_missing_count = gtd['nkill'].isna().sum()
nwound_missing_count = gtd['nwound'].isna().sum()
print("nkill missing originally:", nkill_missing_count)
print("nwound missing originally:", nwound_missing_count)

# Country: drop rows with missing country_txt
before_country = len(gtd)
gtd = gtd[~gtd['country_txt'].isna()].copy()
after_country = len(gtd)
print(f"Rows dropped due to missing country_txt: {before_country - after_country}")

# Attack / target / weapon: fill blanks with "Unknown"
cat_cols_unknown = ['attacktype1_txt', 'targtype1_txt', 'weaptype1_txt']
for col in cat_cols_unknown:
    gtd[col] = gtd[col].fillna("Unknown")

# Perpetrator group
gtd['gname'] = gtd['gname'].fillna("Unknown")

# Coordinates: leave as is (no filling)
# city: you may choose to fill or leave as is; usually leave.


nkill missing originally: 5793
nwound missing originally: 7613
Rows dropped due to missing country_txt: 0


In [32]:
# Total casualties (temporarily using clean columns)
gtd['total_casualties'] = gtd['nkill_clean'] + gtd['nwound_clean']

high_casualties = gtd[gtd['total_casualties'] > 1000].copy()
print("Incidents with >1000 casualties:", len(high_casualties))
high_casualties[['eventid', 'iyear', 'country_txt', 'city', 'total_casualties']].sort_values('total_casualties', ascending=False).head(20)


Incidents with >1000 casualties: 0


Unnamed: 0,eventid,iyear,country_txt,city,total_casualties


In [33]:
country_mapping = {
    "Russia": "Russian Federation",
    "South Korea": "Korea, Rep.",
    "North Korea": "Korea, Dem. People's Rep.",
    "Congo (Kinshasa)": "Congo, Dem. Rep.",
    "Congo (Brazzaville)": "Congo, Rep.",
    "Egypt": "Egypt, Arab Rep.",
    # add more as you discover mismatches
}

gtd['country_txt'] = gtd['country_txt'].replace(country_mapping)


In [34]:
# if day is 0 (unknown), use 1
gtd['iday'] = gtd['iday'].replace(0, 1)
gtd['imonth'] = gtd['imonth'].replace(0, 1)  # sometimes month = 0; you can also default to Jan

gtd['date'] = pd.to_datetime(dict(
    year=gtd['iyear'],
    month=gtd['imonth'],
    day=gtd['iday']
), errors='coerce')  # coerce in case of weird records

print(gtd[['iyear', 'imonth', 'iday', 'date']].head())


   iyear  imonth  iday       date
0   1970       7     2 1970-07-02
1   1970       1     1 1970-01-01
2   1970       1     1 1970-01-01
3   1970       1     1 1970-01-01
4   1970       1     1 1970-01-01


In [35]:
value_counts = gtd['weaptype1_txt'].value_counts()
rare_weap = value_counts[value_counts < 50].index

gtd['weaptype1_simplified'] = gtd['weaptype1_txt'].replace(
    {cat: "Other" for cat in rare_weap}
)


In [36]:
# 1) Casualty calculations
gtd['total_casualties'] = gtd['nkill_clean'] + gtd['nwound_clean']

def categorize(x):
    if x == 0:
        return "0"
    elif 1 <= x <= 50:
        return "1-50"
    elif 51 <= x <= 100:
        return "51-100"
    else:
        return ">100"

gtd['casualties_category'] = gtd['total_casualties'].apply(categorize)
gtd['fatalities_category'] = gtd['nkill_clean'].apply(categorize)
gtd['injuries_category'] = gtd['nwound_clean'].apply(categorize)

# 2) Temporal features
gtd['year'] = gtd['iyear']
gtd['month'] = gtd['imonth']

# Quarter
gtd['quarter'] = pd.PeriodIndex(gtd['date'], freq='Q').astype(str)

# Season mapping (simple Northern hemisphere assumption)
def season_from_month(m):
    if m in [12, 1, 2]:
        return "Winter"
    elif m in [3, 4, 5]:
        return "Spring"
    elif m in [6, 7, 8]:
        return "Summer"
    else:
        return "Fall"

gtd['season'] = gtd['imonth'].apply(season_from_month)

# 3) Target sector simplification
def map_target_sector(t):
    if pd.isna(t):
        return "Unknown"
    if t == "Private Citizens & Property":
        return "Civilian"
    if t == "Military":
        return "Military"
    if t == "Police":
        return "Security Forces"
    if t.startswith("Government"):
        return "Government"
    if t == "Business":
        return "Business"
    if t in ["Airports & Aircraft", "Transportation"]:
        return "Transportation"
    if t == "Educational Institution":
        return "Education"
    if "Religious" in t:
        return "Religious"
    # add more rules as needed
    return "Other"

gtd['target_sector'] = gtd['targtype1_txt'].apply(map_target_sector)

# 4) Success indicator
gtd['is_successful'] = gtd['success'].fillna(0).astype(int)


In [37]:
# Example: World Bank pop file columns:
# ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1960', ..., '2020']

year_cols = [str(y) for y in range(2000, 2021)]

wb_pop_long = wb_pop.melt(
    id_vars=['Country Name'],
    value_vars=year_cols,
    var_name='iyear',
    value_name='population'
)

# Convert year to int
wb_pop_long['iyear'] = wb_pop_long['iyear'].astype(int)

# Rename columns
wb_pop_long = wb_pop_long.rename(columns={'Country Name': 'country_txt'})

# Filter 2000–2020 just to be safe
wb_pop_long = wb_pop_long[(wb_pop_long['iyear'] >= 2000) & (wb_pop_long['iyear'] <= 2020)]

# Align country names to GTD standard
wb_pop_long['country_txt'] = wb_pop_long['country_txt'].replace(country_mapping)


In [38]:
year_cols = [str(y) for y in range(2000, 2021)]

# Assume GDP file has GDP total; if GDP per capita is separate, you may load it separately and merge.
wb_gdp_long = wb_gdp.melt(
    id_vars=['Country Name'],
    value_vars=year_cols,
    var_name='iyear',
    value_name='gdp'
)

wb_gdp_long['iyear'] = wb_gdp_long['iyear'].astype(int)
wb_gdp_long = wb_gdp_long.rename(columns={'Country Name': 'country_txt'})
wb_gdp_long['country_txt'] = wb_gdp_long['country_txt'].replace(country_mapping)

# If you have a GNI per capita or GDP per capita file, repeat + rename to gdp_per_capita


In [39]:
wb = pd.merge(
    wb_pop_long,
    wb_gdp_long,
    on=['country_txt', 'iyear'],
    how='outer'
)

print("World Bank combined shape:", wb.shape)


World Bank combined shape: (5586, 4)


In [40]:
merged = pd.merge(
    gtd,
    wb,
    on=['country_txt', 'iyear'],
    how='left'
)

print("Merged shape:", merged.shape)
print("Should equal cleaned GTD rows:", len(gtd))


Merged shape: (49644, 35)
Should equal cleaned GTD rows: 49644


In [41]:
# Avoid division by zero / missing
merged['population'] = merged['population'].replace(0, np.nan)

# attacks_per_million – each row is one attack; this metric is mostly for later aggregation.
merged['attacks_per_million'] = 1_000_000 / merged['population']

merged['casualties_per_million'] = merged['total_casualties'] * 1_000_000 / merged['population']
merged['fatalities_per_million'] = merged['nkill_clean'] * 1_000_000 / merged['population']


In [42]:
# No negative casualties
print((merged['nkill_clean'] < 0).sum(), (merged['nwound_clean'] < 0).sum())

# Date range check
print(merged['iyear'].min(), merged['iyear'].max())

# Duplicate eventid
dup_eventids = merged['eventid'].duplicated().sum()
print("Duplicate eventid count:", dup_eventids)

# Critical columns not missing
critical_cols = ['eventid', 'country_txt', 'iyear', 'attacktype1_txt']
print(merged[critical_cols].isna().sum())

# Countries missing economic data
missing_econ_countries = merged[merged['population'].isna()]['country_txt'].unique()
print("Countries with missing World Bank data:", missing_econ_countries)


0 0
1970 1991
Duplicate eventid count: 0
eventid            0
country_txt        0
iyear              0
attacktype1_txt    0
dtype: int64
Countries with missing World Bank data: ['Dominican Republic' 'Mexico' 'Philippines' 'Greece' 'Japan'
 'United States' 'Uruguay' 'Italy' 'East Germany (GDR)' 'Ethiopia'
 'Guatemala' 'Venezuela' 'West Germany (FRG)' 'Switzerland' 'Jordan'
 'Spain' 'Brazil' 'Egypt, Arab Rep.' 'Argentina' 'Lebanon' 'Ireland'
 'Turkey' 'Paraguay' 'Iran' 'United Kingdom' 'Colombia' 'Bolivia'
 'Nicaragua' 'Netherlands' 'Belgium' 'Canada' 'Australia' 'Pakistan'
 'Zambia' 'Sweden' 'Costa Rica' 'South Yemen' 'Cambodia' 'Israel' 'Poland'
 'Taiwan' 'Panama' 'Kuwait' 'West Bank and Gaza Strip' 'Austria'
 'Czechoslovakia' 'India' 'France' 'South Vietnam' 'Brunei' 'Zaire'
 "People's Republic of the Congo" 'Portugal' 'Algeria' 'El Salvador'
 'Thailand' 'Haiti' 'Sudan' 'Morocco' 'Cyprus' 'Myanmar' 'Afghanistan'
 'Peru' 'Chile' 'Honduras' 'Yugoslavia' 'Ecuador' 'New Zealand' 'Malaysi

In [43]:
total_incidents = len(merged)
total_casualties_all = merged['total_casualties'].sum()
date_range = (merged['date'].min(), merged['date'].max())
num_countries = merged['country_txt'].nunique()
num_regions = merged['region_txt'].nunique()

most_common_attack = merged['attacktype1_txt'].value_counts().idxmax()
most_common_target = merged['targtype1_txt'].value_counts().idxmax()

avg_casualties_per_incident = merged['total_casualties'].mean()

summary_stats = {
    "total_incidents": total_incidents,
    "total_casualties": int(total_casualties_all),
    "date_range": date_range,
    "num_countries": num_countries,
    "num_regions": num_regions,
    "most_common_attack_type": most_common_attack,
    "most_common_target_type": most_common_target,
    "avg_casualties_per_incident": avg_casualties_per_incident
}

summary_stats


{'total_incidents': 49644,
 'total_casualties': 161557,
 'date_range': (Timestamp('1970-01-01 00:00:00'),
  Timestamp('1991-12-31 00:00:00')),
 'num_countries': 164,
 'num_regions': 12,
 'most_common_attack_type': 'Bombing/Explosion',
 'most_common_target_type': 'Business',
 'avg_casualties_per_incident': np.float64(3.254310692127951)}

In [47]:
import os

final_cols = [
    'eventid', 'iyear', 'imonth', 'date',
    'country_txt', 'region_txt', 'city', 'latitude', 'longitude',
    'attacktype1_txt', 'targtype1_txt', 'weaptype1_txt',
    'target_sector',
    'gname',
    'nkill_clean', 'nwound_clean', 'total_casualties',
    'casualties_category', 'fatalities_category', 'injuries_category',
    'is_successful',
    'year', 'month', 'quarter', 'season',
    'population', 'gdp',
    'attacks_per_million', 'casualties_per_million', 'fatalities_per_million'
]

gtd_cleaned_main = merged[final_cols].copy()

# Create the output directory if it doesn't exist
os.makedirs(os.path.dirname(OUTPUT_GTD_CLEAN), exist_ok=True)
gtd_cleaned_main.to_csv(OUTPUT_GTD_CLEAN, index=False)
print("Saved:", OUTPUT_GTD_CLEAN)


Saved: output/gtd_cleaned_main.csv


In [48]:
data_dict_rows = []

for col in gtd_cleaned_main.columns:
    dtype = str(gtd_cleaned_main[col].dtype)

    # Guess data type category
    if "int" in dtype or "float" in dtype:
        dtype_cat = "number"
    elif "datetime" in dtype:
        dtype_cat = "date"
    else:
        dtype_cat = "text/category"

    if col in ['population', 'gdp', 'gdp_per_capita']:
        source = "World Bank"
    elif col in ['attacks_per_million', 'casualties_per_million', 'fatalities_per_million',
                 'total_casualties', 'casualties_category', 'fatalities_category', 'injuries_category',
                 'target_sector', 'is_successful', 'year', 'month', 'quarter', 'season']:
        source = "Derived"
    else:
        source = "GTD"

    example_val = gtd_cleaned_main[col].dropna().iloc[0] if gtd_cleaned_main[col].notna().any() else ""

    data_dict_rows.append({
        "column_name": col,
        "data_type": dtype_cat,
        "description": "",   # fill manually in Excel / later
        "source": source,
        "example_values": example_val,
        "missing_value_handling": ""  # fill manually based on your cleaning decisions
    })

data_dictionary = pd.DataFrame(data_dict_rows)
data_dictionary.to_csv(OUTPUT_DICT, index=False)
print("Saved:", OUTPUT_DICT)


Saved: output/data_dictionary.csv
