# Data Preprocessing
Extracted preprocessing steps from `project pipeline.ipynb`.
Ensure your GTD Excel file is uploaded to the path shown in the `base_path` cell before running.

In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn xgboost sqlalchemy plotly joblib -q

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sqlite3
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib
import warnings
import os
warnings.filterwarnings('ignore')
sns.set_style("whitegrid")
print("Setup complete")

In [None]:
# Mount Google Drive at /content/drive; the project folder used by the
# following cells lives under this mount point.
# The import sits in this cell rather than with the other imports, presumably
# because the `google.colab` package is only available on Colab runtimes.
from google.colab import drive
drive.mount('/content/drive')
# NOTE: On non-Colab environments, remove or adapt the above drive.mount call and ensure `base_path` points to a local location.

In [None]:
# Root folder of the project on the mounted Drive. Every later cell builds its
# input and output paths from this value, so this is the only line to change
# when the project is moved.
base_path = '/content/drive/MyDrive/LABS/Project-1-GTD-Analytics'

# Create the working sub-folders; exist_ok makes the cell safe to re-run.
for subdir in ('data', 'dashboards', 'sql'):
    os.makedirs(f'{base_path}/{subdir}', exist_ok=True)

print(f"Ready at: {base_path}")
# Derive the upload hint from base_path and name the file the loading cell
# actually opens (`<base_path>/data/Copy of GTD`), so the hint cannot drift
# from the path that is read.
print(f"Upload the GTD Excel file to: {base_path}/data/Copy of GTD")

In [None]:
# Dataset: load the raw GTD workbook from the project's data folder.
# NOTE(review): the file name carries no extension; pd.read_excel then has to
# detect the workbook format from the file contents — confirm the uploaded
# file is a real Excel workbook.
gtd_path = f'{base_path}/data/Copy of GTD'
df = pd.read_excel(gtd_path)

# Summary of what was loaded. The section headers use an escaped "\n": a raw
# line break inside a quoted literal is a SyntaxError.
print(f"Loaded {df.shape[0]:,} attacks × {df.shape[1]} columns")
print("\nKey columns found:")
print(df.columns.tolist()[:20])
print("\nSample data:")

# Preview only the columns that exist in this export, so a trimmed dataset
# yields a narrower preview instead of a KeyError.
preview_cols = ['iyear', 'imonth', 'iday', 'country_txt', 'region_txt',
                'attacktype1_txt', 'nkill', 'nwound']
df[[c for c in preview_cols if c in df.columns]].head()

In [None]:
# ETL: build `df_etl` / `df_clean` from the raw frame `df` and write the result
# to <base_path>/data/gtd_cleaned.csv. The raw frame itself is not modified.
print("MASTER ETL PIPELINE")

# STEP 1: Select columns.
# Keep only the requested columns that exist in the raw frame. Selecting first
# and copying once avoids duplicating the full raw frame just to discard most
# of it.
cols = ['eventid', 'iyear', 'imonth', 'iday', 'country_txt', 'region_txt',
        'city', 'latitude', 'longitude', 'attacktype1_txt', 'targtype1_txt',
        'weaptype1_txt', 'nkill', 'nwound', 'success']
available_cols = [col for col in cols if col in df.columns]
df_etl = df[available_cols].copy()

# STEP 2: Convert ALL to SQLite-safe types FIRST.
# Integer-like fields: unparseable or missing values become 0.
for col in ['iyear', 'imonth', 'iday', 'nkill', 'nwound', 'success']:
    if col in df_etl.columns:
        df_etl[col] = pd.to_numeric(df_etl[col], errors='coerce').fillna(0).astype(int)

# Coordinates stay floating point.
# NOTE(review): missing coordinates are replaced by 0, which places such events
# at (0, 0) on a map — filter them out before plotting, or confirm that
# downstream code expects this.
for col in ['latitude', 'longitude']:
    if col in df_etl.columns:
        df_etl[col] = pd.to_numeric(df_etl[col], errors='coerce').fillna(0)

if 'city' in df_etl.columns:
    df_etl['city'] = df_etl['city'].fillna('Unknown')

# STEP 3: Create SQLite-safe date strings (NO Period objects).
# Plain 'YYYY-MM' / 'YYYY-MM-DD' text; month and day are zero-padded to two
# digits, so a 0 month or day appears as '00'.
year_txt = df_etl['iyear'].astype(str)
month_txt = df_etl['imonth'].astype(str).str.zfill(2)
day_txt = df_etl['iday'].astype(str).str.zfill(2)
df_etl['year_month'] = year_txt + '-' + month_txt
df_etl['date_str'] = year_txt + '-' + month_txt + '-' + day_txt

# STEP 4: Features (all numeric/string).
df_etl['casualties'] = df_etl['nkill'] + df_etl['nwound']
# The +1 in the denominator prevents division by zero when there are no
# casualties; as a consequence the rate is always strictly below 1.
df_etl['fatality_rate'] = df_etl['nkill'] / (df_etl['casualties'] + 1)

# STEP 5: Severity (string only).
# pd.cut bins are right-closed with an open lowest edge by default, so a value
# of exactly 0 would fall outside (0, 1] and end up as the string 'nan'.
# include_lowest=True closes the first bin, giving:
#   0-1 -> Low, 2-10 -> Medium, 11-50 -> High, >50 -> Extreme.
df_etl['severity'] = pd.cut(df_etl['casualties'],
                            bins=[0, 1, 10, 50, np.inf],
                            labels=['Low', 'Medium', 'High', 'Extreme'],
                            include_lowest=True).astype(str)

# SAVE CLEAN VERSION for charts/ML.
df_clean = df_etl.copy()
df_clean.to_csv(f'{base_path}/data/gtd_cleaned.csv', index=False)

print("SUCCESS! ETL complete:")
print(f"- {len(df_etl):,} rows processed")
print("- df_clean saved for charts/ML")

In [None]:
# Round-trip check: read back the CSV written by the ETL step and rebind
# `df_clean` to the on-disk version, then show its shape and first rows.
cleaned_csv_path = f'{base_path}/data/gtd_cleaned.csv'
df_clean = pd.read_csv(cleaned_csv_path)
print('Cleaned data loaded:', df_clean.shape)
df_clean.head(n=5)