In [1]:
#Importar librerías y cargar base de datos

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("incident_event_log.csv")

In [2]:
#Reemplazar "?" por NaN
df=df.replace("?",np.nan)

# Total de filas
total_filas = len(df)

# Conteo de vacíos
faltantes = df.isna().sum()

# Porcentaje de vacíos
porcentaje_faltantes = (faltantes / total_filas) * 100

# DataFrame resumen
faltantes_df = pd.DataFrame({
    "Valores_faltantes": faltantes,
    "Porcentaje_faltantes": porcentaje_faltantes.round(2)
})

print(faltantes_df)

# --- Análisis de duplicados ---

# Número total de filas duplicadas
total_duplicados = df.duplicated().sum()
print(f"Total de filas duplicadas: {total_duplicados}")

# Mostrar las filas duplicadas (si existen)
if total_duplicados > 0:
    print("\n=== Filas duplicadas ===")
    display(df[df.duplicated(keep=False)])  # keep=False muestra todas las ocurrencias
else:
    print("\nNo se encontraron filas duplicadas.")

                         Valores_faltantes  Porcentaje_faltantes
number                                   0                  0.00
incident_state                           0                  0.00
active                                   0                  0.00
reassignment_count                       0                  0.00
reopen_count                             0                  0.00
sys_mod_count                            0                  0.00
made_sla                                 0                  0.00
caller_id                               29                  0.02
opened_by                             4835                  3.41
opened_at                                0                  0.00
sys_created_by                       53076                 37.45
sys_created_at                       53076                 37.45
sys_updated_by                           0                  0.00
sys_updated_at                           0                  0.00
contact_type             

In [3]:
# --- Calcular porcentaje de nulos por columna ---
porc_nulos = (df.isna().sum() / len(df)) * 100

print("=== Porcentaje de valores faltantes por columna ===")
print(porc_nulos.sort_values(ascending=False).round(2))

# --- Seleccionar columnas a eliminar (más del 5% de nulos) ---
cols_drop = porc_nulos[porc_nulos > 5].index.tolist()

print(f"\nColumnas a eliminar (>5% nulos): {cols_drop}")

# --- Eliminar columnas ---
df_limpio = df.drop(columns=cols_drop)

print(f"\nShape original: {df.shape}")
print(f"Shape después de limpieza: {df_limpio.shape}")


=== Porcentaje de valores faltantes por columna ===
caused_by                  99.98
vendor                     99.83
cmdb_ci                    99.69
rfc                        99.30
problem_id                 98.38
sys_created_by             37.45
sys_created_at             37.45
u_symptom                  23.26
assigned_to                19.40
assignment_group           10.03
opened_by                   3.41
resolved_at                 2.22
closed_code                 0.50
resolved_by                 0.16
subcategory                 0.08
category                    0.06
location                    0.05
caller_id                   0.02
incident_state              0.00
number                      0.00
reassignment_count          0.00
active                      0.00
contact_type                0.00
sys_updated_by              0.00
made_sla                    0.00
sys_mod_count               0.00
reopen_count                0.00
opened_at                   0.00
sys_updated_at          

In [6]:
# --- Convertir a datetime ---
df_limpio["opened_at"] = pd.to_datetime(df_limpio["opened_at"], errors="coerce")
df_limpio["resolved_at"] = pd.to_datetime(df_limpio["resolved_at"], errors="coerce")
df_limpio["closed_at"] = pd.to_datetime(df_limpio["closed_at"], errors="coerce")

# --- Crear métricas derivadas en días ---
df_limpio["resolution_time"] = (df_limpio["resolved_at"] - df_limpio["opened_at"]).dt.total_seconds() / 86400
df_limpio["closure_time"] = (df_limpio["closed_at"] - df_limpio["opened_at"]).dt.total_seconds() / 86400

# --- Estadísticas descriptivas ---
print("\n=== Estadísticas descriptivas de resolution_time ===")
print(df_limpio["resolution_time"].describe().round(2))

print("\n=== Estadísticas descriptivas de closure_time ===")
print(df_limpio["closure_time"].describe().round(2))

  df_limpio["opened_at"] = pd.to_datetime(df_limpio["opened_at"], errors="coerce")
  df_limpio["resolved_at"] = pd.to_datetime(df_limpio["resolved_at"], errors="coerce")



=== Estadísticas descriptivas de resolution_time ===
count    138571.00
mean         11.23
std          27.12
min           0.00
25%           0.17
50%           3.06
75%          10.92
max         336.26
Name: resolution_time, dtype: float64

=== Estadísticas descriptivas de closure_time ===
count    56316.00
mean        73.38
std        105.24
min       -289.96
25%        -22.65
50%         67.05
75%        162.27
max        555.63
Name: closure_time, dtype: float64


In [7]:
# Normalizar impact, urgency y priority a solo texto limpio
for col in ["impact", "urgency", "priority"]:
    if col in df_limpio.columns:
        # Convertir a string, quitar números + guiones, recortar espacios
        df_limpio[col] = (
            df_limpio[col]
            .astype(str)
            .str.replace(r"^\d+\s*-\s*", "", regex=True)  # quita "1 -", "2 -" etc.
            .str.strip()  # quita espacios extras
        )

# Ver valores únicos después de limpiar
for col in ["impact", "urgency", "priority"]:
    if col in df_limpio.columns:
        print(f"{col}: {df_limpio[col].unique()}")


impact: ['Medium' 'High' 'Low']
urgency: ['Medium' 'Low' 'High']
priority: ['Moderate' 'High' 'Low' 'Critical']


In [8]:
# --- 1) Marcar outliers en reassignment_count (>15) y contar cuántos ---
df_limpio["is_outlier_reassignment"] = df_limpio["reassignment_count"] > 15
df_limpio["is_outlier_reassignment"] = df_limpio["is_outlier_reassignment"].fillna(False).astype(bool)

eliminados_reassignment = int(df_limpio["is_outlier_reassignment"].sum())

# Eliminar registros con reassignment_count > 15
df_limpio = df_limpio.loc[~df_limpio["is_outlier_reassignment"]].copy()

# --- 2) Outliers en reopen_count por IQR (solo límite superior Q3 + 1.5*IQR) ---
serie = df_limpio["reopen_count"].dropna()
q1 = serie.quantile(0.25)
q3 = serie.quantile(0.75)
iqr = q3 - q1
upper = q3 + 1.5 * iqr

df_limpio["is_outlier_reopen"] = (df_limpio["reopen_count"] > upper)
df_limpio["is_outlier_reopen"] = df_limpio["is_outlier_reopen"].fillna(False).astype(bool)

# --- 3) Resumen de resultados ---
total_actual = len(df_limpio)
outliers_reopen_n = int(df_limpio["is_outlier_reopen"].sum())
outliers_reopen_pct = (outliers_reopen_n / total_actual * 100) if total_actual > 0 else 0.0

print("=== Resumen limpieza/outliers ===")
print(f"- Registros ELIMINADOS por reassignment_count > 15: {eliminados_reassignment}")
print(f"- Outliers en reopen_count (IQR, > Q3 + 1.5*IQR): {outliers_reopen_n} "
      f"({outliers_reopen_pct:.2f}% del DataFrame actualizado)")

# (Opcional) verificar tipos booleanos
print("\nTipos de las columnas bandera:")
print(df_limpio[["is_outlier_reassignment","is_outlier_reopen"]].dtypes)

=== Resumen limpieza/outliers ===
- Registros ELIMINADOS por reassignment_count > 15: 87
- Outliers en reopen_count (IQR, > Q3 + 1.5*IQR): 2256 (1.59% del DataFrame actualizado)

Tipos de las columnas bandera:
is_outlier_reassignment    bool
is_outlier_reopen          bool
dtype: object
