## Concrete Crack Images for Classification

---

#### Modelo Machine Learning

In [4]:
# Cargamos las librerias necesarias
import os
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from pathlib import Path
from skimage.feature import hog
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from google.colab import files

In [2]:
# CONFIGURACIÓN Y DESCARGA DE DATOS
files.upload()
# Mover credenciales (ajusta si es necesario)
if not os.path.exists('/root/.kaggle'):
    !mkdir -p ~/.kaggle
    !cp kaggle.json ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json

# Descargar Dataset: Concrete Crack Images for Classification
# ID del dataset: arunrk7/surface-crack-detection (versión limpia del original de Ozgenel)
print("Descargando dataset...")
!kaggle datasets download -d arunrk7/surface-crack-detection

# Descomprimir
print("Descomprimiendo...")
with zipfile.ZipFile('surface-crack-detection.zip', 'r') as zip_ref:
    zip_ref.extractall('./data')

print("Descarga completada")

Saving kaggle.json to kaggle.json
Descargando dataset...
Dataset URL: https://www.kaggle.com/datasets/arunrk7/surface-crack-detection
License(s): copyright-authors
Downloading surface-crack-detection.zip to /content
 58% 136M/233M [00:00<00:00, 1.42GB/s]
100% 233M/233M [00:00<00:00, 737MB/s] 
Descomprimiendo...
Descarga completada


In [5]:
# LOADING AND DATAFRAME CREATION

# Define paths
base_dir = Path('./data')
positive_dir = base_dir / 'Positive'  # Cracks
negative_dir = base_dir / 'Negative'  # No cracks

# Build the file lists (all .jpg images).
# glob() yields entries in an unspecified, filesystem-dependent order, so the
# lists are sorted; otherwise the seeded shuffle below (and every split derived
# from it) would not be reproducible across machines.
positive_images = sorted(positive_dir.glob('*.jpg'))
negative_images = sorted(negative_dir.glob('*.jpg'))

# Fail early if the archive was not extracted where expected, instead of
# silently building an empty or single-class dataset.
assert positive_images and negative_images, (
    f"No se encontraron imágenes .jpg en {positive_dir} y/o {negative_dir}"
)

# Create the DataFrame
# Label 1 = crack (Positive), label 0 = no crack (Negative)
images_df = pd.DataFrame({
    'filepath': [str(x) for x in positive_images] + [str(x) for x in negative_images],
    'label': [1] * len(positive_images) + [0] * len(negative_images)
})

# Shuffle the rows with a fixed seed
images_df = images_df.sample(frac=1, random_state=42).reset_index(drop=True)

X_paths = images_df['filepath'].values
y = images_df['label'].values

print("\n--- VARIABLES FINALES ---")
print(f"X (Rutas): {X_paths.shape}")
print(f"y (Etiquetas): {y.shape}")

# Load the images and turn each one into a HOG feature vector
def extract_features_hog(paths, size=(64, 64)):
    """Compute a HOG descriptor for every image in ``paths``.

    Each image is opened, converted to grayscale ('L' mode), resized to
    ``size`` and passed to ``hog`` with 9 orientations, 8x8-pixel cells and
    2x2-cell blocks.

    Parameters
    ----------
    paths : iterable of path-like
        Image files to process. Any iterable is accepted; it is materialised
        once so the total count can be shown in the progress messages.
    size : tuple of int, default (64, 64)
        Target size handed to ``Image.resize``.

    Returns
    -------
    numpy.ndarray
        The descriptors stacked with ``np.array`` (one row per image). For an
        empty input this is an empty 1-D array.
    """
    paths = list(paths)
    total = len(paths)  # real number of images, used in the progress message
    features = []
    for i, path in enumerate(paths):
        # HOG works better on grayscale images. The context manager releases
        # the underlying file handle as soon as the pixels have been read.
        with Image.open(path) as img:
            img_array = np.array(img.convert('L').resize(size))
        # Extract the HOG descriptor
        # orientations=9 is the standard choice
        # pixels_per_cell=(8, 8) to capture the fine detail of the cracks
        fd = hog(img_array, orientations=9, pixels_per_cell=(8, 8),
                 cells_per_block=(2, 2), visualize=False)
        features.append(fd)
        if i % 10000 == 0:
            print(f"Procesadas {i}/{total} imágenes...")
    return np.array(features)

# Feature matrix for the linear model: HOG descriptors computed from the
# grayscale version of every image
X_final = extract_features_hog(X_paths)

print(f"X_final shape: {X_final.shape}")
# The values reported here are HOG descriptor components, not pixel
# intensities, so the label says HOG rather than pixel.
print("Min valor HOG:", X_final.min(), "Max valor HOG:", X_final.max())

# Sanity check: one feature row per label
assert len(X_final) == len(y), "Error: X e y no tienen el mismo tamaño"


--- VARIABLES FINALES ---
X (Rutas): (40000,)
y (Etiquetas): (40000,)
Procesadas 0/40000 imágenes...
Procesadas 10000/40000 imágenes...
Procesadas 20000/40000 imágenes...
Procesadas 30000/40000 imágenes...
X_final shape: (40000, 1764)
Min valor pixel: 0.0 Max valor pixel: 0.9999999987499999


In [6]:
# 1. Split the data (60% train, 20% validation, 20% test)
# Step one: hold out 20% of all samples as the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_final,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Step two: 25% of the remaining 80% becomes the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=42,
    stratify=y_train_val,
)
print(f"Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")

# 2. Training (linear SVM)
# dual=False: scikit-learn's documentation recommends it when
# n_samples > n_features, which holds for this training set.
model_svm = LinearSVC(dual=False, random_state=42, C=1.0)
model_svm.fit(X_train, y_train)

# 3. Number of parameters: every learned coefficient plus the intercept
n_params = model_svm.coef_.size + model_svm.intercept_.size
print(f"\n Numero de parametros del Modelo Machine Learning: {n_params}")

# 4. Evaluation: accuracy and per-class metrics for every split
datasets = [
    ("Train", X_train, y_train),
    ("Validacion", X_val, y_val),
    ("Test", X_test, y_test),
]

for name, X_set, y_true in datasets:
    y_pred = model_svm.predict(X_set)
    acc = accuracy_score(y_true, y_pred)
    # Build the per-class report first, then emit the three lines in one call
    report = classification_report(y_true, y_pred, target_names=['Sano (0)', 'Grieta (1)'])
    print(f"Métricas para: {name}", f"Accuracy: {acc:.4f}", report, sep="\n")

Train: 24000 | Val: 8000 | Test: 8000

 Numero de parametros del Modelo Machine Leaning: 1765
Métricas para: Train
Accuracy: 0.9912
              precision    recall  f1-score   support

    Sano (0)       0.99      0.99      0.99     12000
  Grieta (1)       0.99      0.99      0.99     12000

    accuracy                           0.99     24000
   macro avg       0.99      0.99      0.99     24000
weighted avg       0.99      0.99      0.99     24000

Métricas para: Validacion
Accuracy: 0.9771
              precision    recall  f1-score   support

    Sano (0)       0.97      0.98      0.98      4000
  Grieta (1)       0.98      0.97      0.98      4000

    accuracy                           0.98      8000
   macro avg       0.98      0.98      0.98      8000
weighted avg       0.98      0.98      0.98      8000

Métricas para: Test
Accuracy: 0.9760
              precision    recall  f1-score   support

    Sano (0)       0.97      0.98      0.98      4000
  Grieta (1)       0.98  