## Concrete Crack Images for Classification

---

#### Modelo Lineal

In [15]:
# Cargamos las librerias necesarias
import os
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from google.colab import files

In [2]:
# CONFIGURACIÓN Y DESCARGA DE DATOS
files.upload()
# Mover credenciales (ajusta si es necesario)
if not os.path.exists('/root/.kaggle'):
    !mkdir -p ~/.kaggle
    !cp kaggle.json ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json

# Descargar Dataset: Concrete Crack Images for Classification
# ID del dataset: arunrk7/surface-crack-detection (versión limpia del original de Ozgenel)
print("Descargando dataset...")
!kaggle datasets download -d arunrk7/surface-crack-detection

# Descomprimir
print("Descomprimiendo...")
with zipfile.ZipFile('surface-crack-detection.zip', 'r') as zip_ref:
    zip_ref.extractall('./data')

print("Descarga completada")

Saving kaggle.json to kaggle (1).json
Descargando dataset...
Dataset URL: https://www.kaggle.com/datasets/arunrk7/surface-crack-detection
License(s): copyright-authors
surface-crack-detection.zip: Skipping, found more recently modified local copy (use --force to force download)
Descomprimiendo...
Descarga completada


In [4]:
# CARGA Y CREACIÓN DEL DATAFRAME

# Definir rutas
base_dir = Path('./data')
positive_dir = base_dir / 'Positive' # Grietas
negative_dir = base_dir / 'Negative' # Sin Grietas

# Generar lista de archivos
# Buscamos todas las imágenes .jpg
positive_images = list(positive_dir.glob('*.jpg'))
negative_images = list(negative_dir.glob('*.jpg'))

# Crear DataFrame
# Label 1 = Grieta (Positive), Label 0 = Sin Grieta (Negative)
images_df = pd.DataFrame({
    'filepath': [str(x) for x in positive_images] + [str(x) for x in negative_images],
    'label': [1] * len(positive_images) + [0] * len(negative_images)
})

# Mezclar datos (Shuffle)
images_df = images_df.sample(frac=1, random_state=42).reset_index(drop=True)

X_paths = images_df['filepath'].values
y = images_df['label'].values

print(f"\n--- VARIABLES FINALES ---")
print(f"X (Rutas): {X_paths.shape}")
print(f"y (Etiquetas): {y.shape}")

# Cargamos las imagenes
def load_linear_data(paths, size=(64, 64)):
    X = np.empty((len(paths), size[0] * size[1]), dtype=np.float32)
    for i, path in enumerate(paths):
        # Convertimos a escala de grises ('L') y redimensionamos
        img = Image.open(path).convert('L').resize(size)
        X[i] = np.array(img).flatten() / 255.0
        if i % 10000 == 0: print(f"Cargadas {i} imágenes...")
    return X

# Para el Modelo Lineal usamos escala de grises
X_final = load_linear_data(X_paths)

print(f"X_final shape: {X_final.shape}")
print("Min valor pixel:", X_final.min(), "Max valor pixel:", X_final.max())

# Verificación
assert len(X_final) == len(y), "Error: X e y no tienen el mismo tamaño"


--- VARIABLES FINALES ---
X (Rutas): (40000,)
y (Etiquetas): (40000,)
Cargadas 0 imágenes...
Cargadas 10000 imágenes...
Cargadas 20000 imágenes...
Cargadas 30000 imágenes...
X_final shape: (40000, 4096)
Min valor pixel: 0.019607844 Max valor pixel: 1.0


Entrenamos un modelo de Regresión Logística

In [17]:
# 1. Dividimos los datos (60% train, 20% val, 20% test)
# Primero separamos el 20% para test
X_train_val, X_test, y_train_val, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42, stratify=y)
# De lo que queda, separamos el 25% para Validación
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)
print(f"Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")

# 2. Estandarizamos los datos
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# 3. Entrenamiento
model_lr = LogisticRegression(max_iter=500, solver='lbfgs', n_jobs=-1, C=0.1, tol=1e-3)
model_lr.fit(X_train, y_train)

# 4. Numero de parametros
n_params = model_lr.coef_.size + model_lr.intercept_.size
print(f"\n Numero de parametros del Modelo Lineal: {n_params}")

# 5. Evaluacion
datasets = [
    ("Train", X_train, y_train),
    ("Validacion", X_val, y_val),
    ("Test", X_test, y_test)
]

for name, X_set, y_true in datasets:
    y_pred = model_lr.predict(X_set)
    acc = accuracy_score(y_true, y_pred)
    print(f"Métricas para: {name}")
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_true, y_pred, target_names=['Sano (0)', 'Grieta (1)']))


Train: 24000 | Val: 8000 | Test: 8000

 Numero de parametros del Modelo Lineal: 4097
Métricas para: Train
Accuracy: 0.9122
              precision    recall  f1-score   support

    Sano (0)       0.88      0.96      0.92     12000
  Grieta (1)       0.95      0.87      0.91     12000

    accuracy                           0.91     24000
   macro avg       0.92      0.91      0.91     24000
weighted avg       0.92      0.91      0.91     24000

Métricas para: Validacion
Accuracy: 0.9114
              precision    recall  f1-score   support

    Sano (0)       0.88      0.96      0.92      4000
  Grieta (1)       0.95      0.86      0.91      4000

    accuracy                           0.91      8000
   macro avg       0.92      0.91      0.91      8000
weighted avg       0.92      0.91      0.91      8000

Métricas para: Test
Accuracy: 0.9070
              precision    recall  f1-score   support

    Sano (0)       0.87      0.96      0.91      4000
  Grieta (1)       0.96      0.85 