<a href="https://colab.research.google.com/github/somayeh1404/cod/blob/main/CNN_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q kaggle

from google.colab import files
uploaded = files.upload()

!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


!kaggle datasets download -d quynhlecl/lung-cancer-x-ray


!unzip Chest X-Ray Images (lung cancer).zip

!ls

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/quynhlecl/lung-cancer-x-ray
License(s): MIT
Downloading lung-cancer-x-ray.zip to /content
 99% 2.26G/2.28G [00:20<00:00, 121MB/s]
100% 2.28G/2.28G [00:20<00:00, 120MB/s]
/bin/bash: -c: line 1: syntax error near unexpected token `('
/bin/bash: -c: line 1: `unzip Chest X-Ray Images (lung cancer).zip'
lung-cancer-x-ray.zip  sample_data


In [None]:
!pip install torch torchvision



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, Subset
import os
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
import zipfile


# --- Hyperparameters ---------------------------------------------------------
IMG_SIZE = 224          # images are resized to IMG_SIZE x IMG_SIZE
BATCH_SIZE = 32
NUM_EPOCHS = 10
PATIENCE = 5            # epochs without validation improvement before early stopping
LEARNING_RATE = 0.001
WEIGHT_DECAY = 1e-4
DROPOUT = 0.5

# Per-channel normalization statistics shared by both pipelines.
# NOTE(review): these look like the usual ImageNet mean/std used with pretrained
# torchvision backbones - confirm if the backbone changes.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: resize, light augmentation, then tensor conversion + normalization.
train_transform = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(15),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ]
)

# Evaluation pipeline: the same preprocessing without any random augmentation.
val_transform = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ]
)


def setup_dataset(base_path='/content/lung-cancer-x-ray',
                  zip_path='/content/lung-cancer-x-ray.zip',
                  dataset_slug='quynhlecl/lung-cancer-x-ray'):
    """Ensure the dataset is extracted at ``base_path`` and return that path.

    If ``base_path`` already exists it is returned untouched. Otherwise the
    archive at ``zip_path`` is downloaded with the Kaggle CLI (only when it is
    not already on disk), extracted next to itself, and the extracted top-level
    folder is renamed to ``base_path``.

    Args:
        base_path: Directory the dataset should end up in.
        zip_path: Location of the dataset archive. The Kaggle CLI is asked to
            download into the directory containing this path.
        dataset_slug: Kaggle ``owner/dataset`` identifier to download.

    Returns:
        ``base_path``, which is guaranteed to exist on return.

    Raises:
        FileNotFoundError: If the archive is missing after the download
            attempt, or if no dataset folder can be located after extraction.
    """
    import subprocess  # local import: only needed when a download is required

    if os.path.exists(base_path):
        return base_path

    content_dir = os.path.dirname(zip_path) or '.'
    os.makedirs(content_dir, exist_ok=True)

    # Skip the download when the archive is already on disk.
    if not os.path.exists(zip_path):
        print("Downloading dataset...")
        try:
            # Argument list (no shell) so paths with spaces cannot break the command.
            result = subprocess.run(
                ['kaggle', 'datasets', 'download', '-d', dataset_slug, '-p', content_dir],
                check=False,
            )
            if result.returncode != 0:
                print(f"kaggle exited with status {result.returncode}")
        except OSError as exc:
            # Raised e.g. when the kaggle executable is not installed.
            print(f"Could not run the kaggle CLI: {exc}")

    if not os.path.exists(zip_path):
        raise FileNotFoundError("Dataset zip file not found after download attempt")

    print("Extracting dataset...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(content_dir)

    # Folder names the archive is expected to extract to, in priority order.
    extracted_paths = [
        base_path,
        os.path.join(content_dir, 'Lung Cancer X-ray'),
        os.path.join(content_dir, 'dataset'),
    ]
    for path in extracted_paths:
        if os.path.isdir(path):
            print(f"Found dataset at: {path}")
            # Normalize every alternative name to base_path so the returned
            # path always exists.
            if path != base_path:
                os.rename(path, base_path)
            return base_path

    # None of the expected names matched: fall back to any extracted directory
    # whose name mentions "lung" or "cancer".
    print("Checking content directory structure...")
    for item in sorted(os.listdir(content_dir)):
        item_path = os.path.join(content_dir, item)
        lowered = item.lower()
        if os.path.isdir(item_path) and ('lung' in lowered or 'cancer' in lowered):
            print(f"Found potential dataset directory: {item_path}")
            os.rename(item_path, base_path)
            return base_path

    raise FileNotFoundError(
        f"Could not locate the extracted dataset directory in {content_dir}"
    )

# Locate the train/ and test/ folders and build the ImageFolder datasets.
# The first attempt assumes <base_path>/train and <base_path>/test (or the same
# one level deeper); if anything fails, a recursive search is used as a fallback.
try:
    base_path = setup_dataset()
    train_path = os.path.join(base_path, 'train')
    test_path = os.path.join(base_path, 'test')

    # Check that the train folder exists directly under the dataset root.
    if not os.path.exists(train_path):
        print(f"Train path not found: {train_path}")
        print("Available directories in base path:")
        for item in os.listdir(base_path):
            print(f"  - {item}")

        # The layout differs from the expected one: look one level down for
        # folders that contain the train/test splits.
        for item in os.listdir(base_path):
            item_path = os.path.join(base_path, item)
            if os.path.isdir(item_path):
                subitems = os.listdir(item_path)
                if 'train' in subitems or 'test' in subitems:
                    print(f"Found structured dataset in: {item_path}")
                    train_path = os.path.join(item_path, 'train')
                    test_path = os.path.join(item_path, 'test')
                    break
                elif any('train' in subitem.lower() for subitem in subitems):
                    # Split folders exist under a different spelling; pick them by substring.
                    for subitem in subitems:
                        subitem_path = os.path.join(item_path, subitem)
                        if 'train' in subitem.lower() and os.path.isdir(subitem_path):
                            train_path = subitem_path
                        elif 'test' in subitem.lower() and os.path.isdir(subitem_path):
                            test_path = subitem_path

    print(f"Final train path: {train_path}")
    print(f"Final test path: {test_path}")

    # Load the datasets.
    train_dataset = datasets.ImageFolder(root=train_path, transform=train_transform)
    test_dataset = datasets.ImageFolder(root=test_path, transform=val_transform)

    print(f"Train classes: {train_dataset.classes}")
    print(f"Test classes: {test_dataset.classes}")
    print(f"Train dataset size: {len(train_dataset)}")
    print(f"Test dataset size: {len(test_dataset)}")

except Exception as e:
    print(f"Error setting up dataset: {e}")
    print("Trying alternative dataset structure...")

    # Fallback: search every location the dataset might have been extracted to.
    possible_paths = [
        '/content/lung-cancer-x-ray',
        '/content/Lung Cancer X-ray',
        '/content/dataset',
        '/content'
    ]

    # Discard paths left over from the failed attempt above; they may point at
    # directories that do not exist or are not valid ImageFolder roots.
    train_path = None
    test_path = None

    for path in possible_paths:
        if train_path and test_path:
            break
        if not os.path.exists(path):
            continue
        print(f"Checking {path}...")
        # The third tuple element is ignored on purpose: binding it to `files`
        # would shadow the google.colab `files` module imported earlier.
        for root, dirs, _filenames in os.walk(path):
            dirs.sort()  # deterministic traversal order
            for dir_name in dirs:
                # Keep the first match for each split instead of letting later
                # (possibly unrelated) directories overwrite it.
                if train_path is None and 'train' in dir_name.lower():
                    train_path = os.path.join(root, dir_name)
                    print(f"Found train directory: {train_path}")
                if test_path is None and 'test' in dir_name.lower():
                    test_path = os.path.join(root, dir_name)
                    print(f"Found test directory: {test_path}")
            if train_path and test_path:
                break

    if train_path and test_path:
        train_dataset = datasets.ImageFolder(root=train_path, transform=train_transform)
        test_dataset = datasets.ImageFolder(root=test_path, transform=val_transform)
    else:
        raise FileNotFoundError("Could not find train and test directories in any expected location")



# ImageFolder keeps one integer class label per sample in `.targets`, so the
# labels can be read directly instead of iterating the dataset (which would
# decode and augment every image just to discard it).
targets = np.array(train_dataset.targets)

# Stratified 80/20 split of the training folder into train / validation indices.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(sss.split(np.zeros(len(targets)), targets))

train_subset = Subset(train_dataset, train_idx.tolist())

# Validation must not be augmented: build a second view of the same folder with
# the deterministic val_transform. ImageFolder lists files in sorted order, so
# the indices computed above address the same samples in both views.
val_base_dataset = datasets.ImageFolder(root=train_dataset.root, transform=val_transform)
val_subset = Subset(val_base_dataset, val_idx.tolist())


# Inverse-frequency class weights, computed on the training portion only.
class_counts = np.bincount(targets[train_idx])
class_weights = 1. / class_counts
class_weights = torch.tensor(class_weights, dtype=torch.float32)


train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)


# ResNet-18 initialized from the ImageNet checkpoint. The `weights=` argument
# replaces the deprecated `pretrained=True` flag and selects the same weights.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# Replace the classification head with dropout + a linear layer sized to the
# number of classes found in the training folder (instead of a hard-coded 2).
num_ftrs = model.fc.in_features
num_classes = len(train_dataset.classes)
model.fc = nn.Sequential(
    nn.Dropout(DROPOUT),
    nn.Linear(num_ftrs, num_classes)
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


# Class-weighted loss to counter class imbalance in the training split.
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# mode='max' because the scheduler is stepped with validation accuracy.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=2)


def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one full pass over ``loader``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise it is put in eval mode and gradient tracking
    is disabled.

    Returns:
        ``(mean_batch_loss, accuracy_percent)`` where the loss is the average
        of the per-batch losses and the accuracy is computed over all samples.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(training):
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            if training:
                optimizer.zero_grad()

            outputs = model(images)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            predicted = outputs.argmax(dim=1)
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

    return loss_sum / len(loader), 100 * n_correct / n_seen


# Start below any reachable accuracy so the first epoch always writes a
# checkpoint; the evaluation step then never loads a missing or stale file.
best_val_accuracy = float('-inf')
no_improve = 0

for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
    val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

    # The scheduler was created with mode='max', so it tracks validation accuracy.
    scheduler.step(val_acc)

    print(f'\nEpoch {epoch+1}/{NUM_EPOCHS}:')
    print(f'Train Loss: {train_loss:.4f} | Acc: {train_acc:.2f}%')
    print(f'Val Loss: {val_loss:.4f} | Acc: {val_acc:.2f}%')

    # Keep the weights of the best epoch; stop after PATIENCE epochs without improvement.
    if val_acc > best_val_accuracy:
        best_val_accuracy = val_acc
        torch.save(model.state_dict(), 'best_model.pth')
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= PATIENCE:
            print(f'\nEarly stopping at epoch {epoch+1}')
            break


# Restore the best checkpoint saved during training. map_location makes the
# load work regardless of which device the checkpoint was written from.
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.eval()

all_labels = []
all_preds = []
test_correct = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        predicted = outputs.argmax(dim=1)
        test_correct += (predicted == labels).sum().item()
        all_labels.extend(labels.cpu().tolist())
        all_preds.extend(predicted.cpu().tolist())

test_acc = 100 * test_correct / len(test_dataset)
print(f'\nTest Accuracy: {test_acc:.2f}%')

# Pass the class ids explicitly so the report rows always line up with
# test_dataset.classes, even if a class is absent from labels and predictions.
class_ids = list(range(len(test_dataset.classes)))

print('\nClassification Report:')
print(classification_report(all_labels, all_preds, labels=class_ids, target_names=test_dataset.classes))

print('\nConfusion Matrix:')
print(confusion_matrix(all_labels, all_preds, labels=class_ids))

Downloading dataset...
Extracting dataset...
Checking content directory structure...
Found potential dataset directory: /content/chest_xray_lung
Final train path: /content/lung-cancer-x-ray/train
Final test path: /content/lung-cancer-x-ray/test
Train classes: ['Cancer', 'NORMAL']
Test classes: ['Cancer', 'NORMAL']
Train dataset size: 5216
Test dataset size: 624




Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 154MB/s]



Epoch 1/10:
Train Loss: 0.1803 | Acc: 93.17%
Val Loss: 0.2507 | Acc: 92.24%

Epoch 2/10:
Train Loss: 0.1230 | Acc: 95.78%
Val Loss: 0.1612 | Acc: 96.17%

Epoch 3/10:
Train Loss: 0.1062 | Acc: 96.09%
Val Loss: 0.0595 | Acc: 97.61%

Epoch 4/10:
Train Loss: 0.0949 | Acc: 96.38%
Val Loss: 0.0633 | Acc: 97.61%

Epoch 5/10:
Train Loss: 0.0891 | Acc: 96.62%
Val Loss: 0.0752 | Acc: 96.46%

Epoch 6/10:
Train Loss: 0.0912 | Acc: 96.50%
Val Loss: 0.1552 | Acc: 93.30%

Epoch 7/10:
Train Loss: 0.0800 | Acc: 96.88%
Val Loss: 0.0520 | Acc: 97.80%

Epoch 8/10:
Train Loss: 0.0497 | Acc: 98.08%
Val Loss: 0.0388 | Acc: 98.08%

Epoch 9/10:
Train Loss: 0.0422 | Acc: 98.54%
Val Loss: 0.0432 | Acc: 98.37%

Epoch 10/10:
Train Loss: 0.0521 | Acc: 98.08%
Val Loss: 0.0476 | Acc: 98.37%

Test Accuracy: 87.02%

Classification Report:
              precision    recall  f1-score   support

      Cancer       0.83      0.99      0.91       390
      NORMAL       0.98      0.67      0.79       234

    accuracy      