In [16]:
import pandas as pd
import numpy as np

nazwa_pliku = 'AirQualityUCI.csv'
# Output path kept in one place so that the confirmation message always
# names the file that was actually written.
plik_wynikowy = "AirQualityUCI_outcome.xlsx"

try:

    df = pd.read_csv(nazwa_pliku, sep=';', decimal=',')

    ile_pustych = df.isna().all(axis=1).sum()
    print(f"Liczba całkowicie pustych wierszy: {ile_pustych}")
    # Drop rows in which every cell is NaN, then cut the last two columns
    # (presumably empty artefacts of trailing ';' separators - verify against the file).
    # .copy() detaches the result so later column assignments never target a slice.
    df = df.dropna(how='all').iloc[:, :-2].copy()

    print("\n--- Informacje o wymiarach ---")
    wiersze, kolumny = df.shape
    print(f"Liczba wierszy: {wiersze}")
    print(f"Liczba kolumn: {kolumny}")

    # A reading of -200 is a sensor error, so it is treated as a missing value
    df = df.replace(-200, np.nan)
    # Fill the gaps in numeric columns by linear interpolation; 'both' also
    # fills gaps at the very beginning and end of a column
    cols_numeric = df.select_dtypes(include=[np.number]).columns
    df[cols_numeric] = df[cols_numeric].interpolate(method='linear', limit_direction='both')

    liczba_wierszy_z_nan = df.isna().any(axis=1).sum()
    print(f"Liczba wierszy z brakującymi danymi: {liczba_wierszy_z_nan}")

    print("\n--- Pierwsze 5 wierszy ---")
    print(df.head())

    print("\n--- Ciągłość czasu ---")
    # Parse the timestamps once into a standalone Series (the file stores the
    # time as 'HH.MM.SS'); it is reused below for the continuity check and for
    # the chronological sort, so no helper columns have to be added to df.
    czas = df['Time'].astype(str).str.replace('.', ':', regex=False)
    znaczniki_czasu = pd.to_datetime(df['Date'] + ' ' + czas, dayfirst=True)
    # Full hourly range between the first and the last reading. A Timedelta is
    # used instead of the 'H' alias, which is deprecated in recent pandas.
    full_range = pd.date_range(
        start=znaczniki_czasu.min(),
        end=znaczniki_czasu.max(),
        freq=pd.Timedelta(hours=1),
    )
    # Timestamps that should exist but are absent from the dataset
    missing_dates = set(full_range) - set(znaczniki_czasu)

    print(f"\n--- Brakujące odczyty (łącznie: {len(missing_dates)}) ---")
    if missing_dates:
        print(f"Liczba brakujących wpisów: {len(missing_dates)}")
        print("Wszystkie brakujące daty:")
        print(*sorted(missing_dates), sep='\n')
    else:
        print("Brak brakujących dni/godzin w ciągłości odczytów.")

    print("\n--- Ujemne wartości ---")
    maska_ujemne = (df[cols_numeric] < 0).any(axis=1)
    negative_rows = df[maska_ujemne]

    print(f"\n--- Wiersze z ujemnymi wartościami: {len(negative_rows)} ---")
    if not negative_rows.empty:
        print(negative_rows[cols_numeric])
        print("\nJeśli ujemne wartości są tylko w kolumnie T (Temperatura w stopniach Celcjusza), to jest to poprawnie.")
    else:
        print("Brak ujemnych wartości po interpolacji.")

    # No helper columns were added to df, so there is nothing to remove here
    print(f"Aktualna liczba kolumn: {df.shape[1]}")

    print("\n--- Podział na zbiory train, validation i test ---")
    # Sort chronologically through a temporary column that is dropped right away
    df = (
        df.assign(Temp_Datetime=znaczniki_czasu)
          .sort_values(by='Temp_Datetime')
          .drop(columns=['Temp_Datetime'])
    )

    n = len(df)
    train_end = int(0.70 * n)
    val_end = int(0.85 * n)

    # Build the 'Set' column: 'test' is the default and stays on the last 15%
    df['Set'] = 'test'
    set_col = df.columns.get_loc('Set')
    df.iloc[:train_end, set_col] = 'train'              # first 70%
    df.iloc[train_end:val_end, set_col] = 'validation'  # next 15%
    print(df['Set'].value_counts(sort=False))

    df.to_excel(plik_wynikowy, index=False)
    print(f"\n--> Zapisano gotowy plik: {plik_wynikowy}")

except FileNotFoundError:
    print("Nie znaleziono pliku. Sprawdź czy nazwa i ścieżka są poprawne.")
except Exception as e:
    # NOTE(review): this reports only the message and hides the traceback
    print(f"Wystąpił błąd: {e}")

Liczba całkowicie pustych wierszy: 114

--- Informacje o wymiarach ---
Liczba wierszy: 9357
Liczba kolumn: 15
Liczba wierszy z brakującymi danymi: 0

--- Pierwsze 5 wierszy ---
         Date      Time  CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  \
0  10/03/2004  18.00.00     2.6       1360.0     150.0      11.9   
1  10/03/2004  19.00.00     2.0       1292.0     112.0       9.4   
2  10/03/2004  20.00.00     2.2       1402.0      88.0       9.0   
3  10/03/2004  21.00.00     2.2       1376.0      80.0       9.2   
4  10/03/2004  22.00.00     1.6       1272.0      51.0       6.5   

   PT08.S2(NMHC)  NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)  \
0         1046.0    166.0        1056.0    113.0        1692.0       1268.0   
1          955.0    103.0        1174.0     92.0        1559.0        972.0   
2          939.0    131.0        1140.0    114.0        1555.0       1074.0   
3          948.0    172.0        1092.0    122.0        1584.0       1203.0   
4          836.0   

## Nasza sieć

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset that already carries the 'Set' assignment column
data = pd.read_excel("AirQualityUCI_outcome.xlsx")

target = 'CO(GT)'
features = [
    'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)',
    'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH',
]

# Slice the frame into the three subsets named in the 'Set' column
train_rows = data.loc[data['Set'] == 'train']
validation_rows = data.loc[data['Set'] == 'validation']
test_rows = data.loc[data['Set'] == 'test']

X_train = train_rows[features].values
X_validation = validation_rows[features].values
X_test = test_rows[features].values

# Targets are reshaped to column vectors (n, 1)
y_train = train_rows[target].values.reshape(-1, 1)
y_validation = validation_rows[target].values.reshape(-1, 1)
y_test = test_rows[target].values.reshape(-1, 1)

# Standardisation: both scalers are fitted on the training subset only and
# then applied unchanged to the validation and test subsets
scaler_X = StandardScaler().fit(X_train)
X_train = scaler_X.transform(X_train)
X_validation = scaler_X.transform(X_validation)
X_test = scaler_X.transform(X_test)

scaler_y = StandardScaler().fit(y_train)
y_train = scaler_y.transform(y_train)
y_validation = scaler_y.transform(y_validation)
y_test = scaler_y.transform(y_test)

print(f"Train: X={X_train.shape}, y={y_train.shape}")
print(f"Validation: X={X_validation.shape}, y={y_validation.shape}")
print(f"Test: X={X_test.shape}, y={y_test.shape}")


Train: X=(6549, 12), y=(6549, 1)
Validation: X=(1404, 12), y=(1404, 1)
Test: X=(1404, 12), y=(1404, 1)


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

def relu(x, derivative=False):
    """Rectified linear unit, or its derivative mask.

    Args:
        x: NumPy array of values.
        derivative: When True, return the float mask ``x > 0`` instead of
            the activation.

    Returns:
        ``max(0, x)`` element-wise, or a float array holding 1.0 where
        ``x > 0`` and 0.0 elsewhere when ``derivative`` is True.
    """
    return (x > 0).astype(float) if derivative else np.maximum(0, x)

# Inicjalizacja wag sieci dla wielu warstw ukrytych
# def initialize_weights(input_size, hidden_layers_sizes, output_size):
#     weights = []
#     layer_sizes = [input_size] + hidden_layers_sizes + [output_size]
#     for i in range(len(layer_sizes) - 1):
#         weights.append(2 * np.random.random((layer_sizes[i], layer_sizes[i+1])) - 1)
#     return weights

def initialize_weights(input_size, hidden_layers_sizes, output_size):
    """Create one random weight matrix per pair of consecutive layers.

    Each matrix has shape ``(fan_in, fan_out)`` and is drawn from a standard
    normal distribution scaled by ``sqrt(2 / fan_in)``.

    Args:
        input_size: Number of input features.
        hidden_layers_sizes: List with the width of every hidden layer.
        output_size: Number of output units.

    Returns:
        List of NumPy arrays, ordered from the input layer to the output layer.
    """
    layer_sizes = [input_size] + hidden_layers_sizes + [output_size]
    return [
        np.random.randn(fan_in, fan_out) * np.sqrt(2 / fan_in)
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:])
    ]

# Random split of the data into training, validation and test subsets
def split_data(data, train_ratio=0.6, validation_ratio=0.2):
    """Shuffle the observations and split them into three consecutive parts.

    The shuffle is performed on a copy, so the caller's ``data`` keeps its
    original order. The random draws come from NumPy's global random state.

    Args:
        data: Sequence of observations (e.g. a NumPy array whose first axis
            indexes observations, or a list).
        train_ratio: Fraction of observations assigned to the training part.
        validation_ratio: Fraction assigned to the validation part.

    Returns:
        Tuple ``(train_data, validation_data, test_data)``; the test part
        receives whatever remains after the first two parts.
    """
    # Work on a copy: np.random.shuffle reorders its argument in place
    shuffled = data.copy() if hasattr(data, "copy") else list(data)
    np.random.shuffle(shuffled)

    train_size = int(len(shuffled) * train_ratio)
    validation_size = int(len(shuffled) * validation_ratio)
    validation_end = train_size + validation_size

    train_data = shuffled[:train_size]                    # first `train_size` observations
    validation_data = shuffled[train_size:validation_end] # the next `validation_size` observations
    test_data = shuffled[validation_end:]                 # everything that is left

    return train_data, validation_data, test_data

# Learning-rate adaptation based on the change of the loss
def adjust_learning_rate(learning_rate, mse, previous_mse, learning_rate_adjust, threshold=1e-6):
    """Return the learning rate adapted to the latest change of the loss.

    The rate is multiplied by 1.05 when the loss decreased and by 0.7
    otherwise. If the loss changed by less than ``threshold`` in absolute
    terms, the result is additionally multiplied by ``learning_rate_adjust``.

    Args:
        learning_rate: Current learning rate.
        mse: Loss of the current epoch.
        previous_mse: Loss of the previous epoch.
        learning_rate_adjust: Extra multiplier applied when the loss stalls.
        threshold: Absolute loss change below which the loss counts as stalled.

    Returns:
        The updated learning rate.
    """
    improved = mse < previous_mse
    stalled = abs(mse - previous_mse) < threshold

    scaled = learning_rate * (1.05 if improved else 0.7)
    return scaled * learning_rate_adjust if stalled else scaled

# Training of a network with any number of hidden layers
def train(X, y, learning_rate, learning_rate_adjust, epochs,
          hidden_layers_sizes, optimizer, momentum):
    """Train a fully connected network (weights only, no bias terms).

    Hidden layers use ReLU and the output layer is linear. Every epoch
    processes the whole of ``X`` at once, adapts the learning rate with
    ``adjust_learning_rate`` and then updates all weight matrices.

    Args:
        X: 2-D array of inputs, one row per sample.
        y: 2-D array of targets, one row per sample.
        learning_rate: Initial learning rate.
        learning_rate_adjust: Multiplier forwarded to ``adjust_learning_rate``.
        epochs: Number of full passes over the data.
        hidden_layers_sizes: List with the width of every hidden layer.
        optimizer: ``'gd'`` for plain gradient descent or ``'momentum'`` for
            gradient descent with momentum.
        momentum: Momentum coefficient; only used when ``optimizer`` is
            ``'momentum'``.

    Returns:
        List of trained weight matrices, ordered from input to output.

    Raises:
        ValueError: If ``optimizer`` is not one of the supported names.
    """
    # Fail loudly: an unknown optimizer would otherwise skip every update and
    # silently return the randomly initialised weights.
    if optimizer not in ('gd', 'momentum'):
        raise ValueError(
            f"Unknown optimizer {optimizer!r}; expected 'gd' or 'momentum'."
        )

    input_size = X.shape[1]
    output_size = y.shape[1]
    weights = initialize_weights(input_size, hidden_layers_sizes, output_size)
    velocities = [np.zeros_like(w) for w in weights]
    prev_loss = np.inf
    last_layer = len(weights) - 1

    for epoch in range(epochs):
        # Forward pass: activations[i] is the input of layer i
        activations = [X]
        for i, w in enumerate(weights):
            z = np.dot(activations[-1], w)
            if i == last_layer:
                activations.append(z)   # linear output, no softmax
            else:
                activations.append(relu(z))
        predicted_output = activations[-1]

        # Loss (mean squared error) and learning-rate adaptation
        error = predicted_output - y
        loss = np.mean(error ** 2)

        learning_rate = adjust_learning_rate(
            learning_rate,
            loss,
            prev_loss,
            learning_rate_adjust
        )
        prev_loss = loss

        # Backpropagation. The raw error is used as the output delta, so the
        # constant factor 2 of the MSE derivative is left out.
        deltas = [error]
        for i in range(last_layer, 0, -1):
            # The ReLU derivative is taken on the post-activation values; they
            # are positive exactly where the pre-activation values are.
            delta = deltas[-1].dot(weights[i].T) * relu(activations[i], derivative=True)
            deltas.append(delta)

        deltas.reverse()  # deltas[i] now belongs to weights[i]

        # Weight update with the gradient averaged over all samples
        for i in range(len(weights)):
            grad = activations[i].T.dot(deltas[i]) / X.shape[0]

            if optimizer == 'gd':
                weights[i] -= learning_rate * grad
            else:  # 'momentum'
                velocities[i] = momentum * velocities[i] - learning_rate * grad
                weights[i] += velocities[i]

    return weights


def predict(X, weights):
    """Run a forward pass through the network.

    Args:
        X: Array of inputs, one row per sample.
        weights: List of weight matrices, ordered from input to output.

    Returns:
        The network output: ReLU is applied after every layer except the
        last one, whose output stays linear. With an empty ``weights`` list
        ``X`` is returned unchanged.
    """
    last_index = len(weights) - 1
    output = X
    for index, layer_weights in enumerate(weights):
        pre_activation = np.dot(output, layer_weights)
        # Only hidden layers are rectified; the final layer is linear
        output = pre_activation if index == last_index else relu(pre_activation)
    return output

def mae_np(y_true, y_pred):
    """Return the mean absolute error between two NumPy arrays."""
    absolute_errors = np.abs(y_true - y_pred)
    return np.mean(absolute_errors)

# Network hyperparameters; every list holds the values to be tried
learning_rates = [0.01]
learning_rate_adjusts = [0.0005]
epochses = [1000]
repeat = 3

# 'gd'       -> plain gradient descent
# 'momentum' -> gradient descent with momentum
optimizers = ['gd', 'momentum']
momentums = [0.0, 0.9]

# Hidden-layer layouts: one layer of 10 units, and two layers of 10 units each
hidden_layers_sizes_list = [[10], [10, 10]]

def calculate_regression_metrics(y_true, y_pred):
    """Compute the regression metrics for a set of predictions.

    Args:
        y_true: Reference target values.
        y_pred: Predicted values.

    Returns:
        Tuple ``(mse, mae, r2)``: mean squared error, mean absolute error and
        the R^2 score, in that order.
    """
    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

# Fixed seed: weight initialisation draws from NumPy's global random state,
# so seeding here makes the whole results table reproducible between runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# One record per trained configuration
results = []

# Grid search over all hyperparameter combinations, repeated `repeat` times.
# NOTE(review): optimizers x momentums is a full cross product, but train()
# ignores `momentum` for 'gd', and 'momentum' with 0.0 performs the same
# update as 'gd' - those rows are effectively extra repeats of plain GD.
for hidden_layers_sizes in hidden_layers_sizes_list:
    for r in range(1, repeat + 1):
        for lr in learning_rates:
            for lr_adj in learning_rate_adjusts:
                for epochs in epochses:
                    for optimizer in optimizers:
                        for momentum in momentums:
                            trained_weights = train(
                                X_train, y_train, lr, lr_adj, epochs, hidden_layers_sizes, optimizer=optimizer, momentum=momentum
                            )
                            # Train/validation predictions are not used by the
                            # metrics below; kept in case later cells read them.
                            predictions_train = predict(X_train, trained_weights)
                            predictions_validation = predict(X_validation, trained_weights)
                            predictions_test = predict(X_test, trained_weights)

                            # Undo the target scaling so that errors are
                            # expressed in the original units of the target
                            y_pred_test_real = scaler_y.inverse_transform(predictions_test)
                            y_test_real = scaler_y.inverse_transform(y_test)

                            # Metrics on the test subset, computed on the
                            # inverse-transformed values (previously these were
                            # computed and then discarded in favour of the
                            # standardized arrays)
                            mse, mae, r2 = calculate_regression_metrics(y_test_real, y_pred_test_real)

                            # Store the outcome of this configuration
                            results.append({
                                'hidden_layers': str(hidden_layers_sizes),
                                'optimizer': optimizer,
                                'momentum': momentum,
                                'learning_rate': lr,
                                'learning_rate_adjust': lr_adj,
                                'epochs': epochs,
                                'repeat': r,
                                'mse': mse,
                                'mae': mae,
                                'r2': r2
                            })

# Build a DataFrame from the collected records
results_df = pd.DataFrame(results)

# Show every run (the table is small, so a separate head() adds nothing)
print(results_df)

# Aggregate the repeats of each configuration
summary = results_df.groupby(
    ['hidden_layers', 'learning_rate', 'optimizer', 'momentum']
).agg(
    avg_mse=('mse', 'mean'),
    avg_mae=('mae', 'mean'),
    avg_r2=('r2', 'mean'),
    best_mse=('mse', 'min'),
    best_r2=('r2', 'max')
).reset_index()

print(summary)


   hidden_layers optimizer  momentum  learning_rate  learning_rate_adjust  \
0           [10]        gd       0.0           0.01                0.0005   
1           [10]        gd       0.9           0.01                0.0005   
2           [10]  momentum       0.0           0.01                0.0005   
3           [10]  momentum       0.9           0.01                0.0005   
4           [10]        gd       0.0           0.01                0.0005   
5           [10]        gd       0.9           0.01                0.0005   
6           [10]  momentum       0.0           0.01                0.0005   
7           [10]  momentum       0.9           0.01                0.0005   
8           [10]        gd       0.0           0.01                0.0005   
9           [10]        gd       0.9           0.01                0.0005   
10          [10]  momentum       0.0           0.01                0.0005   
11          [10]  momentum       0.9           0.01                0.0005   