# Week 2 - Exploratory Data Analysis (EDA)
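This notebook visually explores the water quality dataset after basic cleaning (rows with missing values and the `id`/`date` columns are dropped).
We examine the balance of the derived `Quality` label, the distribution of each parameter, pairwise correlations, and outliers to prepare for modeling.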

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load dataset
# Read the semicolon-separated CSV, discard every row containing a missing
# value, then remove the 'id' and 'date' columns.
data = (
    pd.read_csv('water_quality.csv', sep=';')
    .dropna()
    .drop(columns=['id', 'date'])
)

In [None]:
# Create 'Quality' target column
def check_quality(row):
    """Return 1 when every parameter in *row* is within its limit, else 0.

    *row* is anything indexable by parameter name (for example a DataFrame
    row). Parameters are checked in a fixed order and the check stops at the
    first one that is out of range, so later keys are not read in that case.
    """
    # (parameter name, bound, True if the value must be ABOVE the bound)
    limits = (
        ('NH4', 0.5, False),
        ('BSK5', 3, False),
        ('Suspended', 25, False),
        ('O2', 5, True),
        ('NO3', 10, False),
        ('NO2', 0.1, False),
        ('SO4', 250, False),
        ('PO4', 0.1, False),
        ('CL', 250, False),
    )
    for name, bound, is_minimum in limits:
        reading = row[name]
        # Comparisons are strict: a reading equal to the bound fails.
        within = reading > bound if is_minimum else reading < bound
        if not within:
            return 0
    return 1
# Evaluate check_quality once per row and store the result as 'Quality'.
data['Quality'] = data.apply(
    check_quality,
    axis='columns',
)

In [None]:
# Count of safe vs unsafe water
# Pass the column as `x` explicitly. In seaborn 0.12+ the first positional
# argument of countplot is `data`, so a positionally passed Series is no
# longer interpreted as the variable to count (older releases only warned).
sns.countplot(x=data['Quality'])
plt.title('Safe (1) vs Unsafe (0) Water Samples')
plt.xlabel('Water Quality')
plt.ylabel('Count')
plt.show()

In [None]:
# Histograms of all parameters
# One 20-bin histogram per column, leaving out the 'Quality' column.
data.drop(columns=['Quality']).hist(bins=20, figsize=(15, 10))
plt.suptitle('Histogram of Water Parameters')
plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap
# Pairwise correlations across every column currently in `data`, drawn as an
# annotated heatmap on a 10x7 figure.
correlations = data.corr()
plt.figure(figsize=(10, 7))
sns.heatmap(correlations, cmap='YlGnBu', annot=True)
plt.title('Correlation between Parameters')
plt.show()

In [None]:
# Boxplots to detect outliers
# Draw one separate figure per column, skipping the 'Quality' column.
for feature in data.drop(columns='Quality'):
    plt.figure()
    sns.boxplot(x=data[feature])
    plt.title(f'Boxplot for {feature}')
    plt.show()