<a href="https://colab.research.google.com/github/salmakhalfallah/LibraryDBSystem/blob/main/Water_Dataset_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Water Dataset Analysis in Google Colab

# Install required packages
!pip install scipy
!pip install matplotlib
!pip install numpy
!pip install seaborn

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import loadmat
from scipy import stats
import pandas as pd

# For Google Colab file upload
from google.colab import files

print("All packages installed and imported successfully!")

# Upload the .mat file through the Colab file picker; the returned mapping
# is keyed by the names of the uploaded files.
uploaded = files.upload()

# Fail early with a clear message when nothing was uploaded (e.g. the dialog
# was cancelled); indexing an empty key list would raise a bare IndexError.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select a .mat file.")

# Use the first uploaded file as the dataset to analyze
filename = next(iter(uploaded))
print(f"Uploaded file: {filename}")

# Load the .mat file
def load_mat_file(file_path):
    """Read a MATLAB .mat file and print a summary of its contents.

    For every entry whose key does not start with a double underscore, the
    key is listed and then its Python type and shape are printed.

    Args:
        file_path: Path of the .mat file, passed straight to ``loadmat``.

    Returns:
        The dictionary produced by ``loadmat``, or ``None`` if anything
        raised while loading or summarizing (the error is printed).
    """
    try:
        contents = loadmat(file_path)
        print("File loaded successfully!")

        # First pass: just the variable names
        print("\nKeys in the .mat file:")
        variable_names = [name for name in contents if not name.startswith('__')]
        for name in variable_names:
            print(f"- {name}")

        # Second pass: type and shape of each variable
        print("\nVariable information:")
        for name in variable_names:
            entry = contents[name]
            shape_text = np.shape(entry) if hasattr(entry, 'shape') else 'N/A'
            print(f"{name}: {type(entry)}, shape: {shape_text}")
    except Exception as e:
        print(f"Error loading file: {e}")
        return None
    return contents

# Load the uploaded file and print a summary of its variables.
# mat_data is None if load_mat_file hit an error.
mat_data = load_mat_file(filename)

# Extract the main data (assuming it's the largest array)
def extract_main_data(mat_data):
    """Pick the largest numpy array stored in a loaded .mat dictionary.

    Entries whose key starts with a double underscore and entries that are
    not numpy arrays are ignored.

    Args:
        mat_data: Mapping of variable names to values, as returned by
            ``load_mat_file``. May be ``None`` when loading failed.

    Returns:
        A ``(array, key)`` tuple for the array with the most elements, or
        ``(None, None)`` when ``mat_data`` is ``None`` or holds no numpy
        arrays.
    """
    # load_mat_file returns None on failure; report that here instead of
    # crashing with AttributeError on mat_data.keys().
    if mat_data is None:
        print("No data loaded; cannot extract the main data array")
        return None, None

    data_arrays = {
        key: value
        for key, value in mat_data.items()
        if not key.startswith('__') and isinstance(value, np.ndarray)
    }
    if not data_arrays:
        print("No numpy arrays found in the .mat file")
        return None, None

    # Assume the array with the most elements is the main dataset.
    main_key = max(data_arrays, key=lambda k: data_arrays[k].size)
    main_data = data_arrays[main_key]
    print(f"\nMain data array: {main_key}, shape: {main_data.shape}")
    return main_data, main_key

main_data, data_key = extract_main_data(mat_data)

# Summarize the selected array; the statistics below ignore NaN entries
if main_data is not None:
    total_elements = main_data.size

    print(f"\nData type: {main_data.dtype}")
    print(f"Data shape: {main_data.shape}")
    print(f"Number of elements: {total_elements}")

    lowest = np.nanmin(main_data)
    highest = np.nanmax(main_data)
    print(f"Data range: [{lowest:.4f}, {highest:.4f}]")

    average = np.nanmean(main_data)
    print(f"Mean: {average:.4f}")

    spread = np.nanstd(main_data)
    print(f"Standard deviation: {spread:.4f}")

    # Report how much of the array is missing
    nan_count = np.isnan(main_data).sum()
    nan_percent = nan_count / total_elements * 100
    print(f"NaN values: {nan_count} ({nan_percent:.2f}%)")

# Data Visualization
def visualize_data(data, title="Water Dataset"):
    """Draw a 2x3 grid of overview plots for a numeric array.

    Panels, left to right and top to bottom: histogram, box plot, heatmap
    (2D input only), cumulative histogram, normal Q-Q plot, and a line plot
    (raw values for 1D input, NaN-ignoring mean over axis 1 for 2D input).

    Args:
        data: Numpy array to plot. NaNs are dropped before drawing the
            distribution panels.
        title: Label inserted into the figure's super-title.
    """
    n_dims = data.ndim

    # Distribution panels work on the NaN-free, flattened values
    values = data.flatten()
    values = values[~np.isnan(values)]

    # Plot styling
    plt.style.use('default')
    sns.set_palette("husl")

    # One figure, six panels
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle(f'Analysis of {title}', fontsize=16, fontweight='bold')
    (hist_ax, box_ax, heat_ax), (cdf_ax, qq_ax, series_ax) = axes

    # Panel 1: histogram
    hist_ax.hist(values, bins=50, alpha=0.7, edgecolor='black')
    hist_ax.set_title('Distribution of Values')
    hist_ax.set_xlabel('Value')
    hist_ax.set_ylabel('Frequency')

    # Panel 2: box plot
    box_ax.boxplot(values)
    box_ax.set_title('Box Plot')
    box_ax.set_ylabel('Value')

    # Panel 3: heatmap, only meaningful for 2D input
    if n_dims != 2:
        heat_ax.text(0.5, 0.5, 'Not 2D data\nfor heatmap',
                     ha='center', va='center', transform=heat_ax.transAxes)
        heat_ax.set_title('Heatmap (Not Available)')
    else:
        image = heat_ax.imshow(data, aspect='auto', cmap='viridis')
        heat_ax.set_title('Heatmap')
        plt.colorbar(image, ax=heat_ax)

    # Panel 4: cumulative distribution
    cdf_ax.hist(values, bins=50, cumulative=True,
                density=True, alpha=0.7, edgecolor='black')
    cdf_ax.set_title('Cumulative Distribution')
    cdf_ax.set_xlabel('Value')
    cdf_ax.set_ylabel('Cumulative Probability')

    # Panel 5: Q-Q plot against the normal distribution
    stats.probplot(values, dist="norm", plot=qq_ax)
    qq_ax.set_title('Q-Q Plot (Normality Check)')

    # Panel 6: line plot of the values (1D) or of the axis-1 means (2D)
    if n_dims == 1:
        series_ax.plot(data)
        series_ax.set_title('Time Series')
        series_ax.set_xlabel('Index')
        series_ax.set_ylabel('Value')
    elif n_dims == 2:
        axis1_means = np.nanmean(data, axis=1)
        series_ax.plot(axis1_means)
        series_ax.set_title('Mean Across Dimension')
        series_ax.set_xlabel('Index')
        series_ax.set_ylabel('Mean Value')
    else:
        series_ax.text(0.5, 0.5, 'Complex data structure\nCannot plot time series',
                       ha='center', va='center', transform=series_ax.transAxes)
        series_ax.set_title('Time Series (Not Available)')

    plt.tight_layout()
    plt.show()

# Advanced analysis functions
def advanced_analysis(data):
    """Print skewness, kurtosis, a normality test and an IQR outlier count.

    The input is flattened and NaN entries are dropped before any statistic
    is computed.

    Args:
        data: Numeric numpy array (or array-like) of any shape.

    Returns:
        None. All results are printed.
    """
    flat_data = np.asarray(data).flatten()
    flat_data = flat_data[~np.isnan(flat_data)]

    print("=== ADVANCED STATISTICAL ANALYSIS ===")

    # With no valid values the percentiles are undefined and the outlier
    # percentage below would divide by zero, so stop here.
    if flat_data.size == 0:
        print("No valid (non-NaN) values to analyze")
        return

    print(f"Skewness: {stats.skew(flat_data):.4f}")
    print(f"Kurtosis: {stats.kurtosis(flat_data):.4f}")

    # Normality test. It is built on the skewness test, which is not valid
    # for fewer than 8 observations, so skip it for tiny samples.
    min_normaltest_samples = 8
    if flat_data.size < min_normaltest_samples:
        print(
            f"Normality test skipped: needs at least {min_normaltest_samples} "
            f"samples, got {flat_data.size}"
        )
    else:
        _, p_value = stats.normaltest(flat_data)
        print(f"Normality test p-value: {p_value:.4f}")
        if p_value > 0.05:
            print("Data appears normally distributed (p > 0.05)")
        else:
            print("Data does not appear normally distributed (p <= 0.05)")

    # Outlier detection using the IQR method: values more than 1.5 * IQR
    # below the first quartile or above the third quartile.
    q1, q3 = np.percentile(flat_data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = flat_data[(flat_data < lower_bound) | (flat_data > upper_bound)]
    print(f"Number of outliers (IQR method): {len(outliers)} ({len(outliers)/len(flat_data)*100:.2f}%)")

# Execute the analysis
if main_data is not None:
    # Overview plots
    visualize_data(main_data, title=f"Water Dataset ({data_key})")

    # Printed statistics
    advanced_analysis(main_data)

    # Additional analysis that only applies to 2D data
    if main_data.ndim == 2:
        print("\n=== 2D DATA ANALYSIS ===")
        print(f"Row means: {np.nanmean(main_data, axis=1)}")
        print(f"Column means: {np.nanmean(main_data, axis=0)}")

        # Correlation matrix only when there are fewer than 20 columns,
        # so the annotated heatmap stays readable
        if main_data.shape[1] < 20:
            # DataFrame.corr() ignores missing values pairwise, matching the
            # NaN-aware statistics used everywhere else in this analysis;
            # np.corrcoef would yield NaN for any column containing a NaN.
            corr_matrix = pd.DataFrame(main_data).corr().to_numpy()
            plt.figure(figsize=(10, 8))
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
            plt.title('Correlation Matrix')
            plt.show()

# Save processed data
def save_processed_data(data, filename):
    """Write a 1D or 2D array to CSV and hand it to the Colab downloader.

    The CSV name is derived from ``filename``: a trailing ``.mat`` extension
    (any letter case) is replaced by ``_processed.csv``; for any other name
    the suffix is appended, so the output never has the same name as the
    input.

    Args:
        data: Numpy array. A 1D array is saved as a single column named
            ``value``; a 2D array is saved with pandas' default column labels.
        filename: Name of the source file the data came from.

    Returns:
        None. Arrays that are neither 1D nor 2D are reported and skipped.
    """
    import os  # local import so this function does not rely on new file-level imports

    if data.ndim == 1:
        df = pd.DataFrame(data, columns=['value'])
    elif data.ndim == 2:
        df = pd.DataFrame(data)
    else:
        print("Data has more than 2 dimensions, cannot save as CSV")
        return

    # Swap only a real trailing extension. str.replace('.mat', ...) would
    # rewrite every ".mat" occurrence in the name and, when there is none
    # (e.g. ".MAT"), leave the name unchanged so the CSV would overwrite
    # the input file.
    root, ext = os.path.splitext(filename)
    if ext.lower() != '.mat':
        root = filename
    csv_filename = f"{root}_processed.csv"

    df.to_csv(csv_filename, index=False)
    print(f"Processed data saved as {csv_filename}")

    # Download the file
    files.download(csv_filename)

# Save the processed data and report the outcome. The success banner is
# printed only when a dataset was actually found and analyzed.
if main_data is not None:
    # save_processed_data only handles 1D and 2D arrays
    if main_data.ndim <= 2:
        save_processed_data(main_data, filename)

    print("\n=== ANALYSIS COMPLETE ===")
    print("The water dataset has been successfully loaded, analyzed, and visualized!")
else:
    print("\n=== ANALYSIS SKIPPED ===")
    print("No dataset could be extracted from the uploaded file, so nothing was analyzed.")

All packages installed and imported successfully!
