<a href="https://colab.research.google.com/github/sahilfatima/Heat-Flux-Data/blob/main/Heat_flux_data_visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the download stream per read() call.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<percent-encoded download URL>' entries;
# the download loop below splits on ',' and ':' and unquotes the URL part.
# NOTE(review): the URL is a signed link carrying X-Goog-Date=20240201T163324Z
# and X-Goog-Expires=259200, so it has presumably expired — confirm before use.
DATA_SOURCE_MAPPING = 'playground-series-s3e15:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F51982%2F5760919%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240201%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240201T163324Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Dbf6da2b669b989b117ac06338e3a9c714b432cb13f557ba5db248321c79507442bf7ab53d1a9c26fa6b99f675442f416d0f8854642f025b7dea078aa2926397933d0d787071423880213c001ec970add7bd8c9da1570568ca33e8c425ab7d18381e3d8bbf89ba216bc9d0b0b86969c0d2526a4620bdd0ead52365af1cdb3865f7032a4b52b8b81d50a2c630ae2ede58fe15f260c57bedee31753c242cce899734fd1f896cc3ff13a503c171fe1941e3e8c2f3ba4a6ced24b8873399a2af1b6e405839e9772c2db18112262e641a7fbf54a24401bb343e597a784b9c711c9e238a24ebf75507c4fe38c796141ed37b774e17375aaa2a667f85f5e7be71ee18559'

# Directory that downloaded datasets are extracted into (one sub-directory each).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created alongside the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere else in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every '<directory>:<percent-encoded URL>' entry of
# DATA_SOURCE_MAPPING into a temporary file and extract it (zip or tar) to
# KAGGLE_INPUT_PATH/<directory>. A failing source is reported and skipped so
# the remaining sources are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so an unexpected colon later in the entry
    # cannot break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # Content-Length may be absent (None) or unusable; in that case
            # only the running byte count is shown instead of a progress bar.
            if total_length and total_length.isdigit():
                total_bytes = int(total_length)
            else:
                total_bytes = 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while data:
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    # Clamp so a body longer than advertised cannot overflow the bar.
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Flush buffered bytes and rewind so the extractor reads the
            # complete archive from the start of the same file object.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would overwrite the module and break the next tar source.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform

In [None]:
data= pd.read_csv("/kaggle/input/playground-series-s3e15/data.csv")
data.head()

In [None]:
data.info()

In [None]:
correlation = data.corr(numeric_only=True)

In [None]:
sns.heatmap(data.corr(numeric_only=True), annot=True, cmap="coolwarm", linecolor="black")

In [None]:
data['author'].nunique()

In [None]:
author = data['author'].value_counts()

sorted_auth = author.sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.countplot(x='author', data=data, order=sorted_auth.index)

plt.xlabel("Author Name")
plt.ylabel("Count")
plt.show()

In [None]:
# Bar chart of row counts per geometry, bars ordered by descending count.
geometry_order = data['geometry'].value_counts().sort_values(ascending=False).index

plt.figure(figsize=(10, 6))
geometry_axes = sns.countplot(
    data=data,
    x='geometry',
    order=geometry_order,
    palette='muted',
)
geometry_axes.set_xlabel("Geometry")
geometry_axes.set_ylabel("Count")
plt.show()

In [None]:
# Pie chart of each author's share of the rows.
author_counts = data['author'].value_counts()
fig, ax = plt.subplots(1, 1, figsize = (16, 5))

ax.pie(
    author_counts,
    shadow = True,
    # One offset per wedge: pie() rejects an explode list whose length differs
    # from the number of values, so derive it from the data instead of
    # hard-coding the category count.
    explode = [.1] * len(author_counts),
    autopct = '%1.f%%',
    textprops = {'size' : 14, 'color' : 'white'}
)

In [None]:
# Pie chart of each geometry's share of the rows.
geometry_counts = data['geometry'].value_counts()
fig, ax = plt.subplots(1, 1, figsize = (16, 5))

ax.pie(
    geometry_counts,
    shadow = True,
    # One offset per wedge: pie() rejects an explode list whose length differs
    # from the number of values, so derive it from the data instead of
    # hard-coding the category count.
    explode = [.1] * len(geometry_counts),
    autopct = '%1.f%%',
    textprops = {'size' : 14, 'color' : 'white'}
)

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:

# Correlation of every other numeric column with 'x_e_out [-]', sorted from
# the highest value to the lowest and drawn as a single-column heatmap.
target_column = 'x_e_out [-]'
target_corr_sorted = (
    correlation[target_column]
    .drop(target_column)
    .sort_values(ascending=False)
)

sns.set_style("white")
sns.set_palette("PuBuGn_d")
sns.heatmap(
    target_corr_sorted.to_frame(),
    annot=True,
    fmt='.2f',
    cmap="coolwarm",
)
plt.title("Correlation matrix with target variable")
plt.show()

## **Distribution of numerical features**

## ***1. Histplot***

In [None]:
# Draw one histogram with a KDE overlay per float64 column. `num_cols` is
# kept as a plain list because later cells iterate over it again.
float_frame = data.select_dtypes(include = ['float64'])
num_cols = float_frame.columns.tolist()
for feature in num_cols:
    sns.histplot(data[feature], kde=True)
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.title(f'Distribution of {feature}')
    plt.show()

## ***2. KDE plot***

In [None]:
# One KDE panel per entry of `num_cols`, stacked vertically in a single figure.
# The figure must be bound to `fig` (not `ig`): otherwise the suptitle below is
# applied to whichever figure an earlier cell left in `fig`.
# The row count follows len(num_cols) instead of a hard-coded 7, and
# squeeze=False keeps `ax` an array even when there is only one feature, so
# flatten() always works.
fig, ax = plt.subplots(len(num_cols), 1, figsize = (7, 20), dpi = 300, squeeze = False)
ax = ax.flatten()
pal = sns.color_palette('viridis')
for i, column in enumerate(num_cols):
    sns.kdeplot(data[column], ax=ax[i], color=pal[0])

    # The feature name goes in the panel title, so the axis labels are cleared.
    ax[i].set_title(f'{column} Distribution', size = 7)
    ax[i].set_xlabel(None)
    ax[i].set_ylabel(None)

fig.suptitle('Distribution of Feature\n\n', fontsize = 15, fontweight = 'bold')

plt.tight_layout()

In [None]:
data.groupby(['geometry', 'author', 'length [mm]']).nunique()

In [None]:
data.groupby(['geometry', 'author']).nunique()

## **Calculate feature distance in dataset**

In [None]:
# Cluster the numeric features hierarchically, using 1 - |Spearman correlation|
# as the distance between two features, and draw the resulting dendrogram.
corr = data[num_cols].corr(method = 'spearman')
dist_matrix = 1 - np.abs(corr.to_numpy())
# Floating-point round-off can leave the matrix slightly asymmetric or the
# diagonal slightly off zero, which squareform's validity checks reject.
# Enforce both properties explicitly; an already exact matrix is unchanged.
dist_matrix = (dist_matrix + dist_matrix.T) / 2
np.fill_diagonal(dist_matrix, 0)
# Condensed (upper-triangle) form, as expected by linkage().
distance = squareform(dist_matrix, checks=False)
print(distance)
linkage_matrix = linkage(distance, method='complete')
print(linkage_matrix)
plt.figure(figsize = (10, 8))
# Label the leaves from the correlation matrix itself so they always match
# the rows/columns the distances were computed from.
dendro = dendrogram(linkage_matrix, labels=corr.columns.tolist(), leaf_rotation=90)
plt.title('Feature Distance in Dataset', weight = 'bold', size = 22)
plt.show()