<a href="https://colab.research.google.com/github/rusave/radcv/blob/main/Kopie_se%C5%A1itu_01RAD_ex00_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 01RAD - Exercise 00

* Python, Jupyter NB
* Data handling
* Data wrangling

We assume basic knowledge of Python and of working with Jupyter notebooks.


## Setup

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import statistics as st

In [None]:
# Suppress FutureWarnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [None]:
# Fetch the R "trees" dataset through statsmodels' get_rdataset and keep
# its data table as a DataFrame
trees_dataset = sm.datasets.get_rdataset("trees")
trees = trees_dataset.data

# Preview the first rows
trees.head()

In [None]:
# `? trees` on a DataFrame only shows the generic pandas help. The dataset
# description (variables and their units) is attached to the statsmodels
# Dataset object as `__doc__`, so print that instead.
print(sm.datasets.get_rdataset("trees").__doc__)

In [None]:
# Summary statistics of the columns of `trees`
trees.describe()

In [None]:
# Show the whole `trees` DataFrame
trees

# How to run R in Jupyter notebook with Python kernel?

We will use R in cases where no Python package is an adequate replacement for the R package.

In [None]:
# Install rpy2 and use R magic cell
!pip install rpy2

%load_ext rpy2.ipython

In [None]:
%%R -o trees_R
# Load the `trees` dataset in R and look at its first rows
data(trees)
head(trees)


# Copy the data to `trees_R`; the `-o trees_R` option on the first line
# makes this variable available to the Python cells
trees_R <- trees
# Open the R help page of the dataset
? trees

In [None]:
# Drop the row labels of both frames and cast all columns to float so the
# Python and R copies can be compared element by element
trees_numeric, trees_R_numeric = (
    frame.reset_index(drop=True).astype(float) for frame in (trees, trees_R)
)

# Element-wise comparison within np.isclose's default floating-point tolerance
comparison = np.isclose(trees_numeric, trees_R_numeric)

# True only if every pair of values agrees
np.all(comparison)


# First Task - Data handling

* Load the `trees` dataframe
* Calculate the mean and variance of the height and volume of the trees.
* Compare the result from built-in functions with a "manual" calculation using the formula.
* Check `? trees` and convert the variable Girth to $cm$, Height to $m$, and Volume to $m^3$.
* Calculate the BMI of each tree (mass divided by height squared, $\mathrm{BMI} = m / h^2$), assuming that the density of the wood of all listed trees is constant and equal to 900 kg/m$^3$. Save the result in the `trees` table as a new variable `BMI`.
* Look at a table of the calculated values, divide them into 3 groups `{thin, normal, obese}`, and assign a factor-type (categorical) variable to each record.
   Example: choose the `break1` and `break2` values so that they make sense to you.
* Repeat the previous task using quantile binning
* Visualise data

In [None]:
# Mean and variance of Height and Volume via the built-in pandas methods
height_col, volume_col = trees['Height'], trees['Volume']

mean_height, var_height = height_col.mean(), height_col.var()
mean_volume, var_volume = volume_col.mean(), volume_col.var()

print(f"Mean Height: {mean_height}, Variance Height: {var_height}")
print(f"Mean Volume: {mean_volume}, Variance Volume: {var_volume}")


In [None]:
# "Manual" estimates straight from the formulas:
#   mean = sum(x) / n        variance = sum((x - mean)^2) / (n - 1)
n_obs = len(trees)

manual_mean_height = trees['Height'].sum() / n_obs
height_sq_dev = (trees['Height'] - manual_mean_height) ** 2
manual_var_height = sum(height_sq_dev) / (n_obs - 1)

manual_mean_volume = trees['Volume'].sum() / n_obs
volume_sq_dev = (trees['Volume'] - manual_mean_volume) ** 2
manual_var_volume = sum(volume_sq_dev) / (n_obs - 1)

print(f"Manual Mean Height: {manual_mean_height}, Manual Variance Height: {manual_var_height}")
print(f"Manual Mean Volume: {manual_mean_volume}, Manual Variance Volume: {manual_var_volume}")


In [None]:
def mean_of_column(df, column_name):
    """Return the arithmetic mean of ``df[column_name]`` as sum / row count.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the column.
    column_name : str
        Name of the column to average.

    Returns
    -------
    float
        Column sum divided by the number of rows in the column.
    """
    column = df[column_name]
    return column.sum() / len(column)


def variance_of_column(df, column_name):
    """Return the sample variance of ``df[column_name]`` from the formula.

    Computes ``sum((x_i - mean)^2) / (n - 1)`` with an explicit pass over
    the values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the column.
    column_name : str
        Name of the column.

    Returns
    -------
    float
        The sample variance, or NaN when the column has fewer than two
        rows (the n - 1 denominator would be zero or negative).
    """
    # Work on positional values: label-based access such as df[col][i]
    # raises KeyError whenever the index is not exactly 0..n-1.
    values = df[column_name].to_numpy()
    num_observations = len(values)
    if num_observations < 2:
        return float('nan')

    mean = mean_of_column(df, column_name)
    total_sum = sum((value - mean) ** 2 for value in values)
    return total_sum / (num_observations - 1)

# Apply the hand-written helpers to the Height column
manual_height_mean = mean_of_column(trees, 'Height')
print('Manual Mean Height:', manual_height_mean)
manual_height_variance = variance_of_column(trees, 'Height')
print('Manual Variance Height::', manual_height_variance)


In [None]:
def convert(df):
    """Add metric-unit copies of the measurement columns to ``df``.

    The frame is modified in place and also returned. New columns:

    * ``Girth_cm``  = ``Girth`` * 2.54        (inches -> centimeters)
    * ``Height_m``  = ``Height`` * 0.3048     (feet -> meters)
    * ``Volume_m3`` = ``Volume`` * 0.3048**3  (cubic feet -> cubic meters)
    """
    cm_per_inch = 2.54
    m_per_foot = 0.3048
    conversions = (
        ('Girth', 'Girth_cm', cm_per_inch),
        ('Height', 'Height_m', m_per_foot),
        ('Volume', 'Volume_m3', m_per_foot ** 3),
    )
    for source, target, factor in conversions:
        df[target] = df[source] * factor
    return df

# Add the metric columns to `trees` (convert returns the frame it was given)
trees = convert(trees)
trees.head()


In [None]:
# Tree "BMI": mass divided by height squared, with mass = volume * density
density = 900  # kg/m^3, assumed constant for all trees
trees['BMI'] = (trees['Volume_m3'] * density) / (trees['Height_m'] ** 2)

# Last expression of the cell -> rich notebook table instead of print() text
trees[['Girth_cm', 'Height_m', 'Volume_m3', 'BMI']].head()


In [None]:
# Histogram of BMI with a kernel density estimate overlaid (kde=True).
# Caution: kernel estimates have unbounded (infinite) support, so drawing
# samples from the estimated density can produce unrealistic values.
sns.histplot(
    data=trees,
    x="BMI",
    bins=20,
    element='step',
    kde=True,
    color="red",
    label="BMI",
)


In [None]:
# Hand-picked breakpoints for the BMI categories:
#   BMI <= 1 -> thin,  1 < BMI <= 2 -> normal,  BMI > 2 -> obese
# Open-ended outer edges keep the bins strictly increasing for any data
# range (min-1 / max+1 fail when all values lie above 1 or below 2).
breakpoints = [-np.inf, 1, 2, np.inf]
labels = ['thin', 'normal', 'obese']

# Create a new categorical column with the BMI categories
trees['BMI_category'] = pd.cut(trees['BMI'], bins=breakpoints, labels=labels)

# Last expression of the cell -> rich notebook table instead of print() text
trees[['Girth_cm', 'Height_m', 'Volume_m3', 'BMI', 'BMI_category']].head()


In [None]:
# Quantile binning: split the trees into 'thin', 'normal', 'obese' at the
# 33% and 66% quantiles of BMI
break1, break2 = trees['BMI'].quantile([0.33, 0.66])
conditions = [
    (trees['BMI'] <= break1),
    (trees['BMI'] > break1) & (trees['BMI'] <= break2),
    (trees['BMI'] > break2)
]
choices = ['thin', 'normal', 'obese']

# np.select's default fill value is the integer 0, which cannot be combined
# with string choices on recent NumPy versions; use a string default instead.
# Wrapping the result in an ordered Categorical gives the factor-type
# variable the task asks for; rows matching no condition (the '' default,
# e.g. missing BMI) become NaN because '' is not one of the categories.
trees['BMI_category_q'] = pd.Categorical(
    np.select(conditions, choices, default=''),
    categories=choices,
    ordered=True,
)
trees

In [None]:

# Create the figure and axes explicitly and draw every layer on the same axes
fig, ax = plt.subplots(figsize=(10, 6))

# Boxplot for Girth by BMI_category; showmeans=True marks each group mean
# with a red diamond so the plot shows what its title promises
sns.boxplot(
    x='BMI_category', y='Girth', data=trees, ax=ax,
    showmeans=True,
    meanprops={'marker': 'D', 'markerfacecolor': 'red', 'markeredgecolor': 'red'},
)

# Add the individual observations with jitter using a stripplot
sns.stripplot(x='BMI_category', y='Girth', data=trees, color='black', jitter=True, alpha=0.6, ax=ax)

# Add title and labels
ax.set_title('Boxplot with Jitter and Mean for Girth by BMI Category')
ax.set_xlabel('BMI Category')
ax.set_ylabel('Girth (inches)')

# Show the plot
plt.show()


In [None]:
# Create the figure and axes explicitly and draw every layer on the same axes
fig, ax = plt.subplots(figsize=(10, 6))

# Violin plot for Girth by BMI_category (inner=None: no inner box/quartiles)
sns.violinplot(x='BMI_category', y='Girth', data=trees, inner=None, ax=ax)

# Add the individual observations with jitter using a stripplot
sns.stripplot(x='BMI_category', y='Girth', data=trees, color='black', jitter=True, alpha=0.6, ax=ax)

# Add title and labels (no mean marker is drawn here, so the title does not
# claim one)
ax.set_title('Violin Plot with Jitter for Girth by BMI Category')
ax.set_xlabel('BMI Category')
ax.set_ylabel('Girth (inches)')

# Show the plot
plt.show()


In [None]:
# Multimodal distribution = the density has several humps (modes)