In [None]:
import pandas as pd
from sklearn.datasets import load_iris
# Fetch the bundled Iris data and wrap the measurements in a DataFrame,
# attaching the species name (decoded from the integer targets) as a
# categorical column.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=pd.Categorical.from_codes(iris.target, categories=iris.target_names)
)
print("First 5 rows of the dataset:", df.head(), sep="\n")

## Standardization and Normalization

In [None]:
# Standardization: rescale each measurement column with StandardScaler
from sklearn.preprocessing import StandardScaler

# Names of the numeric measurement columns to rescale
features = iris.feature_names

# Fit the scaler on the raw measurements, then write the scaled values into
# a copy so the original `df` is left untouched
scaler = StandardScaler()
df_standardized = df.copy()
df_standardized[features] = scaler.fit(df[features]).transform(df[features])

print("\nFirst 5 rows after standardization:", df_standardized.head(), sep="\n")

In [None]:
# Normalization: rescale each measurement column with MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()

# Scale the raw measurements and store them in a separate copy of the frame
df_normalized = df.copy()
df_normalized[features] = min_max_scaler.fit(df[features]).transform(df[features])

print("\nFirst 5 rows after normalization:", df_normalized.head(), sep="\n")


## Transformation

In [None]:
import numpy as np

# Log-transform the measurement columns on a copy of the data.
# np.log1p evaluates log(1 + x), so a zero value maps to 0 rather than -inf.
df_transformed = df.copy()
df_transformed[features] = np.log1p(df_transformed[features])

print("\nFirst 5 rows after log transformation:", df_transformed.head(), sep="\n")

## Aggregation (Group By and Aggregate, Pivot Table)

In [None]:
# Group by species and average each measurement column.
# - The measurement columns are selected explicitly via `features`, so this
#   cell gives the same table when re-run after later cells have added
#   derived columns (a categorical bin label and a binary flag) to `df`;
#   without the selection, mean() would try to aggregate those as well.
# - `species` is a categorical key; `observed=True` is passed explicitly
#   instead of relying on the implicit default for categorical groupers,
#   which pandas has deprecated. All species occur in the data, so the
#   resulting rows are unchanged.
df_grouped_mean = df.groupby('species', observed=True)[features].mean()
print("\nMean of features grouped by species:")
print(df_grouped_mean)


In [None]:
# Pivot table: one row per species, one column per measurement, cells hold
# the mean of that measurement within the species.
# `species` is categorical, so `observed=True` is passed explicitly rather
# than relying on pivot_table's implicit default for categorical groupers
# (deprecated in recent pandas). Every species is present in the data, so
# the table contents are the same either way.
pivot_table = pd.pivot_table(
    df,
    values=features,
    index='species',
    aggfunc='mean',
    observed=True,
)
print("\nPivot table with mean values:")
print(pivot_table)


In [None]:
'''Discretization (Binning): Discretization involves converting continuous
variables into categorical bins. For instance, we can bin sepal
length into categories like "Short", "Medium", and "Long".'''

In [None]:
# Bin edges (cm) and the label for each resulting interval; with right-closed
# intervals the bins are (0, 5], (5, 6], (6, 7] and (7, 8]
bins = [0, 5, 6, 7, 8]
labels = ['Very Short', 'Short', 'Medium', 'Long']

# Add the binned sepal length to `df` as a new categorical column
df['sepal length category'] = pd.cut(
    df['sepal length (cm)'],
    bins=bins,
    labels=labels,
    right=True,
)

print("\nFirst 5 rows with discretized sepal length:")
print(df.loc[:, ['sepal length (cm)', 'sepal length category']].head())


In [None]:
'''Binarization: Binarization converts numerical features into binary
values based on a threshold. For example, we can binarize petal length
where values above the median are marked as 1 and others as 0.'''

In [None]:
from sklearn.preprocessing import Binarizer

# Use the median petal length as the cut-off for the binarizer
median_petal_length = df['petal length (cm)'].median()
binarizer = Binarizer(threshold=median_petal_length)

# The binarizer expects a 2-D input, hence the double brackets that select
# a single-column frame; the result is stored as a new column on `df`
df['petal length binarized'] = (
    binarizer
    .fit(df[['petal length (cm)']])
    .transform(df[['petal length (cm)']])
)

print("\nFirst 5 rows with binarized petal length:")
print(df.loc[:, ['petal length (cm)', 'petal length binarized']].head())


## Sampling

In [None]:
# Random sampling: draw 10% of the rows, without stratification.
# A fixed seed makes the draw repeatable from run to run.
random_seed = 42
df_sampled = df.sample(
    frac=0.1,
    random_state=random_seed,
)
print("\nRandomly sampled 10% of the data:", df_sampled, sep="\n")

In [None]:
# Stratified sampling: hold out 10% of the rows while keeping the species
# proportions of the full data set in both parts
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    df,
    test_size=0.1,
    stratify=df['species'],
    random_state=45,
)

# Per-species row counts of the held-out part
print("\nStratified sampled 10% of the data:", test['species'].value_counts(), sep="\n")