In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
from sklearn.datasets import load_iris

# Build the working DataFrame: one column per iris feature, plus a
# human-readable species label derived from the numeric class codes.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = pd.Series(iris.target).map(
    {0: 'setosa', 1: 'versicolor', 2: 'virginica'}  # class code -> species name
)

# EDA: summary statistics of the numeric columns
print("Basic Statistics:", df.describe(), sep="\n")

# One histogram per feature; the species label column is left out
df.drop(columns='species').hist(figsize=(12, 8), bins=30)
# y > 1 places the figure-level title slightly above the top of the figure
plt.suptitle('Histograms of Iris Dataset Features', y=1.02)
plt.show()

# Pairwise scatter plots of the features, with points coloured by species
sns.pairplot(data=df, hue='species', palette='Set1')
plt.suptitle('Pairplot of Iris Dataset Features', y=1.02)
plt.show()

# Probability Density Function (PDF) and Cumulative Distribution Function (CDF)
# of a normal model fitted to a single feature.

# Remove any leading/trailing whitespace from the column labels before the
# lookup by name below.
df.columns = df.columns.str.strip()

feature = 'petal length (cm)'  # feature to analyse
data = df[feature]

# Parameters of the normal model.
# NOTE: np.std uses ddof=0 (population standard deviation) by default,
# whereas data.std() would use ddof=1.
mean = np.mean(data)
std_dev = np.std(data)

# Evaluate the model on 100 evenly spaced points spanning the observed range
x = np.linspace(data.min(), data.max(), 100)
pdf = norm.pdf(x, mean, std_dev)
cdf = norm.cdf(x, mean, std_dev)

# One figure with two side-by-side panels: PDF on the left, CDF on the right
fig, (pdf_ax, cdf_ax) = plt.subplots(1, 2, figsize=(14, 6))

pdf_ax.plot(x, pdf, 'b-', label='PDF')
pdf_ax.set_title('Probability Density Function (PDF)')
pdf_ax.set_xlabel(feature)
pdf_ax.set_ylabel('Density')
pdf_ax.legend()

cdf_ax.plot(x, cdf, 'r-', label='CDF')
cdf_ax.set_title('Cumulative Distribution Function (CDF)')
cdf_ax.set_xlabel(feature)
cdf_ax.set_ylabel('Cumulative Probability')
cdf_ax.legend()

fig.tight_layout()
plt.show()

# A Probability Mass Function (PMF) describes discrete variables, so it does
# not apply to a continuous feature such as petal length. The species label is
# categorical, so its PMF is the relative frequency of each species.
species_counts = df['species'].value_counts(normalize=True)  # proportions, not raw counts
print("Probability Mass Function (PMF) for species:", species_counts, sep="\n")


AttributeError: module 'numpy' has no attribute '__version__'