Question 1: Understanding the Dataset 
<br>
Description: Load a dataset and understand its basic properties, including data types, dimensions, and the first few rows.

In [None]:

# Dependencies: scikit-learn ships the data, pandas provides the tabular view.
import pandas as pd
from sklearn.datasets import load_iris

# Fetch the bundled Iris data.
iris = load_iris()

# Build the feature table and attach the class label in one expression.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Preview the leading rows.
print("First few rows of the dataset:")
preview = df.head()
print(preview)

# Report the dtype pandas stored for every column.
print("\nData types of each column:")
column_types = df.dtypes
print(column_types)

# Report the (rows, columns) shape of the table.
dimensions = df.shape
print(f"\nDimensions of the dataset: {dimensions}")



Question 2: Checking for Missing Values
<br>
Description: Identify missing values in the dataset.

In [None]:
# Dependencies for this cell.
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Fetch the Iris data and wrap it in a DataFrame with the label column.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target

# Blank out two cells so the missing-value checks below have something to find.
for row_label, column_name in [(0, 'sepal length (cm)'), (10, 'sepal width (cm)')]:
    df.loc[row_label, column_name] = np.nan

# Per-column count of missing cells; every summary below derives from it.
missing_values = df.isnull().sum()
print("Count of missing values for each column:")
print(missing_values)

# Overall count: sum of the per-column counts.
total_missing_values = missing_values.sum()
print(f"\nTotal count of missing values: {total_missing_values}")

# Share of rows with a missing value in each column, as a percentage.
row_count = len(df)
missing_values_percentage = (missing_values / row_count) * 100
print("\nPercentage of missing values for each column:")
print(missing_values_percentage)


Question 3: Descriptive Statistics
<br>
Description: Calculate descriptive statistics for numerical columns.

In [None]:
# Dependencies for this cell.
import pandas as pd
from sklearn.datasets import load_iris

# Fetch the Iris data.
iris = load_iris()

# Feature table first, then append the label as the last column.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.insert(df.shape[1], 'target', iris.target)

# Summarise every numeric column (the numeric `target` column is included).
descriptive_stats = df.describe()

# Show the summary table.
print(descriptive_stats)


Question 4: Handling Outliers
<br>
Description: Identify outliers in numerical columns using box plots and the IQR rule.

In [None]:

# Import necessary libraries
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()

# Create a DataFrame
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target

# Only the measurement columns are inspected; `target` is a class label.
feature_columns = list(iris.feature_names)

# Human-readable titles, e.g. 'sepal length (cm)' -> 'Sepal Length'.
titles = {column: column.replace(' (cm)', '').title() for column in feature_columns}

# Box plot for each numerical column, one per cell of the 2x2 grid
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
for ax, column in zip(axs.flat, feature_columns):
    ax.boxplot(df[column])
    ax.set_title(titles[column])
plt.tight_layout()
plt.show()

# Identify outliers using the IQR method for every feature, not just one:
# a value is an outlier when it lies more than 1.5 * IQR outside [Q1, Q3].
outliers_by_column = {}
for column in feature_columns:
    values = df[column]
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers_by_column[column] = df[(values < lower_bound) | (values > upper_bound)]
    print(f"Outliers in {titles[column]}:")
    print(outliers_by_column[column])

# Keep the original `outliers` name bound to the sepal-length result so any
# code that reads it after this cell sees the same value as before.
outliers = outliers_by_column['sepal length (cm)']



Question 5: Categorical Data Analysis
<br>
Description: Explore the counts of categorical variables.

In [None]:
# Dependencies for this cell, gathered in one place.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_iris

# Fetch the Iris data and wrap it in a DataFrame with the label column.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target

# Swap the numeric class codes for readable category names.
target_mapping = {0: 'Setosa', 1: 'Versicolor', 2: 'Virginica'}
readable_target = df['target'].map(target_mapping)
df['target'] = readable_target

# Tally how many rows fall into each category.
category_counts = df['target'].value_counts()

# Show the tallies as text.
print("Counts of categorical variables:")
print(category_counts)

# Show the same tallies as a bar chart, labelling through the returned Axes.
bar_axes = category_counts.plot.bar()
bar_axes.set_title('Counts of Categorical Variables')
bar_axes.set_xlabel('Category')
bar_axes.set_ylabel('Count')
plt.show()



Question 6: Data Transformation
<br>
Description: Transform a categorical column into numerical using Label Encoding.

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelEncoder

# Load the Iris dataset
iris = load_iris()

# Create a DataFrame with a text `species` column.
# Indexing the array of names with the integer codes looks up every row at
# once, replacing the per-element Python loop.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
species = iris.target_names
df['species'] = species[iris.target]

# Apply Label Encoding
le = LabelEncoder()
df['species_encoded'] = le.fit_transform(df['species'])

# Print the original and encoded values
print("Original values:")
print(df['species'].unique())
print("Encoded values:")
print(df['species_encoded'].unique())

# Print the mapping between original and encoded values.
# The position of a label in `le.classes_` is the code assigned to it.
# The loop variables are named `code`/`label` so that the `species` array
# defined above is not overwritten by the loop.
print("\nMapping between original and encoded values:")
for code, label in enumerate(le.classes_):
    print(f"{label}: {code}")



Question 7: Visualizing Data Distributions
<br>
Description: Plot histograms for numerical columns to understand distributions.

In [None]:

# Dependencies for this cell.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_iris

# Fetch the Iris data and build the feature table.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# (column, panel title) pairs, listed in the order the 2x2 grid is filled:
# left to right, top to bottom.
histogram_specs = [
    ('sepal length (cm)', 'Sepal Length Distribution'),
    ('sepal width (cm)', 'Sepal Width Distribution'),
    ('petal length (cm)', 'Petal Length Distribution'),
    ('petal width (cm)', 'Petal Width Distribution'),
]

# One 10-bin histogram per feature, each in its own panel.
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
for panel, (column_name, panel_title) in zip(axs.flat, histogram_specs):
    panel.hist(df[column_name], bins=10, edgecolor='black')
    panel.set_title(panel_title)
plt.tight_layout()
plt.show()



Question 8: Correlation Analysis
<br>
Description: Calculate and visualize the correlation matrix for numerical features.

In [None]:
# Dependencies for this cell.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris

# Fetch the Iris data and build the feature table.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Pairwise correlation between all feature columns.
corr_matrix = df.corr()

# Show the matrix as text.
print("Correlation Matrix:")
print(corr_matrix)

# Show the matrix as an annotated heatmap; the title is set on the Axes
# that seaborn draws into and returns.
plt.figure(figsize=(8, 6))
heatmap_axes = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', square=True)
heatmap_axes.set_title('Correlation Matrix')
plt.show()



Question 9: Feature Engineering
<br>
Description: Create a new feature by combining or transforming existing features.

In [None]:

# Dependencies for this cell, gathered in one place.
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Fetch the Iris data and build the feature table.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Combine existing features: area = length * width, first for the sepal and
# then for the petal, so the new columns appear in that order.
for flower_part in ('sepal', 'petal'):
    length_values = df[f'{flower_part} length (cm)']
    width_values = df[f'{flower_part} width (cm)']
    df[f'{flower_part}_area'] = length_values * width_values

# Show the table with the two area columns added.
print("Updated DataFrame:")
print(df.head())

# Transform an existing feature: natural logarithm of the sepal length.
sepal_length = df['sepal length (cm)']
df['log_sepal_length'] = np.log(sepal_length)

# Show the table again, now including the log-transformed column.
print("Updated DataFrame:")
print(df.head())


Question 10: Advanced Outlier Detection
<br>
Description: Use the Z-score method to identify and handle outliers.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()

# Create a DataFrame
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Calculate the absolute Z-scores for each feature (computed per column)
z_scores = np.abs(stats.zscore(df))

# Identify outliers using a threshold of 3 standard deviations.
# `is_outlier` is a cell-wise boolean mask computed once and reused below.
threshold = 3
is_outlier = z_scores > threshold
outliers = df[is_outlier.any(axis=1)]

# Print the outliers
print("Outliers:")
print(outliers)

# Handle outliers by removing every row that has at least one of them
df_cleaned = df[(z_scores <= threshold).all(axis=1)]

# Print the cleaned DataFrame
print("Cleaned DataFrame:")
print(df_cleaned.head())

# Alternatively, handle outliers by replacing them with the column median.
# The per-column medians broadcast across the rows, so all columns are
# handled in one vectorised step using the mask computed above instead of
# recomputing Z-scores column by column.
column_medians = df.median().to_numpy()
df_handled = pd.DataFrame(
    np.where(is_outlier, column_medians, df.to_numpy()),
    index=df.index,
    columns=df.columns,
)

# Print the handled DataFrame
print("Handled DataFrame:")
print(df_handled.head())

