In [14]:
# 1. Import all the required Python Libraries
import pandas as pd
import numpy as np

In [15]:
# 2. Locate an open source data from the web.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

In [16]:
# 3. Load the Dataset into pandas data frame
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_df = pd.read_csv(url, names=column_names)

In [17]:
# Display the first few rows of the dataset to verify the import
print("First few rows of the Iris dataset:")
print(iris_df.head())

First few rows of the Iris dataset:
   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [18]:
# 4. Data Preprocessing:
# Check for missing values using pandas info(), describe() functions.
print("\nInformation about the dataset:")
print(iris_df.info())


Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None


In [24]:
print("\nDescriptive statistics of the dataset:")
print(iris_df.describe())


Descriptive statistics of the dataset:
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000


In [27]:
print(iris_df.isnull())

     sepal_length  sepal_width  petal_length  petal_width  \
0           False        False         False        False   
1           False        False         False        False   
2           False        False         False        False   
3           False        False         False        False   
4           False        False         False        False   
..            ...          ...           ...          ...   
145         False        False         False        False   
146         False        False         False        False   
147         False        False         False        False   
148         False        False         False        False   
149         False        False         False        False   

     class_Iris-versicolor  class_Iris-virginica  
0                    False                 False  
1                    False                 False  
2                    False                 False  
3                    False                 False  
4           

In [20]:
# Variable Descriptions:
# - Sepal Length, Sepal Width, Petal Length, Petal Width: Numeric variables.
# - Class: Categorical variable representing the species of iris flowers.

# Check the dimensions of the data frame.
print("\nDimensions of the dataset (rows, columns):", iris_df.shape)


Dimensions of the dataset (rows, columns): (150, 5)


In [21]:
# 5. Data Formatting and Normalization:
# Summarize the types of variables by checking data types.
print("\nData Types of Variables:")
print(iris_df.dtypes)


Data Types of Variables:
sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
class            object
dtype: object


In [22]:
# 6. Turn categorical variables into quantitative variables.
# The 'class' variable is categorical; we can use one-hot encoding to convert it to quantitative.
iris_df = pd.get_dummies(iris_df, columns=['class'], drop_first=True)

In [23]:
# Display the updated dataframe.
print("\nUpdated DataFrame after one-hot encoding:")
print(iris_df.head())


Updated DataFrame after one-hot encoding:
   sepal_length  sepal_width  petal_length  petal_width  \
0           5.1          3.5           1.4          0.2   
1           4.9          3.0           1.4          0.2   
2           4.7          3.2           1.3          0.2   
3           4.6          3.1           1.5          0.2   
4           5.0          3.6           1.4          0.2   

   class_Iris-versicolor  class_Iris-virginica  
0                  False                 False  
1                  False                 False  
2                  False                 False  
3                  False                 False  
4                  False                 False  
