In [1]:
pip install pandas numpy scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [2]:
# 1. Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# 2. Load the dataset
# The CSV is read with header=None (no header row is consumed), so the column
# names are assigned explicitly right after loading.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"  # Example URL to dataset
df = pd.read_csv(url, header=None)
df.columns = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species']

# 3. Data Preprocessing: Missing values and summary statistics
# These snapshots are taken BEFORE normalization/encoding, so they describe the
# raw data as loaded.
missing_values = df.isnull().sum()  # Per-column count of missing values
statistics = df.describe()  # Summary statistics of the numeric columns
variable_info = df.dtypes  # Data types of columns
data_shape = df.shape  # (rows, columns)

# sep="" keeps print() from inserting a space after the "\n", which would
# otherwise shift the first line of every pandas repr one column to the right.
print("Missing Values:\n", missing_values, sep="")
print("\nSummary Statistics:\n", statistics, sep="")
print("\nVariable Information:\n", variable_info, sep="")
print("\nShape of the DataFrame:", data_shape)

# 4. Data Formatting and Normalization
df['Species'] = df['Species'].astype('category')  # Convert 'Species' to categorical

# Min-max scaling of the four measurement columns, overwriting them in place.
# NOTE(review): the scaler is fitted on every row; if a train/test split happens
# later in the notebook, fit on the training rows only to avoid leakage - confirm.
scaler = MinMaxScaler()
numerical_columns = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

print("\nNormalized Data:\n", df.head(), sep="")

# 5. Convert categorical variables to quantitative
# Keep the code -> species-name mapping before the categorical column is
# replaced by its integer codes; afterwards the names are no longer recoverable
# from df. Each code is an index into .cat.categories.
species_labels = dict(enumerate(df['Species'].cat.categories))
df['Species'] = df['Species'].cat.codes  # Label encoding
print("\nData with Categorical Variable Converted:\n", df.head(), sep="")





Missing Values:
 SepalLength    0
SepalWidth     0
PetalLength    0
PetalWidth     0
Species        0
dtype: int64

Summary Statistics:
        SepalLength  SepalWidth  PetalLength  PetalWidth
count   150.000000  150.000000   150.000000  150.000000
mean      5.843333    3.054000     3.758667    1.198667
std       0.828066    0.433594     1.764420    0.763161
min       4.300000    2.000000     1.000000    0.100000
25%       5.100000    2.800000     1.600000    0.300000
50%       5.800000    3.000000     4.350000    1.300000
75%       6.400000    3.300000     5.100000    1.800000
max       7.900000    4.400000     6.900000    2.500000

Variable Information:
 SepalLength    float64
SepalWidth     float64
PetalLength    float64
PetalWidth     float64
Species         object
dtype: object

Shape of the DataFrame: (150, 5)

Normalized Data:
    SepalLength  SepalWidth  PetalLength  PetalWidth      Species
0     0.222222    0.625000     0.067797    0.041667  Iris-setosa
1     0.166667    0.416