In [None]:
import pandas as pd

df = pd.read_csv("assets/Iris.csv")

In [2]:
df.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [None]:
X = df.drop(columns=["Id", "Species"])
y = df["Species"]

Stratification is used to make sure that all split has equal distribution of data


In [None]:
from sklearn.model_selection import train_test_split

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123, shuffle=True, stratify=y
)

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=123, shuffle=True, stratify=y_temp
)

In [6]:
y_temp

99     Iris-versicolor
35         Iris-setosa
87     Iris-versicolor
5          Iris-setosa
10         Iris-setosa
            ...       
12         Iris-setosa
142     Iris-virginica
120     Iris-virginica
20         Iris-setosa
61     Iris-versicolor
Name: Species, Length: 120, dtype: object

In [None]:
import numpy as np

str_list = y_temp
unique_strings, inverse = np.unique(str_list, return_inverse=True)
counts = np.bincount(inverse).tolist()
result = dict(zip(unique_strings, counts))

print(result)

{'Iris-setosa': 40, 'Iris-versicolor': 40, 'Iris-virginica': 40}


In [8]:
counts

[40, 40, 40]

### Normalization


##### Min - Max Scaling

The values will be between 0 and 1

$$
x_{norm}^{[i]} = \frac{x^{[i]} - x_{min}}{x_{max} - x_{min}}
$$


In [30]:
x = np.arange(6).astype(float)
x

array([0., 1., 2., 3., 4., 5.])

In [31]:
x_norm = (x - x.min()) / (x.max() - x.min())
x_norm

array([0. , 0.2, 0.4, 0.6, 0.8, 1. ])

##### Standardization

$$
x_{std}^{[i]} = \frac{x^{[i]} - \mu_{x}}{\sigma_{x}}\\
$$

$\text{where,}\\$

- $\mu_{x} \rightarrow \text{feature mean}\\$
- $\sigma_{x} \rightarrow \text{standard deviation}\\$


In [32]:
x = np.arange(6).astype(float)
x

array([0., 1., 2., 3., 4., 5.])

In [33]:
x_std = (x - x.mean()) / x.std()
x_std

array([-1.46385011, -0.87831007, -0.29277002,  0.29277002,  0.87831007,
        1.46385011])

$\text{Population Standard Deviation }(\sigma) = \sqrt{\frac{\sum | x_{[i]}-\mu |^2}{\text{N}}}\\$

- We have the entire data

$\text{where,}\\$

- $x_{[i]} \rightarrow \text{single data point}\\$
- $\mu \rightarrow \text{mean of all the values}\\$
- $\text{N} \rightarrow \text{Total number of values}\\$

$\text{Sample Standard Deviation }(S) = \sqrt{\frac{\sum | x-\bar{x} |^2}{\text{N-1}}}\\$

- We have sample data out of entire data

$\text{where,}\\$

- $x_{[i]} \rightarrow \text{single data point}\\$
- $\bar{x} \rightarrow \text{mean of all the values}\\$
- $\text{N} \rightarrow \text{Total number of values}\\$


Standard deviation tells us how far the numbers are related to each other, the more spreadout, the greater the standard deviation.

4,5,6 VS 3,5,7 --> 3,5,7 has greater standard deviation


In [None]:
print(x.std(ddof=0))  # Population standard deviation
# implementation
a = 0
for i in x:
    a += (i - x.mean()) ** 2
a = a / (len(x))
print(a**0.5)

1.707825127659933
1.707825127659933


In [None]:
print(x.std(ddof=1))  # Sample standard deviation
# implementation
a = 0
for i in x:
    a += (i - x.mean()) ** 2
a = a / (len(x) - 1)
print(a**0.5)

1.8708286933869707
1.8708286933869707


Scaling Validation and Test Sets


In [13]:
mu, sigma = X_train.mean(axis=0), X_train.std(axis=0)

We didn't calculate the mu and sigma for test and validation set, because they are unknown


In [17]:
X_train_std = (X_train - mu) / sigma
X_valid_std = (X_valid - mu) / sigma
X_test_std = (X_test - mu) / sigma

Given three training examples:

- example 1: 10 cm -> Class 2
- example 2: 20 cm -> Class 2
- example 3: 30 cm -> Class 1

Estimate:

- Mean: 20 cm
- SD: 8.2 cm

Standardize:

- example 1: -1.21 -> Class 2 $(\frac{10-20}{8.2} \rightarrow -1.2)$
- example 2: 0.00 -> Class 2
- example 3: 1.21 -> Class 1


If we have classification rule:

$$
h(z) = \left\{
\begin{array}{}
\text{class 2} & \text{if } z \le 0.6 \\
\text{class 1} & Otherwise \\
\end{array}
\right.
$$

Given three training examples:

- example 1: 10 cm -> Class 2
- example 2: 20 cm -> Class 2
- example 3: 30 cm -> Class 1

Estimate:

- Mean: 20 cm
- SD: 8.2 cm

Standardize (Z Scores):

- example 1: -1.21 -> Class 2 $(<0.6)$
- example 2: 0.00 -> Class 2 $(<0.6)$
- example 3: 1.21 -> Class 1 $(>0.6)$


In [None]:
# SKLearn's standardization and min max scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_valid_std = scaler.transform(
    X_valid
)  # we are using the same scaler fitted on the training set, not a new one for each set
X_test_std = scaler.transform(X_test)

In [19]:
X_train_std

array([[ 1.08131314, -1.32387424,  1.17017883,  0.78318281],
       [ 2.59673011,  1.72445612,  1.51636425,  1.04699176],
       [-1.82323603, -0.38592644, -1.36851423, -1.32728877],
       [ 0.82874365, -0.15143949,  0.82399341,  1.04699176],
       [-0.30781907, -0.15143949,  0.18932015,  0.12366044],
       [ 1.33388264,  0.08304746,  0.93938855,  1.17889624],
       [ 0.7024589 , -0.85490034,  0.88169098,  0.91508729],
       [ 0.57617416,  0.78650831,  1.05478369,  1.57460966],
       [ 1.96530637, -0.62041339,  1.34327154,  0.91508729],
       [-0.18153432,  3.13137782, -1.31081666, -1.06347982],
       [-0.56038856,  0.78650831, -1.19542152, -1.32728877],
       [ 0.7024589 , -0.62041339,  1.05478369,  1.17889624],
       [ 0.32360466, -0.62041339,  0.53550557, -0.00824403],
       [-0.30781907, -0.15143949,  0.42011043,  0.38746939],
       [-0.30781907, -1.32387424,  0.07392501, -0.1401485 ],
       [-0.05524958, -0.85490034,  0.18932015, -0.27205298],
       [ 0.44988941, -0.

Working with Categorical data


In [3]:
import pandas as pd
data = {
    "color": ["green", "red", "blue"],
    "size": ["M", "L", "XXL"],
    "price": [10.1, 13.5, 15.3],
    "classlabel": ["class1", "class2", "class1"],
}

df = pd.DataFrame(data)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XXL,15.3,class1


Here color and size are categorical and price is continuous variables

- M < L < XXL --> ordinal variable, computer can't understand M is smaller than L, so we have to give some guidance


In [4]:
mapping_dict = {
    "M": 2,
    "L": 3,
    "XXL": 5,
}
df["size"] = df["size"].map(mapping_dict)
df

Unnamed: 0,color,size,price,classlabel
0,green,2,10.1,class1
1,red,3,13.5,class2
2,blue,5,15.3,class1


Nominal data involves categories with no inherent order or ranking, while ordinal data involves categories with a meaningful order or ranking

Nominal Data (Class Labels)

for nominal data, we can use label endoders

In [5]:
from sklearn.preprocessing import LabelEncoder

le= LabelEncoder()
df["classlabel"] = le.fit_transform(df["classlabel"])
df

Unnamed: 0,color,size,price,classlabel
0,green,2,10.1,0
1,red,3,13.5,1
2,blue,5,15.3,0


In color column, we cant say one color is greater than the other, ex: (green< red< blue). In some applications there will be some colors are more impertant, but in general, they all carry same weights.

If we use label encoder it may alphabetically assign numbers based on letters. ex: blue = 0, green =1, red = 2, hence (blue < green < red)

To avoid this issue, we use a technique called one hot encoding, how it works is, that it creates a new feature column for each class and uses the values 0s and 1s to determine whether the color is present or not.

In [6]:
pd.get_dummies(df)

Unnamed: 0,size,price,classlabel,color_blue,color_green,color_red
0,2,10.1,0,False,True,False
1,3,13.5,1,False,False,True
2,5,15.3,0,True,False,False


We can assume if the color is not blue or green, it must be red so one column is unnecessary so we can drop one column

In [7]:
pd.get_dummies(df, drop_first=True)

Unnamed: 0,size,price,classlabel,color_green,color_red
0,2,10.1,0,True,False
1,3,13.5,1,False,True
2,5,15.3,0,False,False


Dealing with Missing Data

In [9]:
x = [
    {"A": 1, "B": 2, "C": 3, 'D': 4},
    {"A": 5, "B": 6, "C": None, 'D': 8},
    {"A": 10, "B": 11, "C": 12, 'D': None},
]
df2 = pd.DataFrame(x).astype(float)
df2

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


Naive Ways
- If we do this method, if sometimes the column is required for prediction but we omitted already from training, it may cause wrong prediction
- better to remove rows
- but if we have millions of columns, we can remove one column for example if an image containing one pixel is damaged, we can remove them

In [10]:
df2.dropna(axis=0, how="any")  # drop rows with any NaN values

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [11]:
df2.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


Assigning the mean of columns


In [12]:
import numpy as np
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan ,strategy="mean")
X = df2.values
X = imputer.fit_transform(X)
X

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

Alternate way (predicting data):
- We can actually predict the missing values by considering all the other columns as features and the column with Nan as target
- We can use a regression model