In [1]:
#Normalization: Normalization is a data preprocessing technique used for transforming numerical features so that they fall within a range (0, 1).


# | Technique           | Range Target     | Use Case                          |
# | ------------------- | ---------------- | --------------------------------- |
# | **Standardization** | Mean = 0, SD = 1 | When data is normally distributed |
# | **Normalization**   | Range \[0, 1]    | When you need bounded values      |

In [7]:
import numpy as np
import pandas as pd

In [9]:
df=pd.read_csv("covid_toy.csv")

In [11]:
df.head(2)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes


In [13]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [19]:
average_fever=df["fever"].mean()
average_fever

100.84444444444445

In [21]:
df["fever"]=df["fever"].fillna(average_fever)

In [23]:
df.isnull().sum()

age          0
gender       0
fever        0
cough        0
city         0
has_covid    0
dtype: int64

In [21]:
#>>>>>>>another_method_today>>>>>>>
#from sklearn.impute import SimpleImputer   #This line imports the SimpleImputer class from the sklearn.impute module in scikit-learn.

# When your dataset has NaN (Not a Number) or None values, most machine learning algorithms will throw an error.
# SimpleImputer replaces these missing values with a specific value or statistic such as:
# Mean → average of the column
# Median → middle value of the column
# Most Frequent → mode (most common value)
# Constant → a fixed value you choose
####by default it replaces tye value with mean only

In [25]:
from sklearn.impute import SimpleImputer

In [27]:
s1=SimpleImputer(strategy="mean")

In [29]:
df["fever"]=s1.fit_transform(df[["fever"]])

In [31]:
df.isnull().sum()    #no missing values now

age          0
gender       0
fever        0
cough        0
city         0
has_covid    0
dtype: int64

In [33]:
df.head(2)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes


In [35]:
#now we have to apply normalization for that we have to convert our data into numerical format
#Emcoding: encoding is a technique used to convert categorical data into numerical data because machine only understand numbers

In [37]:
# from sklearn.preprocessing import LabelEncoder: importing the LabelEncoder class from the sklearn.preprocessing module in scikit-learn so you can 
# use it to convert categorical labels (like text) into numerical form for machine learning.
# It’s like saying:
# “Hey scikit-learn, give me the tool that changes text categories into numbers.


In [39]:
# Exactly what LabelEncoder does:
# Finds all unique values in the data.
# Sorts those values (alphabetical for strings, numerical for numbers).
# Assigns an integer index starting from 0 to each sorted value.
# Replaces every original value with its assigned index.

                    
# Sorting is done in ascending order — alphabetical for strings and numerical for numbers.
# For strings: sorting is based on lexicographic (dictionary(the book that contains words and not datatype)) order, not length.
# Example: ["bat", "apple", "car"] → ["apple", "bat", "car"] (even though "bat" and "car" have same length, order is by alphabet).
# For numbers: sorting is by numeric value in ascending order.

In [35]:
from sklearn.preprocessing import LabelEncoder

In [37]:
labelEncoderObject=LabelEncoder()

In [39]:
df["gender"]=labelEncoderObject.fit_transform(df["gender"])
df["cough"]=labelEncoderObject.fit_transform(df["cough"])
df["city"]=labelEncoderObject.fit_transform(df["city"])
df["has_covid"]=labelEncoderObject.fit_transform(df["has_covid"])

In [43]:
df.head(2)   #data is numerical now

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,1,103.0,0,2,0
1,27,1,100.0,0,1,1


In [45]:
#Data is splitted into independent and dependent
x=df.drop("has_covid",axis=1)   # another way of dropping column: x = df.drop(columns=["has_covid"])

y=df["has_covid"]

In [47]:
from sklearn.model_selection import train_test_split

In [49]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [51]:
np.round(x_train.describe(),2)   #minimumn number and maximum number of each input column is different

Unnamed: 0,age,gender,fever,cough,city
count,80.0,80.0,80.0,80.0,80.0
mean,42.91,0.41,100.98,0.4,1.3
std,24.47,0.5,1.93,0.49,1.12
min,5.0,0.0,98.0,0.0,0.0
25%,20.0,0.0,100.0,0.0,0.0
50%,42.0,0.0,101.0,0.0,1.0
75%,65.0,1.0,102.0,1.0,2.0
max,84.0,1.0,104.0,1.0,3.0


In [53]:
#MinMaxScaler will take each existing column and rescale all its values so that:
# The smallest value in that column → 0
# The largest value in that column → 1

from sklearn.preprocessing import MinMaxScaler  


In [59]:
mn=MinMaxScaler()

In [63]:
x_train_mn=mn.fit_transform(x_train)

In [65]:
x_train_new=pd.DataFrame(x_train_mn,columns=x.columns)

In [67]:
np.round(x_train_new.describe(),2)  # now the min is same and max is also same

Unnamed: 0,age,gender,fever,cough,city
count,80.0,80.0,80.0,80.0,80.0
mean,0.48,0.41,0.5,0.4,0.43
std,0.31,0.5,0.32,0.49,0.37
min,0.0,0.0,0.0,0.0,0.0
25%,0.19,0.0,0.33,0.0,0.0
50%,0.47,0.0,0.5,0.0,0.33
75%,0.76,1.0,0.67,1.0,0.67
max,1.0,1.0,1.0,1.0,1.0


In [69]:
import numpy as np
import pandas as pd

In [73]:
df1=pd.read_csv("placement.csv")
df1.head(2)

Unnamed: 0,cgpa,resume_score,placed
0,8.14,6.52,1
1,6.17,5.17,0


In [75]:
df1.isnull().sum()

cgpa            0
resume_score    0
placed          0
dtype: int64

In [77]:
#since it does not have any missing values,i will still fill it using SimpleImputer for best practice

In [85]:
from sklearn.impute import SimpleImputer
s2=SimpleImputer(strategy="mean")
df1["cgpa"]=s2.fit_transform(df1[["cgpa"]])
df1["resume_score"]=s2.fit_transform(df1[["resume_score"]])
df1["placed"]=s2.fit_transform(df1[["placed"]])

In [87]:
df1.isnull().sum()

cgpa            0
resume_score    0
placed          0
dtype: int64

In [91]:
 df1.head(2)

Unnamed: 0,cgpa,resume_score,placed
0,8.14,6.52,1.0
1,6.17,5.17,0.0


In [93]:
#lets perform encoding..
from sklearn.preprocessing import LabelEncoder

In [95]:
leObject=LabelEncoder()

In [103]:
df1["cgpa"]=leObject.fit_transform(df1["cgpa"])
df1["resume_score"]=leObject.fit_transform(df1["resume_score"])
df1["placed"]=leObject.fit_transform(df1["placed"])

In [116]:
df1.shape

(100, 3)

In [113]:
df1.head(2)

Unnamed: 0,cgpa,resume_score,placed
0,70,33,1
1,30,4,0


In [130]:
#Data splitting into dependent and independent
X=df1.drop("placed",axis=1)
Y=df1["placed"]

In [126]:
print(X)  #so X has independent data here

    cgpa  resume_score
0     70            33
1     30             4
2     76            91
3     45            56
4     56            58
..   ...           ...
95    35            28
96    74            73
97    42            74
98    70            11
99    26            36

[100 rows x 2 columns]


In [132]:
print(Y)    #Y has dependent data here

0     1
1     0
2     1
3     1
4     1
     ..
95    0
96    1
97    0
98    1
99    0
Name: placed, Length: 100, dtype: int64


In [134]:
from sklearn.model_selection import train_test_split

In [156]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [140]:
# X → Independent variables (features) → inputs to the model.
# Y → Dependent variable (target) → output we want to predict.
# When we do train_test_split:

# x is split into:
# x_train → features used for training
# x_test → features used for testing

# y is split into:
# y_train → labels corresponding to x_train
# y_test → labels corresponding to x_test

In [158]:
np.round(X_train.describe(),2)  #maximum and minimum value is not same

Unnamed: 0,cgpa,resume_score
count,80.0,80.0
mean,43.31,48.49
std,26.08,27.25
min,0.0,0.0
25%,21.75,24.75
50%,41.5,51.5
75%,67.25,70.75
max,87.0,92.0


In [160]:
from sklearn.preprocessing import MinMaxScaler

In [162]:
Mn=MinMaxScaler()

In [172]:
New_Mn_X=Mn.fit_transform(X_train)

In [174]:
New_Mn_df=pd.DataFrame(New_Mn_X,columns=X.columns)

In [176]:
np.round(New_Mn_df.describe(),2)

Unnamed: 0,cgpa,resume_score
count,80.0,80.0
mean,0.5,0.53
std,0.3,0.3
min,0.0,0.0
25%,0.25,0.27
50%,0.48,0.56
75%,0.77,0.77
max,1.0,1.0
