In [2]:
# Data-handling and plotting libraries
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import warnings
# Suppress every warning so the cell outputs stay uncluttered.
# NOTE(review): this also hides deprecation notices raised by the libraries — confirm that is intended.
warnings.filterwarnings("ignore")

## Load the dataset and preview the first rows

In [4]:
# Source CSV, pinned to one specific commit of the dataset repository
HR_DATA_URL = 'https://raw.githubusercontent.com/tkseneee/Dataset/dd7313e0c6487acd9ed7cd32b786782c7a1d3885/HR_data.csv'

# Download the HR records into a DataFrame and preview the first two rows
hrdata = pd.read_csv(HR_DATA_URL)
hrdata.head(2)

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Gender,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


## Check if there is any missing value.

In [6]:
# Count the missing entries in every column (isna is the pandas alias of isnull)
hrdata.isna().sum()

Age                  0
Workclass         2079
fnlwgt               0
Education            0
Education_Num        0
Martial_Status       0
Occupation        2087
Relationship         0
Race                 0
Gender               0
Capital_Gain         0
Capital_Loss         0
Hours_per_week       0
Country            656
Target               0
dtype: int64

## Keep only the numeric columns in one variable

In [11]:
# Retain only the numeric-dtype columns; text/categorical columns are left out
numeric_data = hrdata.select_dtypes(include='number')
numeric_data.head()


Unnamed: 0,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40


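## Compute the range and comment on the need for scaling in this dataset

The column ranges differ by several orders of magnitude — from 15 (`Education_Num`) up to 1,472,420 (`fnlwgt`) — so the features should be brought onto a common scale before any distance- or gradient-based method is applied.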

In [12]:
# Range of each numeric column = largest value minus smallest value
column_max = numeric_data.max()
column_min = numeric_data.min()
range_data = column_max - column_min
range_data



Age                    73
fnlwgt            1472420
Education_Num          15
Capital_Gain        99999
Capital_Loss         4356
Hours_per_week         98
dtype: int64

## Perform standard scaling on the 'Area' column without using sklearn functions

In [16]:
# Synthetic 'Area' sample for the demonstration: 955 normal draws stretched by
# 100 and shifted by 1000. pandas/numpy are already imported in the first cell.
np.random.seed(0)  # fixed seed so the same sample is drawn on every run
area_data = 1000 + 100 * np.random.randn(955)
df = pd.DataFrame({'Area': area_data})

# Column statistics that drive the scaling
mean_area = df['Area'].mean()
std_area = df['Area'].std()

# Standard scaling by hand: subtract the mean, then divide by the standard deviation
df['Area_scaled'] = df['Area'].sub(mean_area).div(std_area)

# Show the first rows of the raw column followed by its scaled counterpart
for heading, column in (
    ("Original 'Area' column:", 'Area'),
    ("\nStandard scaled 'Area' column:", 'Area_scaled'),
):
    print(heading)
    print(df[column].head())


Original 'Area' column:
0    1176.405235
1    1040.015721
2    1097.873798
3    1224.089320
4    1186.755799
Name: Area, dtype: float64

Standard scaled 'Area' column:
0    1.831160
1    0.459060
2    1.041121
3    2.310870
4    1.935289
Name: Area_scaled, dtype: float64


## What are the min, max, mean and standard deviation of standard-scaled data? Check these values for the scaled 'Area' column computed in the previous question

In [17]:
# Summarise the 'Area_scaled' column built in the previous cell. The data is
# deliberately NOT regenerated or rescaled here: repeating the seed/generate/
# scale steps would only duplicate that cell and risk the two copies drifting.
scaled_area = df['Area_scaled']

# For standard-scaled data the mean should be ~0 (up to floating-point
# round-off) and the standard deviation ~1; min and max depend on the sample.
min_value = scaled_area.min()
max_value = scaled_area.max()
mean_value = scaled_area.mean()
std_value = scaled_area.std()  # same pandas default estimator used to scale the column

print("Statistics of the standard scaled 'Area' column:")
print(f"Min: {min_value}")
print(f"Max: {max_value}")
print(f"Mean: {mean_value}")
print(f"Standard Deviation: {std_value}")

Statistics of the standard scaled 'Area' column:
Min: -3.0079743018827694
Max: 2.8324509732738603
Mean: 6.566010097468989e-16
Standard Deviation: 0.9999999999999994


## Apply StandardScaler to all the columns of the data using the sklearn function

In [18]:
from sklearn.preprocessing import StandardScaler

# Standardise every numeric column: fitting learns the per-column statistics,
# transforming applies them; both happen in one fit_transform call
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data)

# The scaler's result carries no labels (by default it is a plain array), so
# re-attach the column names AND the original row index; without the index the
# scaled frame would be renumbered from 0 and could misalign with numeric_data
scaled_df = pd.DataFrame(
    scaled_data,
    columns=numeric_data.columns,
    index=numeric_data.index,
)

# Preview the first few scaled rows
scaled_df.head()


Unnamed: 0,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week,Age_scaled
0,0.02905,-1.062152,1.134136,0.149031,-0.216957,-0.034641,0.02905
1,0.834363,-1.007377,1.134136,-0.146177,-0.216957,-2.218354,0.834363
2,-0.04416,0.243483,-0.420668,-0.146177,-0.216957,-0.034641,-0.04416
3,1.053994,0.423784,-1.19807,-0.146177,-0.216957,-0.034641,1.053994
4,-0.776263,1.403866,1.134136,-0.146177,-0.216957,-0.034641,-0.776263


## Check whether the mean and standard deviation of every scaled column are 0 and 1 respectively

In [19]:
# Full descriptive summary of the scaled columns
scaled_desc = scaled_df.describe()

# Only the 'mean' and 'std' rows matter for this check
rows_of_interest = ['mean', 'std']
mean_std_check = scaled_desc.loc[rows_of_interest]

# NOTE(review): pandas reports the sample standard deviation (ddof=1) whereas
# StandardScaler normalises with the population one (ddof=0), so 'std' is
# expected to land marginally above 1 — compare with scaled_df.std(ddof=0).
mean_std_check



Unnamed: 0,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week,Age_scaled
mean,4.5518930000000006e-17,1.279977e-16,-2.3732090000000002e-17,-6.370705e-17,-6.234537e-17,8.996797e-17,2.548282e-17
std,1.000014,1.000014,1.000014,1.000014,1.000014,1.000014,1.000014


## Invert the scaled data back to its original form

In [20]:
# Undo the standard scaling using the statistics stored in the fitted scaler
inverse_data = scaler.inverse_transform(scaled_data)

# Rebuild a labelled frame; carrying over numeric_data's row index (not just
# its columns) keeps the recovered values aligned with the original rows
inverse_df = pd.DataFrame(
    inverse_data,
    columns=numeric_data.columns,
    index=numeric_data.index,
)

# Preview the recovered values; they should match numeric_data.head()
inverse_df.head()


Unnamed: 0,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week,Age_scaled
0,39.0,77516.0,13.0,2174.0,0.0,40.0,0.02905
1,50.0,83311.0,13.0,0.0,0.0,13.0,0.834352
2,38.0,215646.0,9.0,0.0,0.0,40.0,-0.04416
3,53.0,234721.0,7.0,0.0,0.0,40.0,1.05398
4,28.0,338409.0,13.0,0.0,0.0,40.0,-0.776253


## Apply min-max scaling to the original numeric data and print the minimum and maximum values of every column

In [21]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every numeric column with a default MinMaxScaler
min_max_scaler = MinMaxScaler()
min_max_scaled_data = min_max_scaler.fit_transform(numeric_data)

# Re-attach the column names and the original row index so the scaled frame
# stays aligned with numeric_data instead of being renumbered from 0
min_max_scaled_df = pd.DataFrame(
    min_max_scaled_data,
    columns=numeric_data.columns,
    index=numeric_data.index,
)

# Descriptive statistics of the scaled columns
min_max_desc = min_max_scaled_df.describe()

# Keep only the extremes: after min-max scaling they should be 0 and 1
min_max_min_max = min_max_desc.loc[['min', 'max']]
min_max_min_max


Unnamed: 0,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week,Age_scaled
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Apply Robust Scaler to scale the data

In [22]:
from sklearn.preprocessing import RobustScaler

# Scale every numeric column with a default RobustScaler
robust_scaler = RobustScaler()
robust_scaled_data = robust_scaler.fit_transform(numeric_data)

# Re-attach the column names and the original row index so the scaled frame
# stays aligned with numeric_data instead of being renumbered from 0
robust_scaled_df = pd.DataFrame(
    robust_scaled_data,
    columns=numeric_data.columns,
    index=numeric_data.index,
)

# Preview the first few robust-scaled rows
robust_scaled_df.head()


Unnamed: 0,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week,Age_scaled
0,0.1,-0.841482,1.0,2174.0,0.0,0.0,0.1
1,0.65,-0.793073,1.0,0.0,0.0,-5.4,0.65
2,0.05,0.312399,-0.333333,0.0,0.0,0.0,0.05
3,0.8,0.471744,-1.0,0.0,0.0,0.0,0.8
4,-0.45,1.337911,1.0,0.0,0.0,0.0,-0.45
