In [None]:
import numpy as np               # For numerical operations
import pandas as pd              # For data manipulation and analysis
import matplotlib.pyplot as plt   # For data visualization
import seaborn as sns             # For advanced visualization

In [None]:
from google.colab import files
uploaded = files.upload()  # This will prompt to upload a file

import pandas as pd
df = pd.read_csv('grip_strength_data.csv')


Saving grip_strength_data.csv to grip_strength_data (1).csv


In [None]:
df.head(10)

Unnamed: 0,Height,Weight,Age,Grip strength,Frailty
0,65.8,112,30,30,N
1,71.5,136,19,31,N
2,69.4,153,45,29,N
3,68.2,142,22,28,Y
4,67.8,144,29,24,Y
5,68.7,123,50,26,N
6,69.8,141,51,22,Y
7,70.1,136,23,20,Y
8,67.9,112,17,19,N
9,66.8,120,39,31,N


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Height         10 non-null     float64
 1   Weight         10 non-null     int64  
 2   Age            10 non-null     int64  
 3   Grip strength  10 non-null     int64  
 4   Frailty        10 non-null     object 
dtypes: float64(1), int64(3), object(1)
memory usage: 528.0+ bytes


In [None]:
df.describe()

Unnamed: 0,Height,Weight,Age,Grip strength
count,10.0,10.0,10.0,10.0
mean,68.6,131.9,32.5,26.0
std,1.670662,14.231811,12.860361,4.521553
min,65.8,112.0,17.0,19.0
25%,67.825,120.75,22.25,22.5
50%,68.45,136.0,29.5,27.0
75%,69.7,141.75,43.5,29.75
max,71.5,153.0,51.0,31.0


**Data Preprocessing**

**1. Checking for Missing Values**

In [None]:
df.isnull().sum()  # Shows the count of missing values for each column

Unnamed: 0,0
Height,0
Weight,0
Age,0
Grip strength,0
Frailty,0


The data does not have any missing values.

**2. Handling Categorical Data**

In [None]:
# Applying one-hot encoding to the 'Frailty' column, getting binary 0s and 1s
df_encoded = pd.get_dummies(df, columns=['Frailty'], drop_first=True)

# Convert any boolean values (True/False) to integers (0/1)
df_encoded = df_encoded.astype(int)

In [None]:
df_encoded.head()

Unnamed: 0,Height,Weight,Age,Grip strength,Frailty_Y
0,65,112,30,30,0
1,71,136,19,31,0
2,69,153,45,29,0
3,68,142,22,28,1
4,67,144,29,24,1


Here I have converted the values of Frailty column from non-numeric data to numeric data.

**3. Standardization**

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_encoded), columns=df_encoded.columns)

**4. Checking for All Duplicates**

In [None]:
# Checking for duplicate rows
duplicates = df_scaled.duplicated()
print(duplicates)

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
dtype: bool


There are no duplicate values in the data.

**5. Handling Outliers**

In [None]:
from scipy import stats
z_scores = np.abs(stats.zscore(df_scaled.select_dtypes(include=np.number)))
df_cleaned = df_scaled[(z_scores < 3).all(axis=1)]  # Removing rows with z-scores > 3

In [None]:
df_cleaned.head(10)

Unnamed: 0,Height,Weight,Age,Grip strength,Frailty_Y
0,-1.732051,-1.473912,-0.204911,0.932505,-0.816497
1,1.732051,0.30367,-1.10652,1.165631,-0.816497
2,0.57735,1.562791,1.024556,0.699379,-0.816497
3,0.0,0.748066,-0.860627,0.466252,1.224745
4,-0.57735,0.896198,-0.286876,-0.466252,1.224745
5,0.0,-0.659187,1.434378,0.0,-0.816497
6,0.57735,0.674,1.516343,-0.932505,1.224745
7,1.154701,0.30367,-0.778662,-1.398757,1.224745
8,-0.57735,-1.473912,-1.270449,-1.631883,-0.816497
9,-1.154701,-0.881385,0.532769,1.165631,-0.816497
