<a href="https://colab.research.google.com/github/shashithenuwara/FDM-MINI_project/blob/main/FDM_MINI_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

**Read the Dataset**

In [None]:
# Load the sleep dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv("Sleep_Data_Sampled.csv")

In [None]:
# Preview the first 20 rows of the raw data
df.head(20)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Fatigue Score,Sleep Disorder
0,1,Male,35,Doctor,6.65,7,50,7,Under Weight,120/80,71,7100,7.52,Healthy
1,2,Male,42,Teacher,6.9,8,52,4,Normal,135/90,66,7000,7.54,Healthy
2,3,Male,34,Software Engineer,6.95,7,66,6,Overweight,126/83,74,6100,9.5,Healthy
3,4,Male,32,Doctor,6.9,6,52,7,Normal,120/80,71,6500,7.54,Healthy
4,5,Male,37,Lawyer,6.85,7,60,6,Normal,125/80,71,6500,8.76,Healthy
5,6,Male,33,Doctor,6.9,7,50,6,Under Weight,120/80,69,7500,7.25,Healthy
6,7,Male,44,Engineer,7.85,8,45,4,Normal,125/80,65,5000,5.73,Healthy
7,8,Male,40,Engineer,8.05,8,52,4,Normal,120/80,68,6500,6.46,Healthy
8,9,Male,36,Lawyer,6.85,7,60,6,Normal,130/85,71,6500,8.76,Healthy
9,10,Male,40,Engineer,7.15,7,58,6,Under Weight,130/85,70,6500,8.11,Healthy


**1.Data Cleaning**

**Handle Missing Values** (Using a Global Constant)

In [None]:
# Global-constant imputation: every numeric column gets 0 in place of a
# missing value. Non-numeric columns are not part of the fill mapping.
numeric_cols = df.select_dtypes(include='number').columns
zero_fill = dict.fromkeys(numeric_cols, 0)
df.fillna(value=zero_fill, inplace=True)

# Count the remaining missing values per column as a sanity check
df.isnull().sum()


Unnamed: 0,0
Person ID,0
Gender,0
Age,0
Occupation,0
Sleep Duration,0
Quality of Sleep,0
Physical Activity Level,0
Stress Level,0
BMI Category,0
Blood Pressure,0


In [None]:
# Preview the first 20 rows after filling missing numeric values
df.head(20)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Fatigue Score,Sleep Disorder
0,1,Male,35,Doctor,6.65,7,50,7,Under Weight,120/80,71,7100,7.52,Healthy
1,2,Male,42,Teacher,6.9,8,52,4,Normal,135/90,66,7000,7.54,Healthy
2,3,Male,34,Software Engineer,6.95,7,66,6,Overweight,126/83,74,6100,9.5,Healthy
3,4,Male,32,Doctor,6.9,6,52,7,Normal,120/80,71,6500,7.54,Healthy
4,5,Male,37,Lawyer,6.85,7,60,6,Normal,125/80,71,6500,8.76,Healthy
5,6,Male,33,Doctor,6.9,7,50,6,Under Weight,120/80,69,7500,7.25,Healthy
6,7,Male,44,Engineer,7.85,8,45,4,Normal,125/80,65,5000,5.73,Healthy
7,8,Male,40,Engineer,8.05,8,52,4,Normal,120/80,68,6500,6.46,Healthy
8,9,Male,36,Lawyer,6.85,7,60,6,Normal,130/85,71,6500,8.76,Healthy
9,10,Male,40,Engineer,7.15,7,58,6,Under Weight,130/85,70,6500,8.11,Healthy


**Handle Noisy Data**
(Replace negative values in the Fatigue Score column with the mean value)

In [None]:
fatigue = df['Fatigue Score']
is_negative = fatigue < 0

# The replacement value is the mean over the valid (non-negative) scores only
mean_fatigue_score = fatigue[fatigue >= 0].mean()

# Overwrite only the negative entries; every other entry is kept unchanged
df['Fatigue Score'] = fatigue.mask(is_negative, mean_fatigue_score)

# Summary statistics to confirm that no negative values remain
df['Fatigue Score'].describe()


Unnamed: 0,Fatigue Score
count,15000.0
mean,8.004071
std,3.150786
min,0.0
25%,6.82
50%,7.73
75%,9.93
max,15.25


**Handle Non-Numerical Values**

In [None]:
def convert_blood_pressure(bp):
    """Collapse a ``"systolic/diastolic"`` reading into a single number.

    Args:
        bp: Blood-pressure reading such as ``"120/80"``.

    Returns:
        The arithmetic mean of the two integer parts as a float, or ``None``
        when ``bp`` is not a string made of exactly two ``/``-separated
        integers (for example ``None``, ``NaN``, ``"120"`` or ``"abc/def"``).
    """
    try:
        systolic, diastolic = map(int, bp.split('/'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: bp has no .split (None, NaN, plain numbers).
        # TypeError: .split rejected the str separator (e.g. bytes input).
        # ValueError: wrong number of parts, or a part is not an integer.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
        return None
    return (systolic + diastolic) / 2

In [None]:
# Replace each "systolic/diastolic" string with the single number returned by
# convert_blood_pressure (entries it cannot parse come back as None)
df['Blood Pressure'] = df['Blood Pressure'].apply(convert_blood_pressure)

In [None]:
# Preview the first 55 rows to inspect the converted Blood Pressure column
df.head(55)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Fatigue Score,Sleep Disorder
0,1,Male,35,Doctor,6.65,7,50,7,Under Weight,100.0,71,7100,7.52,Healthy
1,2,Male,42,Teacher,6.9,8,52,4,Normal,112.5,66,7000,7.54,Healthy
2,3,Male,34,Software Engineer,6.95,7,66,6,Overweight,104.5,74,6100,9.5,Healthy
3,4,Male,32,Doctor,6.9,6,52,7,Normal,100.0,71,6500,7.54,Healthy
4,5,Male,37,Lawyer,6.85,7,60,6,Normal,102.5,71,6500,8.76,Healthy
5,6,Male,33,Doctor,6.9,7,50,6,Under Weight,100.0,69,7500,7.25,Healthy
6,7,Male,44,Engineer,7.85,8,45,4,Normal,102.5,65,5000,5.73,Healthy
7,8,Male,40,Engineer,8.05,8,52,4,Normal,100.0,68,6500,6.46,Healthy
8,9,Male,36,Lawyer,6.85,7,60,6,Normal,107.5,71,6500,8.76,Healthy
9,10,Male,40,Engineer,7.15,7,58,6,Under Weight,107.5,70,6500,8.11,Healthy


**2.Data Transformation**

In [None]:
# Columns that are passed to the scaler for normalization
numeric_columns = [
    'Sleep Duration',
    'Quality of Sleep',
    'Physical Activity Level',
    'Stress Level',
    'Heart Rate',
    'Daily Steps',
    'Fatigue Score',
]

**Get Numeric Values to 0-10 Scale**

In [None]:
# Initialize the MinMaxScaler; feature_range=(0, 10) maps each fitted column's
# minimum to 0 and its maximum to 10
scaler = MinMaxScaler(feature_range=(0, 10))



In [None]:
# Fit the scaler on the selected columns and rescale them in one step.
# NOTE(review): the scaler is fitted on the full DataFrame, i.e. before the
# train/test split performed later in this notebook, so test rows influence
# the fitted min/max.
scaled_values = scaler.fit_transform(df[numeric_columns])

# Keep two decimals so the rescaled table stays readable
df[numeric_columns] = scaled_values.round(2)


In [None]:
# View the first 55 rows of the normalized data
df.head(55)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Fatigue Score,Sleep Disorder
0,1,Male,35,Doctor,3.15,6.0,3.33,8.0,Under Weight,100.0,2.86,5.86,4.93,Healthy
1,2,Male,42,Teacher,4.07,8.0,3.67,2.0,Normal,112.5,0.48,5.71,4.94,Healthy
2,3,Male,34,Software Engineer,4.26,6.0,6.0,6.0,Overweight,104.5,4.29,4.43,6.23,Healthy
3,4,Male,32,Doctor,4.07,4.0,3.67,8.0,Normal,100.0,2.86,5.0,4.94,Healthy
4,5,Male,37,Lawyer,3.89,6.0,5.0,6.0,Normal,102.5,2.86,5.0,5.74,Healthy
5,6,Male,33,Doctor,4.07,6.0,3.33,6.0,Under Weight,100.0,1.9,6.43,4.75,Healthy
6,7,Male,44,Engineer,7.59,8.0,2.5,2.0,Normal,102.5,0.0,2.86,3.76,Healthy
7,8,Male,40,Engineer,8.33,8.0,3.67,2.0,Normal,100.0,1.43,5.0,4.24,Healthy
8,9,Male,36,Lawyer,3.89,6.0,5.0,6.0,Normal,107.5,2.86,5.0,5.74,Healthy
9,10,Male,40,Engineer,5.0,6.0,4.67,6.0,Under Weight,107.5,2.38,5.0,5.32,Healthy


**3.Data Mining**

**Encode Categorical Variables using Label Encoding and One-Hot Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder instance is re-fitted for each categorical column in turn, so
# after the loop it holds the mapping of the last column only ('BMI Category').
label_encoder = LabelEncoder()

for column in ('Gender', 'Occupation', 'BMI Category'):
    # Integer codes go into a new 'Encoded_Column_<name>' column; the original
    # text column is left in place.
    df['Encoded_Column_' + column] = label_encoder.fit_transform(df[column])

# 'Sleep Disorder' is not encoded here; it keeps its string labels.


In [None]:
# One-hot encode the categorical feature columns in a single pass. Each listed
# column is replaced by one indicator column per category, appended in the
# order given here. 'Sleep Disorder' is not expanded.
one_hot_columns = ['Gender', 'Occupation', 'BMI Category']
df = pd.get_dummies(df, columns=one_hot_columns)

In [None]:
# Preview the first 5 rows after encoding the categorical columns
df.head(5)

Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Blood Pressure,Heart Rate,Daily Steps,Fatigue Score,...,Occupation_Nurse,Occupation_Sales Representative,Occupation_Salesperson,Occupation_Scientist,Occupation_Software Engineer,Occupation_Teacher,BMI Category_Normal,BMI Category_Obese,BMI Category_Overweight,BMI Category_Under Weight
0,1,35,3.15,6.0,3.33,8.0,100.0,2.86,5.86,4.93,...,False,False,False,False,False,False,False,False,False,True
1,2,42,4.07,8.0,3.67,2.0,112.5,0.48,5.71,4.94,...,False,False,False,False,False,True,True,False,False,False
2,3,34,4.26,6.0,6.0,6.0,104.5,4.29,4.43,6.23,...,False,False,False,False,True,False,False,False,True,False
3,4,32,4.07,4.0,3.67,8.0,100.0,2.86,5.0,4.94,...,False,False,False,False,False,False,True,False,False,False
4,5,37,3.89,6.0,5.0,6.0,102.5,2.86,5.0,5.74,...,False,False,False,False,False,False,True,False,False,False


**Split dataset into Features(X) and Target(y)**

In [None]:
# Define the feature matrix (X) and the target vector (y).
# 'Person ID' is a row identifier, not a measurement. Leaving it in X would let
# the classifier split on row identity instead of on the sleep/health
# features, which inflates the evaluation scores, so it is dropped together
# with the target column.
target_column = 'Sleep Disorder'
identifier_columns = ['Person ID']

X = df.drop(columns=[target_column, *identifier_columns])
y = df[target_column]


**Train-Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets: 20% of the rows are held out
# for testing, and random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


**Train Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree model; only random_state is set (for
# reproducibility), every other hyperparameter keeps its library default
clf = DecisionTreeClassifier(random_state=42)

# Train the model on the training split only
clf.fit(X_train, y_train)




**4.Evaluation**

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Label the held-out rows with the fitted tree
y_pred = clf.predict(X_test)

# Overall accuracy, plus F1 averaged over the classes with
# average='weighted'
accuracy, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')

Accuracy: 0.9973333333333333
F1 Score: 0.997332879705269


In [None]:
from sklearn.metrics import confusion_matrix

# Compute the confusion matrix of true vs. predicted labels on the test set
# (per scikit-learn, rows are the true classes and columns the predicted ones)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[1022    4    0]
 [   1  993    3]
 [   0    0  977]]


In [None]:
from sklearn.metrics import classification_report

# Generate the per-class precision / recall / F1 / support summary for the
# test-set predictions and print it as text
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

     Healthy       1.00      1.00      1.00      1026
    Insomnia       1.00      1.00      1.00       997
 Sleep Apnea       1.00      1.00      1.00       977

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000

