<a href="https://colab.research.google.com/github/shashithenuwara/FDM-MINI_project/blob/main/Support_Vector_Machine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [66]:
df_before = pd.read_csv("Sleep_Data_Sampled.csv")

In [67]:
columnsToDrop = ["Person ID","Sleep Duration","Daily Steps"]
df = df_before.drop(columns=columnsToDrop,axis = 1)

In [68]:
df.head(20)

Unnamed: 0,Gender,Age,Occupation,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Fatigue Score,Sleep Disorder
0,Male,35,Doctor,7,50,7,Under Weight,120/80,71,7.52,Healthy
1,Male,42,Teacher,8,52,4,Normal,135/90,66,7.54,Healthy
2,Male,34,Software Engineer,7,66,6,Overweight,126/83,74,9.5,Healthy
3,Male,32,Doctor,6,52,7,Normal,120/80,71,7.54,Healthy
4,Male,37,Lawyer,7,60,6,Normal,125/80,71,8.76,Healthy
5,Male,33,Doctor,7,50,6,Under Weight,120/80,69,7.25,Healthy
6,Male,44,Engineer,8,45,4,Normal,125/80,65,5.73,Healthy
7,Male,40,Engineer,8,52,4,Normal,120/80,68,6.46,Healthy
8,Male,36,Lawyer,7,60,6,Normal,130/85,71,8.76,Healthy
9,Male,40,Engineer,7,58,6,Under Weight,130/85,70,8.11,Healthy


**1.Data Cleaning**

**Handle Missing Values**(Using Global Constant)

In [69]:
# Replace missing values in numeric columns with 0
df_before.fillna({col: 0 for col in df_before.select_dtypes(include='number').columns}, inplace=True)

# Verify if missing values are handled
df_before.isnull().sum()


Unnamed: 0,0
Person ID,0
Gender,0
Age,0
Occupation,0
Sleep Duration,0
Quality of Sleep,0
Physical Activity Level,0
Stress Level,0
BMI Category,0
Blood Pressure,0


**Handle Noisy Data (Replace minus valuse in Fatigue Score coloumn with mean value)**

In [70]:
# Calculate the mean of valid (non-negative) Fatigue Score values
mean_fatigue_score = df[df['Fatigue Score'] >= 0]['Fatigue Score'].mean()

# Replace negative values in Fatigue Score with the mean value
df['Fatigue Score'] = df['Fatigue Score'].apply(lambda x: mean_fatigue_score if x < 0 else x)

# Verify changes
df['Fatigue Score'].describe()


Unnamed: 0,Fatigue Score
count,14000.0
mean,8.57628
std,2.394376
min,3.53
25%,6.87
50%,8.11
75%,10.07
max,15.25


**Handle Non-Numerical Valus**

In [71]:
def convert_blood_pressure(bp):
    try:

        systolic, diastolic = map(int, bp.split('/'))
        return (systolic + diastolic) / 2  # Example: return the average
    except:
        return None

In [72]:
df['Blood Pressure'] = df['Blood Pressure'].apply(convert_blood_pressure)

In [73]:
df.head(55)

Unnamed: 0,Gender,Age,Occupation,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Fatigue Score,Sleep Disorder
0,Male,35,Doctor,7,50,7,Under Weight,100.0,71,7.52,Healthy
1,Male,42,Teacher,8,52,4,Normal,112.5,66,7.54,Healthy
2,Male,34,Software Engineer,7,66,6,Overweight,104.5,74,9.5,Healthy
3,Male,32,Doctor,6,52,7,Normal,100.0,71,7.54,Healthy
4,Male,37,Lawyer,7,60,6,Normal,102.5,71,8.76,Healthy
5,Male,33,Doctor,7,50,6,Under Weight,100.0,69,7.25,Healthy
6,Male,44,Engineer,8,45,4,Normal,102.5,65,5.73,Healthy
7,Male,40,Engineer,8,52,4,Normal,100.0,68,6.46,Healthy
8,Male,36,Lawyer,7,60,6,Normal,107.5,71,8.76,Healthy
9,Male,40,Engineer,7,58,6,Under Weight,107.5,70,8.11,Healthy


**2.Data Transformation**

In [74]:
# Selecting the numeric columns for normalization
numeric_columns = [ 'Quality of Sleep', 'Physical Activity Level',
                   'Stress Level', 'Heart Rate', 'Fatigue Score']

**Get Numeric Values to 0-10 Scale**

In [75]:
# Initialize the MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 10))



In [76]:
# Apply the scaler to the numeric columns

df_before[numeric_columns] = scaler.fit_transform(df_before[numeric_columns])

df_before[numeric_columns] = df_before[numeric_columns].round(2)


In [77]:
#View the Normalized Data
df.head(55)

Unnamed: 0,Gender,Age,Occupation,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Fatigue Score,Sleep Disorder
0,Male,35,Doctor,7,50,7,Under Weight,100.0,71,7.52,Healthy
1,Male,42,Teacher,8,52,4,Normal,112.5,66,7.54,Healthy
2,Male,34,Software Engineer,7,66,6,Overweight,104.5,74,9.5,Healthy
3,Male,32,Doctor,6,52,7,Normal,100.0,71,7.54,Healthy
4,Male,37,Lawyer,7,60,6,Normal,102.5,71,8.76,Healthy
5,Male,33,Doctor,7,50,6,Under Weight,100.0,69,7.25,Healthy
6,Male,44,Engineer,8,45,4,Normal,102.5,65,5.73,Healthy
7,Male,40,Engineer,8,52,4,Normal,100.0,68,6.46,Healthy
8,Male,36,Lawyer,7,60,6,Normal,107.5,71,8.76,Healthy
9,Male,40,Engineer,7,58,6,Under Weight,107.5,70,8.11,Healthy


**3.Data Mining**

**Encode Categorical Variables using One-Hot Encoding**

In [78]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
label_encoder = LabelEncoder()

df['Encoded_Column_Gender'] = label_encoder.fit_transform(df['Gender'])
df['Encoded_Column_Occupation'] = label_encoder.fit_transform(df['Occupation'])
df['Encoded_Column_BMI Category'] = label_encoder.fit_transform(df['BMI Category'])
#df['Encoded_Column_Sleep Disorder'] = label_encoder.fit_transform(df['Sleep Disorder'])


In [79]:
df = pd.get_dummies(df, columns=['Gender'],)
df = pd.get_dummies(df, columns=['Occupation'])
df = pd.get_dummies(df, columns=['BMI Category'])
#df = pd.get_dummies(df, columns=['Sleep Disorder'])

In [80]:
df.head(5)

Unnamed: 0,Age,Quality of Sleep,Physical Activity Level,Stress Level,Blood Pressure,Heart Rate,Fatigue Score,Sleep Disorder,Encoded_Column_Gender,Encoded_Column_Occupation,...,Occupation_Nurse,Occupation_Sales Representative,Occupation_Salesperson,Occupation_Scientist,Occupation_Software Engineer,Occupation_Teacher,BMI Category_Normal,BMI Category_Obese,BMI Category_Overweight,BMI Category_Under Weight
0,35,7,50,7,100.0,71,7.52,Healthy,1,1,...,False,False,False,False,False,False,False,False,False,True
1,42,8,52,4,112.5,66,7.54,Healthy,1,10,...,False,False,False,False,False,True,True,False,False,False
2,34,7,66,6,104.5,74,9.5,Healthy,1,9,...,False,False,False,False,True,False,False,False,True,False
3,32,6,52,7,100.0,71,7.54,Healthy,1,1,...,False,False,False,False,False,False,True,False,False,False
4,37,7,60,6,102.5,71,8.76,Healthy,1,3,...,False,False,False,False,False,False,True,False,False,False


**Split dataset into Features(X) and Target(y)**

In [81]:
# Define the feature matrix (X) and the target vector (y)
X = df.drop('Sleep Disorder', axis=1)
y = df['Sleep Disorder']


**Train-Test Split**

In [82]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [83]:
#You should handle the missing values in X_train before making predictions
X_train.dropna(inplace=True)
y_train = y_train[X_train.index]

**Support Vector Machine Classifier**

In [84]:
from sklearn.svm import SVC

# Initialize the Decision Tree model
svm = SVC(random_state=42)

# Train the model
svm.fit(X_train, y_train)




In [86]:
#You should handle the missing values in X_test before making predictions
X_test.dropna(inplace=True)
# Remove corresponding rows from y_test
y_test = y_test[X_test.index]

In [87]:
from sklearn.metrics import accuracy_score, f1_score

# Predict on the test set
y_pred = svm.predict(X_test)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')

Accuracy: 0.8836657169990503
F1 Score: 0.8830012517102143


In [88]:
from sklearn.metrics import confusion_matrix

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[1341   40   35]
 [ 158 1133   97]
 [  69   91 1248]]


In [89]:
from sklearn.metrics import classification_report

# Generate classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

     Healthy       0.86      0.95      0.90      1416
    Insomnia       0.90      0.82      0.85      1388
 Sleep Apnea       0.90      0.89      0.90      1408

    accuracy                           0.88      4212
   macro avg       0.89      0.88      0.88      4212
weighted avg       0.89      0.88      0.88      4212

