<a href="https://www.kaggle.com/code/satasdasdas/the-elites?scriptVersionId=197348803" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
#DATA ANALYTICS PROJECT by "TEAM ELITES"
# TEAM  LEAD: SYED RUKSANA (22WJ1A05W4) @https://github.com/SyedRuksana2712
# TEAM MEMBER: SATYAM MAYENGBAM (22WJ1A05U5) @https://github.com/satyammayy
# TEAM MEMBER: SURAJ RANJAN (22WJ1A05W2) @https://github.com/MADDY8795

In [None]:
#Importing required libraries
import warnings #supress warnings of EOL of some methods

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the health dataset CSV from the Kaggle input directory into a DataFrame.
file_path = '/kaggle/input/health/health_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first rows of the dataset.
# A bare expression on the last line of a cell is rendered by Jupyter as a
# formatted table, which is easier to read than the plain text from print().
data.head()

In [None]:
# Count the missing (NaN) entries in every column to spot unfilled fields.
print("\nMissing Values:", data.isna().sum(), sep="\n")

In [None]:
#Summary statistics of the dataset via data.describe() (currently disabled; uncomment the two lines below to show them)
#print("\nSummary Statistics:")
#print(data.describe())

In [None]:
#DATA analysis of the dataset starts here
# Ignore FutureWarning messages so deprecation notices do not clutter the output.
warnings.filterwarnings("ignore", category=FutureWarning)

# Histogram: how many people sleep for each number of hours.
# kde=True overlays an estimated probability density curve; bins=20 divides the
# x-axis into 20 intervals, each covering a range of sleep hours.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data['Sleep Duration'], kde=True, bins=20, color='skyblue', ax=ax)
ax.set_title('Distribution of Sleep Duration')
ax.set_xlabel('Hours of Sleep')
# The y-axis is the frequency: the number of data points falling into each bin
# (e.g. if 50 people slept between 6 and 7 hours, that bin has a height of 50).
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Correlation is only defined for numeric columns, so keep those and discard
# the 'Person ID' column before computing it.
numeric_data = data.select_dtypes(include='number')
numeric_data = numeric_data.drop(columns=['Person ID'])

# Heatmap of the pairwise correlations between the remaining numeric columns.
correlation_matrix = numeric_data.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Scatter plot of sleep duration against stress level, with points coloured by gender.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=data, x='Sleep Duration', y='Stress Level', hue='Gender', ax=ax)
ax.set_title('Sleep Duration vs Stress Level')
ax.set_xlabel('Sleep Duration (hours)')
ax.set_ylabel('Stress Level')
plt.show()

In [None]:
# Bar chart of how many rows fall into each BMI category.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='BMI Category', palette='Set2', ax=ax)
ax.set_title('Distribution of BMI Category')
ax.set_xlabel('BMI Category')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Box plot comparing the spread of sleep duration across the BMI categories.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data, x='BMI Category', y='Sleep Duration', palette='Set1', ax=ax)
ax.set_title('Sleep Duration by BMI Category')
ax.set_xlabel('BMI Category')
ax.set_ylabel('Sleep Duration (hours)')
plt.show()

In [None]:
# Scatter plot of physical activity level against sleep duration, with points
# coloured by BMI category.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data,
    x='Physical Activity Level',
    y='Sleep Duration',
    hue='BMI Category',
    ax=ax,
)
ax.set_title('Physical Activity vs Sleep Duration')
ax.set_xlabel('Physical Activity Level')
ax.set_ylabel('Sleep Duration (hours)')
plt.show()

In [None]:
# Train and evaluate a Random Forest that predicts 'Sleep Disorder'.
#
# Requires Pipeline, OneHotEncoder and ColumnTransformer to be imported in the
# imports cell (sklearn.pipeline, sklearn.preprocessing, sklearn.compose).

# One seed shared by the split and the classifier so the reported metrics are
# the same on every Restart & Run All.
RANDOM_STATE = 42

# Keep only the rows that have a target label (rows with NaN in 'Sleep Disorder' are dropped).
# NOTE(review): if the CSV stores "no disorder" as the literal text "None",
# pandas may have parsed it as NaN, in which case this line discards every
# person without a disorder - confirm against the raw file.
data = data.dropna(subset=['Sleep Disorder'])

# Numeric and categorical predictor columns; X is used to predict the target y.
numeric_features = ['Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level']
categorical_features = ['BMI Category']
X = data[numeric_features + categorical_features]
y = data['Sleep Disorder']

# Numeric columns: replace missing values with the column mean.
numeric_transformer = SimpleImputer(strategy='mean')
# Categorical columns: replace missing values with the most frequent category,
# then expand each category into its own binary column. handle_unknown='ignore'
# makes a category that was not seen during fitting encode as all zeros
# instead of raising an error.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

# Split BEFORE fitting the preprocessor: 20% of the rows are held out for
# testing, and nothing learned from them (means, most frequent values,
# category lists) may influence the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Learn the preprocessing from the training rows only, then apply it unchanged
# to the test rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)
# Kept so any cell that still refers to X_preprocessed keeps working; it is the
# full feature matrix transformed with the train-fitted preprocessor.
X_preprocessed = preprocessor.transform(X)

# Random Forest: an ensemble of decision trees whose votes are combined into a
# single prediction. The fixed random_state makes the trained forest reproducible.
clf = RandomForestClassifier(random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

# Predict the held-out test rows and report how well the model did.
y_pred = clf.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

In [None]:
# Side-by-side table of the actual sleep-disorder labels and the labels the
# model predicted.
comparison_columns = {
    'Actual Data': y_test,
    'Predicted Data': y_pred,
}
predictions_df = pd.DataFrame(data=comparison_columns)

# Show the actual and predicted sleep disorders together.
print(predictions_df)