<a href="https://colab.research.google.com/github/riyanshibohra/Data-Visualization-using-R/blob/main/Another_copy_of_ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder

from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

In [2]:
activity = pd.read_csv("/content/activity_environment_data.csv")
interaction = pd.read_csv("/content/digital_interaction_data.csv")
health = pd.read_csv("/content/personal_health_data.csv")

In [3]:
print(activity.shape)
print(interaction.shape)
print(health.shape)

(10000, 12)
(10000, 4)
(10000, 28)


### **Data Aggregation**

In [4]:
# Merging the datasets on 'User_ID' and 'Timestamp'
combined_data = pd.merge(health, activity, on=['User_ID', 'Timestamp'], how='outer')
combined_data = pd.merge(combined_data, interaction, on=['User_ID', 'Timestamp'], how='outer')

In [5]:
combined_data = combined_data.drop(['User_ID','Timestamp','Notifications_Received','Anomaly_Flag'], axis=1)

In [6]:
combined_data.shape

(10000, 36)

In [7]:
# Function to add random noise to continuous variables
def add_noise(data, columns, noise_level=0.02):
    """
    Add random noise to continuous variables.
    :param data: DataFrame containing the data.
    :param columns: List of columns to add noise to.
    :param noise_level: Percentage of standard deviation as noise.
    """
    for col in columns:
        std = data[col].std()
        noise = np.random.normal(0, std * noise_level, data[col].shape)
        data[col] += noise
    return data

# Function to randomly swap categorical entries
def swap_categorical_entries(data, columns, swap_percentage=0.02):
    """
    Randomly swap entries in categorical columns.
    :param data: DataFrame containing the data.
    :param columns: List of categorical columns to swap.
    :param swap_percentage: Percentage of entries to swap.
    """
    for col in columns:
        n_swap = int(len(data) * swap_percentage)
        indices_to_swap = np.random.choice(data.index, n_swap * 2, replace=False)
        swap_values = data.loc[indices_to_swap, col].values
        np.random.shuffle(swap_values)
        data.loc[indices_to_swap, col] = swap_values
    return data

# Select continuous and categorical columns
numerical_cols = ['Age', 'Weight', 'Height', 'Sleep_Duration', 'Deep_Sleep_Duration',
                      'REM_Sleep_Duration', 'Heart_Rate', 'Blood_Oxygen_Level', 'Calories_Intake',
                      'Water_Intake', 'Skin_Temperature', 'Body_Fat_Percentage', 'Muscle_Mass',
                      'Health_Score', 'Steps', 'Calories_Burned', 'Distance_Covered',
                      'Exercise_Duration', 'Ambient_Temperature', 'Battery_Level', 'Altitude',
                      'UV_Exposure', 'Screen_Time']

categorical_cols = ['Gender', 'Medical_Conditions', 'Medication', 'Smoker', 'Alcohol_Consumption',
                       'Day_of_Week', 'Snoring', 'ECG', 'Stress_Level', 'Exercise_Type',
                       'Exercise_Intensity']

# Apply transformations
data_noisy = add_noise(combined_data.copy(), numerical_cols, noise_level=0.05)
data_noisy_swapped = swap_categorical_entries(data_noisy, categorical_cols, swap_percentage=0.05)

# Show a summary of the transformed data
data_noisy_swapped.describe(include='all')


Unnamed: 0,Age,Gender,Weight,Height,Medical_Conditions,Medication,Smoker,Alcohol_Consumption,Day_of_Week,Sleep_Duration,...,Calories_Burned,Distance_Covered,Exercise_Type,Exercise_Duration,Exercise_Intensity,Ambient_Temperature,Battery_Level,Altitude,UV_Exposure,Screen_Time
count,10000.0,10000,10000.0,10000.0,10000,10000,10000,10000.0,10000,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
unique,,3,,,3,2,2,3.0,7,,...,,,4.0,,4.0,,,,,
top,,Female,,,Diabetes,No,No,,Friday,,...,,,,,,,,,,
freq,,3363,,,3359,5015,5005,3360.0,1440,,...,,,2601.0,,2601.0,,,,,
mean,47.554308,,90.395729,170.54537,,,,,,7.014616,...,500.859684,8.015361,,0.924261,,28.110849,49.905499,2507.636651,3.998255,4.02956
std,21.932271,,34.974515,17.63198,,,,,,1.738549,...,291.268622,4.65751,,0.661948,,7.544267,29.278785,1451.825943,2.316499,2.297608
min,6.849308,,27.339296,137.423902,,,,,,3.836311,...,-30.748486,-0.385874,,-0.117917,,14.21417,-3.129898,-135.55049,-0.211745,-0.211003
25%,28.447815,,60.11376,155.38697,,,,,,5.5169,...,249.580781,3.98696,,0.057372,,21.521185,24.323081,1235.675527,1.986191,2.07481
50%,47.497941,,89.968312,170.598854,,,,,,6.990546,...,497.043512,7.965692,,0.996959,,28.198608,49.75977,2489.643634,4.028441,4.0303
75%,66.485969,,120.775612,185.686417,,,,,,8.540647,...,757.20363,12.123678,,1.48407,,34.606158,75.291352,3758.609088,5.996726,6.001143


In [8]:
data_noisy_swapped.head()

Unnamed: 0,Age,Gender,Weight,Height,Medical_Conditions,Medication,Smoker,Alcohol_Consumption,Day_of_Week,Sleep_Duration,...,Calories_Burned,Distance_Covered,Exercise_Type,Exercise_Duration,Exercise_Intensity,Ambient_Temperature,Battery_Level,Altitude,UV_Exposure,Screen_Time
0,61.241876,Other,87.403622,180.219816,Diabetes,No,No,Moderate,Friday,6.435855,...,473.965654,7.377567,Running,1.198729,Low,27.81004,99.172087,1752.9577,4.069153,1.837136
1,24.180989,Male,87.133708,200.171102,,Yes,No,Moderate,Friday,7.582438,...,477.502899,7.596697,Yoga,1.426628,Moderate,25.156062,67.321498,4065.719809,1.739521,0.23223
2,81.04735,Other,61.934124,196.148584,Hypertension,Yes,Yes,Moderate,Friday,8.536393,...,-15.359851,0.348899,Strength Training,0.810107,Moderate,29.320185,31.184439,1737.300871,0.313268,2.444003
3,68.432847,Other,139.229506,163.483943,Hypertension,No,No,Heavy,Friday,4.36107,...,652.830041,10.82727,,-0.011499,,25.512965,81.676668,4068.992074,4.482041,3.292287
4,28.597267,Other,82.143528,148.323603,Diabetes,Yes,Yes,Heavy,Friday,5.957705,...,-7.694097,0.046624,Yoga,0.786937,High,21.758482,18.452975,3419.065834,2.866447,2.637215


In [9]:
# Check the distribution of 'Mood' classes
mood_distribution = data_noisy_swapped['Mood'].value_counts()
mood_distribution

Anxious    2542
Sad        2515
Neutral    2495
Happy      2448
Name: Mood, dtype: int64

### **Data Cleaning**

In [10]:
# Checking for missing values
missing_values = data_noisy_swapped.isnull().sum()
missing_values

Age                    0
Gender                 0
Weight                 0
Height                 0
Medical_Conditions     0
Medication             0
Smoker                 0
Alcohol_Consumption    0
Day_of_Week            0
Sleep_Duration         0
Deep_Sleep_Duration    0
REM_Sleep_Duration     0
Wakeups                0
Snoring                0
Heart_Rate             0
Blood_Oxygen_Level     0
ECG                    0
Calories_Intake        0
Water_Intake           0
Stress_Level           0
Mood                   0
Skin_Temperature       0
Body_Fat_Percentage    0
Muscle_Mass            0
Health_Score           0
Steps                  0
Calories_Burned        0
Distance_Covered       0
Exercise_Type          0
Exercise_Duration      0
Exercise_Intensity     0
Ambient_Temperature    0
Battery_Level          0
Altitude               0
UV_Exposure            0
Screen_Time            0
dtype: int64

In [11]:
pd.set_option('display.max_columns', None)
data_noisy_swapped.head()

Unnamed: 0,Age,Gender,Weight,Height,Medical_Conditions,Medication,Smoker,Alcohol_Consumption,Day_of_Week,Sleep_Duration,Deep_Sleep_Duration,REM_Sleep_Duration,Wakeups,Snoring,Heart_Rate,Blood_Oxygen_Level,ECG,Calories_Intake,Water_Intake,Stress_Level,Mood,Skin_Temperature,Body_Fat_Percentage,Muscle_Mass,Health_Score,Steps,Calories_Burned,Distance_Covered,Exercise_Type,Exercise_Duration,Exercise_Intensity,Ambient_Temperature,Battery_Level,Altitude,UV_Exposure,Screen_Time
0,61.241876,Other,87.403622,180.219816,Diabetes,No,No,Moderate,Friday,6.435855,2.963589,3.855221,4,Yes,144.144549,90.316149,Abnormal,2442.121702,3.122524,Moderate,Neutral,32.749661,17.112365,77.854138,27.485952,9580.794149,473.965654,7.377567,Running,1.198729,Low,27.81004,99.172087,1752.9577,4.069153,1.837136
1,24.180989,Male,87.133708,200.171102,,Yes,No,Moderate,Friday,7.582438,0.783408,6.845142,2,Yes,143.774825,96.841503,Normal,2199.767782,2.38268,High,Anxious,35.097149,23.77347,71.569358,64.440856,9514.038367,477.502899,7.596697,Yoga,1.426628,Moderate,25.156062,67.321498,4065.719809,1.739521,0.23223
2,81.04735,Other,61.934124,196.148584,Hypertension,Yes,Yes,Moderate,Friday,8.536393,6.826452,1.749491,2,No,175.205899,99.476031,Abnormal,2225.933561,1.189508,Low,Sad,36.6814,23.325061,59.362407,77.477708,-250.889566,-15.359851,0.348899,Strength Training,0.810107,Moderate,29.320185,31.184439,1737.300871,0.313268,2.444003
3,68.432847,Other,139.229506,163.483943,Hypertension,No,No,Heavy,Friday,4.36107,2.584687,1.867396,2,Yes,159.980108,91.534763,Normal,1402.294841,1.687471,Moderate,Happy,32.790699,13.519946,67.959707,17.35727,13718.740505,652.830041,10.82727,,-0.011499,,25.512965,81.676668,4068.992074,4.482041,3.292287
4,28.597267,Other,82.143528,148.323603,Diabetes,Yes,Yes,Heavy,Friday,5.957705,4.442639,1.651606,3,No,86.917717,96.15549,Normal,1556.212196,1.473873,High,Happy,33.522228,24.177635,54.98057,43.140236,192.690458,-7.694097,0.046624,Yoga,0.786937,High,21.758482,18.452975,3419.065834,2.866447,2.637215


# Data Preprocessing: Health Data

### **Data Transformation**

In [12]:
numerical_cols = data_noisy_swapped.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data_noisy_swapped.select_dtypes(include=['object']).columns

print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)

Numerical Columns: Index(['Age', 'Weight', 'Height', 'Sleep_Duration', 'Deep_Sleep_Duration',
       'REM_Sleep_Duration', 'Wakeups', 'Heart_Rate', 'Blood_Oxygen_Level',
       'Calories_Intake', 'Water_Intake', 'Skin_Temperature',
       'Body_Fat_Percentage', 'Muscle_Mass', 'Health_Score', 'Steps',
       'Calories_Burned', 'Distance_Covered', 'Exercise_Duration',
       'Ambient_Temperature', 'Battery_Level', 'Altitude', 'UV_Exposure',
       'Screen_Time'],
      dtype='object')
Categorical Columns: Index(['Gender', 'Medical_Conditions', 'Medication', 'Smoker',
       'Alcohol_Consumption', 'Day_of_Week', 'Snoring', 'ECG', 'Stress_Level',
       'Mood', 'Exercise_Type', 'Exercise_Intensity'],
      dtype='object')


**Data Normalization**

In [None]:
combined_data.to_csv('combined_data.csv', index=False)

In [None]:
# Perform one-hot encoding
mood_encoded = pd.get_dummies(data_noisy_swapped['Mood'])

# Join the encoded DataFrame back with the original DataFrame
data_noisy_swapped = data_noisy_swapped.join(mood_encoded)

In [None]:
# Applying Min-Max scaling for Numerical Data

scaler = MinMaxScaler()
combined_data[numerical_cols] = scaler.fit_transform(combined_data[numerical_cols])

In [13]:
encoded = pd.get_dummies(data_noisy_swapped[categorical_cols])
data_noisy_swapped = data_noisy_swapped.join(encoded)

In [None]:
# Initialize LabelEncoder
encoder = OneHotEncoder()

for col in categorical_cols:
    combined_data[col] = encoder.fit_transform(combined_data[col])

In [14]:
data_noisy_swapped['Mood'].unique()

array(['Neutral', 'Anxious', 'Sad', 'Happy'], dtype=object)

In [15]:
data_noisy_swapped.head()

Unnamed: 0,Age,Gender,Weight,Height,Medical_Conditions,Medication,Smoker,Alcohol_Consumption,Day_of_Week,Sleep_Duration,Deep_Sleep_Duration,REM_Sleep_Duration,Wakeups,Snoring,Heart_Rate,Blood_Oxygen_Level,ECG,Calories_Intake,Water_Intake,Stress_Level,Mood,Skin_Temperature,Body_Fat_Percentage,Muscle_Mass,Health_Score,Steps,Calories_Burned,Distance_Covered,Exercise_Type,Exercise_Duration,Exercise_Intensity,Ambient_Temperature,Battery_Level,Altitude,UV_Exposure,Screen_Time,Gender_Female,Gender_Male,Gender_Other,Medical_Conditions_Diabetes,Medical_Conditions_Hypertension,Medical_Conditions_None,Medication_No,Medication_Yes,Smoker_No,Smoker_Yes,Alcohol_Consumption_Heavy,Alcohol_Consumption_Moderate,Alcohol_Consumption_None,Day_of_Week_Friday,Day_of_Week_Monday,Day_of_Week_Saturday,Day_of_Week_Sunday,Day_of_Week_Thursday,Day_of_Week_Tuesday,Day_of_Week_Wednesday,Snoring_No,Snoring_Yes,ECG_Abnormal,ECG_Normal,Stress_Level_High,Stress_Level_Low,Stress_Level_Moderate,Mood_Anxious,Mood_Happy,Mood_Neutral,Mood_Sad,Exercise_Type_None,Exercise_Type_Running,Exercise_Type_Strength Training,Exercise_Type_Yoga,Exercise_Intensity_High,Exercise_Intensity_Low,Exercise_Intensity_Moderate,Exercise_Intensity_None
0,61.241876,Other,87.403622,180.219816,Diabetes,No,No,Moderate,Friday,6.435855,2.963589,3.855221,4,Yes,144.144549,90.316149,Abnormal,2442.121702,3.122524,Moderate,Neutral,32.749661,17.112365,77.854138,27.485952,9580.794149,473.965654,7.377567,Running,1.198729,Low,27.81004,99.172087,1752.9577,4.069153,1.837136,0,0,1,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0
1,24.180989,Male,87.133708,200.171102,,Yes,No,Moderate,Friday,7.582438,0.783408,6.845142,2,Yes,143.774825,96.841503,Normal,2199.767782,2.38268,High,Anxious,35.097149,23.77347,71.569358,64.440856,9514.038367,477.502899,7.596697,Yoga,1.426628,Moderate,25.156062,67.321498,4065.719809,1.739521,0.23223,0,1,0,0,0,1,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0
2,81.04735,Other,61.934124,196.148584,Hypertension,Yes,Yes,Moderate,Friday,8.536393,6.826452,1.749491,2,No,175.205899,99.476031,Abnormal,2225.933561,1.189508,Low,Sad,36.6814,23.325061,59.362407,77.477708,-250.889566,-15.359851,0.348899,Strength Training,0.810107,Moderate,29.320185,31.184439,1737.300871,0.313268,2.444003,0,0,1,0,1,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0
3,68.432847,Other,139.229506,163.483943,Hypertension,No,No,Heavy,Friday,4.36107,2.584687,1.867396,2,Yes,159.980108,91.534763,Normal,1402.294841,1.687471,Moderate,Happy,32.790699,13.519946,67.959707,17.35727,13718.740505,652.830041,10.82727,,-0.011499,,25.512965,81.676668,4068.992074,4.482041,3.292287,0,0,1,0,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1
4,28.597267,Other,82.143528,148.323603,Diabetes,Yes,Yes,Heavy,Friday,5.957705,4.442639,1.651606,3,No,86.917717,96.15549,Normal,1556.212196,1.473873,High,Happy,33.522228,24.177635,54.98057,43.140236,192.690458,-7.694097,0.046624,Yoga,0.786937,High,21.758482,18.452975,3419.065834,2.866447,2.637215,0,0,1,1,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0


In [16]:
data_noisy_swapped =data_noisy_swapped.drop(categorical_cols, axis=1)

In [17]:
data_noisy_swapped

Unnamed: 0,Age,Weight,Height,Sleep_Duration,Deep_Sleep_Duration,REM_Sleep_Duration,Wakeups,Heart_Rate,Blood_Oxygen_Level,Calories_Intake,Water_Intake,Skin_Temperature,Body_Fat_Percentage,Muscle_Mass,Health_Score,Steps,Calories_Burned,Distance_Covered,Exercise_Duration,Ambient_Temperature,Battery_Level,Altitude,UV_Exposure,Screen_Time,Gender_Female,Gender_Male,Gender_Other,Medical_Conditions_Diabetes,Medical_Conditions_Hypertension,Medical_Conditions_None,Medication_No,Medication_Yes,Smoker_No,Smoker_Yes,Alcohol_Consumption_Heavy,Alcohol_Consumption_Moderate,Alcohol_Consumption_None,Day_of_Week_Friday,Day_of_Week_Monday,Day_of_Week_Saturday,Day_of_Week_Sunday,Day_of_Week_Thursday,Day_of_Week_Tuesday,Day_of_Week_Wednesday,Snoring_No,Snoring_Yes,ECG_Abnormal,ECG_Normal,Stress_Level_High,Stress_Level_Low,Stress_Level_Moderate,Mood_Anxious,Mood_Happy,Mood_Neutral,Mood_Sad,Exercise_Type_None,Exercise_Type_Running,Exercise_Type_Strength Training,Exercise_Type_Yoga,Exercise_Intensity_High,Exercise_Intensity_Low,Exercise_Intensity_Moderate,Exercise_Intensity_None
0,61.241876,87.403622,180.219816,6.435855,2.963589,3.855221,4,144.144549,90.316149,2442.121702,3.122524,32.749661,17.112365,77.854138,27.485952,9580.794149,473.965654,7.377567,1.198729,27.810040,99.172087,1752.957700,4.069153,1.837136,0,0,1,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0
1,24.180989,87.133708,200.171102,7.582438,0.783408,6.845142,2,143.774825,96.841503,2199.767782,2.382680,35.097149,23.773470,71.569358,64.440856,9514.038367,477.502899,7.596697,1.426628,25.156062,67.321498,4065.719809,1.739521,0.232230,0,1,0,0,0,1,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0
2,81.047350,61.934124,196.148584,8.536393,6.826452,1.749491,2,175.205899,99.476031,2225.933561,1.189508,36.681400,23.325061,59.362407,77.477708,-250.889566,-15.359851,0.348899,0.810107,29.320185,31.184439,1737.300871,0.313268,2.444003,0,0,1,0,1,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0
3,68.432847,139.229506,163.483943,4.361070,2.584687,1.867396,2,159.980108,91.534763,1402.294841,1.687471,32.790699,13.519946,67.959707,17.357270,13718.740505,652.830041,10.827270,-0.011499,25.512965,81.676668,4068.992074,4.482041,3.292287,0,0,1,0,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1
4,28.597267,82.143528,148.323603,5.957705,4.442639,1.651606,3,86.917717,96.155490,1556.212196,1.473873,33.522228,24.177635,54.980570,43.140236,192.690458,-7.694097,0.046624,0.786937,21.758482,18.452975,3419.065834,2.866447,2.637215,0,0,1,1,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,15.002937,140.896758,153.095145,8.257811,0.795214,7.325345,0,75.204859,92.108138,2495.973886,1.023485,34.883269,33.635226,50.284791,31.050979,12618.309983,615.868637,10.080100,-0.004783,34.805663,64.816564,4766.286862,7.164247,5.510876,1,0,0,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1
9996,46.410364,89.999141,168.468866,5.323612,4.918421,0.543407,3,63.239175,92.791117,2226.415121,0.990258,37.756216,15.837183,46.093736,32.123062,4326.491837,197.565398,3.810964,0.649186,31.631581,63.415055,3743.795110,6.771968,3.326830,0,0,1,0,1,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0
9997,63.337834,45.890209,200.215656,9.060388,2.602544,6.426981,2,128.972514,96.082288,1229.475353,2.109125,36.119719,23.920007,70.283290,61.464705,3601.789884,222.759818,3.453638,0.891312,38.642425,87.668467,2582.231107,0.854001,5.823338,1,0,0,0,0,1,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
9998,26.690407,87.085594,164.210393,4.509077,0.961065,3.795282,0,91.818494,100.272634,2169.480290,1.730435,37.474359,10.213831,37.220676,61.316263,10714.138657,503.839516,8.419227,0.914820,34.887657,85.069289,2327.910907,1.744118,2.895781,1,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0


In [None]:
# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse=False)

# Fit and transform the data
encoded_data = encoder.fit_transform(combined_data[['Mood']])

# Add the encoded data to the original DataFrame
combined_data[encoder.get_feature_names_out(['Mood'])] = encoded_data

combined_data.head()

In [18]:
# Splitting data into training and testing sets

# Assuming 'Mood' is your target variable and it's already encoded
X = data_noisy_swapped.drop(['Mood_Anxious', 'Mood_Happy', 'Mood_Neutral', 'Mood_Neutral'], axis=1)
y = data_noisy_swapped[['Mood_Anxious', 'Mood_Happy', 'Mood_Neutral', 'Mood_Neutral']]   # Target variable

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       506
           1       1.00      1.00      1.00       492
           2       1.00      1.00      1.00       523
           3       1.00      1.00      1.00       479

   micro avg       1.00      1.00      1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000
 samples avg       1.00      1.00      1.00      2000



In [None]:
# Get feature importances
importances = rfc_model.feature_importances_

# Map these importances to the corresponding feature names
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})

# Sort the features by their importance
sorted_features = feature_importances.sort_values(by='Importance', ascending=False)

# Display sorted features
print(sorted_features)

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

# Prediction and Evaluation
dt_pred = dt_model.predict(X_test)
print(classification_report(y_test, dt_pred))


              precision    recall  f1-score   support

           0       0.33      0.34      0.33       506
           1       0.33      0.32      0.32       492
           2       0.36      0.35      0.36       523
           3       0.36      0.35      0.36       523

   micro avg       0.34      0.34      0.34      2044
   macro avg       0.34      0.34      0.34      2044
weighted avg       0.34      0.34      0.34      2044
 samples avg       0.26      0.26      0.26      2044



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)

y_pred = rfc_model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
print(confusion_matrix(y_test, y_pred_labels))
print(confusion_matrix(y_test, y_pred))


ValueError: ignored

In [21]:
# Assuming y_pred contains probabilities
y_pred_labels = np.argmax(y_pred, axis=1)

# If y_test is also one-hot encoded
y_test_labels = np.argmax(y_test, axis=1)

# Now you can use confusion_matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test_labels, y_pred_labels))

ValueError: ignored

In [27]:
from sklearn.naive_bayes import GaussianNB

# Creating the Naive Bayes model
nb_model = GaussianNB()

# Training the model
nb_model.fit(X_train, y_train)

# Making predictions
y_pred_nb = nb_model.predict(X_test)

# Evaluating the model
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))
print("Naive Bayes Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))


ValueError: ignored

In [None]:
from sklearn.model_selection import cross_val_score
# Example with a Random Forest Classifier
clf = RandomForestClassifier(random_state=42)

# Assuming X is your features and y is the encoded 'Mood' variable
scores = cross_val_score(clf, X, y, cv=5)  # 5-fold cross-validation

print("Accuracy scores for each fold:", scores)
print("Mean cross-validation accuracy:", scores.mean())

Accuracy scores for each fold: [1. 1. 1. 1. 1.]
Mean cross-validation accuracy: 1.0


In [None]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt'],
    'max_depth' : [4, 6, 8],
    'criterion' :['gini', 'entropy']
}

cv_rfc = RandomizedSearchCV(estimator=rfc_model, param_distributions= param_grid, cv=5)
cv_rfc.fit(X_train, y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [None]:
# Assuming df is your DataFrame and 'Mood' is your target variable
class_counts = combined_data['Mood'].value_counts()
print(class_counts)

0    2542
3    2515
2    2495
1    2448
Name: Mood, dtype: int64


In [22]:
from sklearn.model_selection import cross_val_score
# Select top 10 features
top_features = ['Skin_Temperature', 'Screen_Time', 'Water_Intake', 'Calories_Intake', 'Battery_Level',
                'Height', 'Muscle_Mass', 'UV_Exposure', 'Body_Fat_Percentage', 'Altitude']

# Create new dataset with selected features
X_reduced = combined_data[top_features]

# Split the reduced dataset
X_train_reduced, X_test_reduced, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

# Example with a Random Forest Classifier
clf_new = RandomForestClassifier(random_state=42)

# Assuming X is your features and y is the encoded 'Mood' variable
scores = cross_val_score(clf_new, X, y, cv=5)  # 5-fold cross-validation

print("Accuracy scores for each fold:", scores)
print("Mean cross-validation accuracy:", scores.mean())


Accuracy scores for each fold: [0.2575 0.2505 0.255  0.2485 0.2485]
Mean cross-validation accuracy: 0.252
