In [7]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE

# Load the MADRS scores
scores = pd.read_csv(
    'C:/Users/shrey/OneDrive/Desktop/depression du project/data/scores.csv')

# Ensure no missing values in 'number' column and convert to integer
scores = scores.dropna(subset=['number'])

# Identify non-numeric values in the 'number' column
non_numeric_values = scores[pd.to_numeric(
    scores['number'], errors='coerce').isnull()]['number']
if not non_numeric_values.empty:
    print("Non-numeric values found in 'number' column:")
    print(non_numeric_values)
    raise ValueError("Non-numeric values found in 'number' column")

scores['number'] = scores['number'].astype(int)

# Directory paths for control and condition data
control_dir = 'C:/Users/shrey/OneDrive/Desktop/depression du project/data/control/'
condition_dir = 'C:/Users/shrey/OneDrive/Desktop/depression du project/data/condition/'

# Load activity data for controls and conditions
control_files = [os.path.join(control_dir, f)
                 for f in os.listdir(control_dir) if f.endswith('.csv')]
condition_files = [os.path.join(condition_dir, f) for f in os.listdir(
    condition_dir) if f.endswith('.csv')]

# Combine all activity data into a single DataFrame
activity_data = []
for file in control_files + condition_files:
    data = pd.read_csv(file)
    # Extract patient ID from filename
    data['patient_id'] = int(file.split('/')[-1].split('.')[0].split('_')[-1])
    activity_data.append(data)

activity_data = pd.concat(activity_data)

# Convert 'patient_id' to integer
activity_data['patient_id'] = activity_data['patient_id'].astype(int)

# Merge activity data with scores
merged_data = pd.merge(activity_data, scores,
                       left_on='patient_id', right_on='number')
merged_data.drop(columns=['number', 'timestamp', 'date'], inplace=True)

# Handling missing values
merged_data.fillna(method='ffill', inplace=True)

# Encode categorical features
label_encoders = {}
for column in ['gender', 'afftype', 'melanch', 'inpatient', 'edu', 'marriage', 'work']:
    le = LabelEncoder()
    merged_data[column] = le.fit_transform(merged_data[column])
    label_encoders[column] = le

# Standardize features
scaler = StandardScaler()
features = merged_data.drop(columns=['madrs1', 'madrs2'])
features_scaled = scaler.fit_transform(features)

# Target variable
target = merged_data['madrs2']

# Save the cleaned and preprocessed data to CSV
merged_data.to_csv('cleaned_preprocessed_data.csv', index=False)

print("Data cleaning and preprocessing complete.")

Non-numeric values found in 'number' column:
0      condition_1
1      condition_2
2      condition_3
3      condition_4
4      condition_5
5      condition_6
6      condition_7
7      condition_8
8      condition_9
9     condition_10
10    condition_11
11    condition_12
12    condition_13
13    condition_14
14    condition_15
15    condition_16
16    condition_17
17    condition_18
18    condition_19
19    condition_20
20    condition_21
21    condition_22
22    condition_23
23       control_1
24       control_2
25       control_3
26       control_4
27       control_5
28       control_6
29       control_7
30       control_8
31       control_9
32      control_10
33      control_11
34      control_12
35      control_13
36      control_14
37      control_15
38      control_16
39      control_17
40      control_18
41      control_19
42      control_20
43      control_21
44      control_22
45      control_23
46      control_24
47      control_25
48      control_26
49      control_27
50   

ValueError: Non-numeric values found in 'number' column

In [None]:

# Combine all activity data into a single DataFrame
activity_data = []
for file in control_files + condition_files:
    data = pd.read_csv(file)
    # Extract patient ID from filename
    data['patient_id'] = int(file.split('/')[-1].split('.')[0])
    activity_data.append(data)

activity_data = pd.concat(activity_data)

# Merge activity data with scores
merged_data = pd.merge(activity_data, scores,
                       left_on='patient_id', right_on='number')
merged_data.drop(columns=['number', 'timestamp', 'date'], inplace=True)

# Handling missing values
merged_data.fillna(method='ffill', inplace=True)

# Encode categorical features
label_encoders = {}
for column in ['gender', 'afftype', 'melanch', 'inpatient', 'edu', 'marriage', 'work']:
    le = LabelEncoder()
    merged_data[column] = le.fit_transform(merged_data[column])
    label_encoders[column] = le

# Standardize features
scaler = StandardScaler()
features = merged_data.drop(columns=['madrs1', 'madrs2'])
features_scaled = scaler.fit_transform(features)

# Target variable
target = merged_data['madrs2']

# Load the scores.csv file

scores_df = pd.read_csv('path_to_dataset/scores.csv')