In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [None]:
def load_dataset(url):
    """Read a CSV resource into a pandas DataFrame.

    Parameters
    ----------
    url : str
        Location of the CSV data. The value is handed unchanged to
        ``pd.read_csv``, so anything that function accepts works here.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using the ``pd.read_csv`` defaults.
    """
    dataset = pd.read_csv(url)
    return dataset

In [None]:
def plot_data_distribution(data, feature):
    """Display a 50-bin histogram of a single column.

    Parameters
    ----------
    data : pandas.DataFrame (or any mapping-like object indexable by ``feature``)
        Container holding the values to plot.
    feature : str
        Key of the column to plot; also used for the title and x-axis label.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``; nothing is returned.
    """
    # Explicit figure/axes pair instead of the implicit pyplot "current axes".
    _, ax = plt.subplots(figsize=(10, 6))
    ax.hist(data[feature], bins=50, color='blue', alpha=0.7)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Fetch the raw diabetes CSV from GitHub into a DataFrame.
diabetes_df = load_dataset(
    'https://raw.githubusercontent.com/raccamateo/NEC_A2/main/diabetes_original.csv'
)

In [None]:
# Data exploration: first rows followed by per-column summary statistics,
# each printed as plain text on its own block of lines.
print(diabetes_df.head(), diabetes_df.describe(), sep='\n')

In [None]:
# Visualize the distribution of each numerical feature in the dataset.
# 'number' selects every numeric dtype rather than only int64/float64, so
# columns stored as e.g. int32 or float32 are no longer silently skipped.
# Object (string) and boolean columns are still left out.
numerical_features = diabetes_df.select_dtypes(include='number').columns.tolist()
for feature in numerical_features:
    plot_data_distribution(diabetes_df, feature)

In [None]:
# Randomly permute the rows (fixed seed for reproducibility) and renumber the
# index from 0 so no trace of the original ordering remains.
diabetes_df_shuffled = (
    diabetes_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Hold out 20% of the rows for testing; the remaining 80% is the
# training/validation set. The seed keeps the split reproducible.
train_val_df, test_df = train_test_split(
    diabetes_df_shuffled,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Column roles: every column except the last is a feature; the last column is
# taken to be the classification target (assumption about the file layout).
features, target = train_val_df.columns[:-1], train_val_df.columns[-1]

In [None]:
# Normalize the numerical features using StandardScaler.
# Work on explicit copies: the frames returned by train_test_split are derived
# from diabetes_df_shuffled, and assigning columns into such a derived frame
# can raise SettingWithCopyWarning depending on the pandas/scikit-learn
# versions in use. Copying first makes the assignment target unambiguous.
train_val_df = train_val_df.copy()
test_df = test_df.copy()

# Fit the scaler on the training/validation rows only, then reuse the learned
# statistics for the test rows so nothing from the test set influences scaling.
scaler = StandardScaler()
train_val_df[features] = scaler.fit_transform(train_val_df[features])
test_df[features] = scaler.transform(test_df[features])

In [None]:
# Persist both preprocessed splits as CSV files in the working directory.
# index=False keeps the row index out of the files.
train_val_df.to_csv(path_or_buf='diabetes_train_val.csv', index=False)
test_df.to_csv(path_or_buf='diabetes_test.csv', index=False)