# In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import zscore

# Load the dataset
file_path = 'diabetes.csv'
df = pd.read_csv(file_path)

# Check for missing values
print("Missing values before preprocessing:")
print(df.isnull().sum())

# Handling missing values: in these columns a literal zero is treated as
# "not recorded" rather than as a real reading, so convert zeros to NaN.
cols_with_zeros_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# Use a float NaN (not pd.NA) as the placeholder: pd.NA can turn the columns
# into object dtype, which breaks or silently skips the median computed below.
df[cols_with_zeros_as_missing] = df[cols_with_zeros_as_missing].replace(0, float('nan'))

# Impute missing values with each column's median (NaNs are ignored when the
# median is computed). numeric_only=True keeps this working if the file ever
# gains a non-numeric column; such a column is simply left un-imputed.
df = df.fillna(df.median(numeric_only=True))

# Check for missing values after preprocessing
print("\nMissing values after preprocessing:")
print(df.isnull().sum())

# Handling outliers (a simple approach; a more sophisticated method may be
# appropriate depending on the distribution of the data).
# Every column except the 'Outcome' label takes part in outlier detection.
numerical_columns = df.columns.difference(['Outcome'])

# Visualize boxplots to identify outliers
plt.figure(figsize=(12, 8))
sns.boxplot(data=df[numerical_columns])
plt.title('Boxplots of Numerical Variables')
plt.show()

# Remove outliers using the Z-score method: keep a row only if every feature
# lies strictly within Z_SCORE_THRESHOLD standard deviations of its column mean.
Z_SCORE_THRESHOLD = 3

z_scores = zscore(df[numerical_columns])
# Compare the absolute z-score so that unusually LOW values are rejected as
# well as unusually high ones (a plain `z < 3` test keeps every low outlier).
within_threshold = (abs(z_scores) < Z_SCORE_THRESHOLD).all(axis=1)
df_no_outliers = df[within_threshold]

# Transform categorical variables into dummy variables if necessary
# (In this case, there are no categorical variables, but if present, you can use pd.get_dummies())

# Display the shape of the dataset after preprocessing
print("\nShape of the dataset after preprocessing:")
print(df_no_outliers.shape)
