In [1]:
import pandas as pd

# Load the dataset
diabetes_data = pd.read_csv("Diabetes Classification.csv")

# Drop unnamed or index columns
diabetes_data = diabetes_data.loc[:, ~diabetes_data.columns.str.contains('^Unnamed')]
# This line drops any column with 'Unnamed' in its name.

# Display the shape of the dataset
print("Number of rows and columns after removing irrelevant columns:", diabetes_data.shape)

# Check the types of data
data_types = diabetes_data.dtypes
print("\nData types present in the dataset:")
print(data_types)

# Display statistical summary for numerical attributes
numerical_summary = diabetes_data.describe().T
print("\nStatistical summary for numerical attributes:")
print(numerical_summary)


Number of rows and columns after removing irrelevant columns: (5132, 10)

Data types present in the dataset:
Age            int64
Gender        object
BMI            int64
Chol         float64
TG           float64
HDL          float64
LDL          float64
Cr           float64
BUN          float64
Diagnosis      int64
dtype: object

Statistical summary for numerical attributes:
            count       mean        std        min    25%    50%    75%  \
Age        5132.0  48.950312  14.048794  20.000000  36.00  49.00  59.00   
BMI        5132.0  24.613406   4.277205  15.000000  22.00  24.00  27.00   
Chol       5132.0   4.866882   1.001052   0.000000   4.19   4.80   5.46   
TG         5132.0   1.719328   1.327057   0.000000   0.91   1.38   2.10   
HDL        5132.0   1.593305   1.038849   0.000000   1.09   1.30   1.59   
LDL        5132.0   2.914121   0.945423   0.300000   2.29   2.79   3.40   
Cr         5132.0  71.144800  28.494394   4.860753  58.00  70.20  81.60   
BUN        5132.0   

In [2]:
# Counting the number of individuals diagnosed with diabetes
diabetes_count = diabetes_data[diabetes_data['Diagnosis'] == 1].shape[0]
print("Total number of individuals diagnosed with diabetes:", diabetes_count)

# Filtering the dataset for only individuals with a BMI over 25 
high_bmi_data = diabetes_data[diabetes_data['BMI'] > 25]

# Calculating the average cholesterol level for this group
average_cholesterol = high_bmi_data['Chol'].mean()
print("Average cholesterol level for individuals with BMI over 25 is", average_cholesterol)

# Identifying the gender distribution within the dataset
gender_distribution = diabetes_data['Gender'].value_counts()
print("\nGender distribution within the dataset:")
print(gender_distribution)


Total number of individuals diagnosed with diabetes: 1993
Average cholesterol level for individuals with BMI over 25 is 5.002230568460308

Gender distribution within the dataset:
M    3256
F    1875
f       1
Name: Gender, dtype: int64


In [3]:
# For individuals with diabetes, calculate the average age for each gender
diabetes_data_filtered = diabetes_data[diabetes_data['Diagnosis'] == 1]
average_age_by_gender = diabetes_data_filtered.groupby('Gender')['Age'].mean()
print("Average age for individuals with diabetes by gender:")
print(average_age_by_gender)

# Calculate the average Creatinine (Cr) levels for males and females
average_creatinine_by_gender = diabetes_data.groupby('Gender')['Cr'].mean()
print("\nAverage Creatinine levels for males and females:")
print(average_creatinine_by_gender)

# Calculating the average LDL levels for individuals without diabetes, and with high triglyceride levels (TG > 1.5)
no_diabetes_high_triglyceride = diabetes_data[(diabetes_data['Diagnosis'] == 0) & (diabetes_data['TG'] > 1.5)]
average_ldl_for_high_triglyceride = no_diabetes_high_triglyceride['LDL'].mean()
print("\nAverage LDL levels for individuals without diabetes and high triglyceride levels:", average_ldl_for_high_triglyceride)


Average age for individuals with diabetes by gender:
Gender
F    58.416116
M    58.067206
f    55.000000
Name: Age, dtype: float64

Average Creatinine levels for males and females:
Gender
F    57.859409
M    78.806733
f    34.000000
Name: Cr, dtype: float64

Average LDL levels for individuals without diabetes and high triglyceride levels: 2.9291171003717467


In [4]:
# Grouping the dataset by 'Diagnosis' and calculate the mean for all numerical columns
mean_by_diagnosis = diabetes_data.groupby('Diagnosis').mean(numeric_only=True)
print("Mean values for numerical columns grouped by 'Diagnosis':")
print(mean_by_diagnosis)

# For each BMI category, calculate the average cholesterol level
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif bmi >= 18.5 and bmi < 25:
        return 'Normal weight'
    elif bmi >= 25 and bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'

# With the help of ChatGPT
diabetes_data['BMI Category'] = diabetes_data['BMI'].apply(categorize_bmi)
average_cholesterol_by_bmi = diabetes_data.groupby('BMI Category')['Chol'].mean()
print("\nAverage cholesterol level for each BMI category:")
print(average_cholesterol_by_bmi)

# Group the data by 'Gender' and 'Diagnosis', and finding the average age in each group
average_age_by_gender_diagnosis = diabetes_data.groupby(['Gender', 'Diagnosis'])['Age'].mean()
print("\nAverage age by gender and diagnosis:")
print(average_age_by_gender_diagnosis)


Mean values for numerical columns grouped by 'Diagnosis':
                 Age        BMI      Chol        TG       HDL       LDL  \
Diagnosis                                                                 
0          43.078687  23.156419  4.762265  1.463711  1.337069  2.740436   
1          58.198194  26.908179  5.031656  2.121927  1.996879  3.187675   

                  Cr       BUN  
Diagnosis                       
0          71.991555  4.718308  
1          69.811152  5.178362  

Average cholesterol level for each BMI category:
BMI Category
Normal weight    4.802203
Obese            4.969413
Overweight       4.986071
Underweight      4.436639
Name: Chol, dtype: float64

Average age by gender and diagnosis:
Gender  Diagnosis
F       0            42.849732
        1            58.416116
M       0            43.205344
        1            58.067206
f       1            55.000000
Name: Age, dtype: float64
