1. Loading and Inspecting the Data

In [10]:
import pandas as pd

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Display basic information about the dataset
data_info = data.info()
print(data_info)

# Display the first few rows to understand the structure
data_head = data.head()
print(data_head)

# Check for missing values
missing_values = data.isnull().sum()
print("\nMissing Values:\n", missing_values)

# Explore unique values in categorical columns
unique_values_gender = data['gender'].unique()
unique_values_smoking_history = data['smoking_history'].unique()

print("\nUnique Values in 'gender':", unique_values_gender)
print("Unique Values in 'smoking_history':", unique_values_smoking_history)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB
None
   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0      

2. Handling Categorical Columns

In [11]:
from sklearn.preprocessing import LabelEncoder

# Convert the 'gender' column using Label Encoding
label_encoder = LabelEncoder()
data['gender'] = label_encoder.fit_transform(data['gender'])

# Display the transformed data to verify changes
print("\nTransformed Data (Gender Encoded):\n", data.head())



Transformed Data (Gender Encoded):
    gender   age  hypertension  heart_disease smoking_history    bmi  \
0       0  80.0             0              1           never  25.19   
1       0  54.0             0              0         No Info  27.32   
2       1  28.0             0              0           never  27.32   
3       0  36.0             0              0         current  23.45   
4       1  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  


In [12]:
# Replace 'No Info' with 'Unknown' and use one-hot encoding
data['smoking_history'] = data['smoking_history'].replace('No Info', 'Unknown')
data = pd.get_dummies(data, columns=['smoking_history'])

# Display the data to verify changes
print("\nData After Handling 'smoking_history':\n", data.head())



Data After Handling 'smoking_history':
    gender   age  hypertension  heart_disease    bmi  HbA1c_level  \
0       0  80.0             0              1  25.19          6.6   
1       0  54.0             0              0  27.32          6.6   
2       1  28.0             0              0  27.32          5.7   
3       0  36.0             0              0  23.45          5.0   
4       1  76.0             1              1  20.14          4.8   

   blood_glucose_level  diabetes  smoking_history_Unknown  \
0                  140         0                    False   
1                   80         0                     True   
2                  158         0                    False   
3                  155         0                    False   
4                  155         0                    False   

   smoking_history_current  smoking_history_ever  smoking_history_former  \
0                    False                 False                   False   
1                    False     

3. Preparing Data for Model Training

In [13]:
# Prepare features and target variable
X = data.drop('diabetes', axis=1)
y = data['diabetes']

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


4. Build and Train the Random Forest Model

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Build and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)


5. Evaluate the Model

In [15]:
from sklearn.metrics import classification_report, accuracy_score

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"\nAccuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", report)



Accuracy: 97.02%

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     18292
           1       0.94      0.69      0.80      1708

    accuracy                           0.97     20000
   macro avg       0.96      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000

