# PRE-PROCESSING :

In [2]:
# imports

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn import tree 
from sklearn.preprocessing import MinMaxScaler
import numpy as np

from scipy.stats import zscore


In [3]:
# Read the raw CSV once. `data` holds the frame as loaded; `df` is the frame
# the cleaning cells below work from.
raw_csv_path = 'diabetes_012_health_indicators_502Rows.csv'
data = pd.read_csv(raw_csv_path)
df = pd.DataFrame(data)

## Cleaning : Outlier

We detect outliers to find data points that are unusually far from the majority of the data, which could affect our analysis or predictions.
Then we remove rows with outliers to ensure that they don't distort the analysis or affect the performance of our models, as outliers can skew statistical measures and predictions.

In [4]:
# Outlier screening: a row is flagged when at least one of the monitored
# columns has an absolute z-score above `threshold`.

columns_to_detect_outliers = ['BMI', 'Age', 'GenHlth','Education']
threshold = 2

# Column-wise z-scores for the monitored columns
z_scores = df[columns_to_detect_outliers].apply(zscore)

# Boolean frame: True wherever |z| exceeds the threshold
exceeds_threshold = z_scores.abs().gt(threshold)

# Rows of the original frame with at least one flagged value
outliers = df.loc[exceeds_threshold.any(axis=1)]

print("Outliers based on z-scores for the selected columns: \n")
display(outliers)

Outliers based on z-scores for the selected columns: 



Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education
10,0,0,0,1,28,0,0,0,1,1,0,1,0,1,0,0,2,5
13,0,1,1,1,45,0,0,0,0,1,1,1,0,3,0,1,5,6
17,0,0,0,1,27,0,0,0,0,0,1,1,0,1,0,0,3,6
18,0,0,0,0,23,0,0,0,0,0,1,1,0,2,0,0,2,6
21,0,1,1,1,38,1,0,0,0,1,1,1,0,5,1,0,13,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479,2,1,0,1,23,1,0,0,1,1,1,1,0,1,0,0,6,4
481,2,1,1,1,24,1,1,1,1,1,1,1,0,4,0,0,12,2
490,2,1,1,1,47,1,0,0,0,1,0,1,0,3,1,0,11,6
495,2,1,0,1,33,0,0,0,1,0,1,1,0,4,1,0,11,2


In [5]:
# Keep only the rows that were not flagged as outliers
df_no_outlier = df.drop(index=outliers.index)

# Number of rows that remain after the outliers are dropped
df_no_outlierRows = len(df_no_outlier)

print("After removing outliers from the selected columns \n ")

display(df_no_outlier.head(14))

print(f"\nnumber of rows after remove outliers: \n{df_no_outlierRows}\n")

After removing outliers from the selected columns 
 


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education
0,0,1,1,1,40,1,0,0,0,0,1,1,0,5,1,0,9,4
1,0,0,0,0,25,1,0,0,1,0,0,0,1,3,0,0,7,6
2,0,1,1,1,28,0,0,0,0,1,0,1,1,5,1,0,9,4
3,0,1,0,1,27,0,0,0,1,1,1,1,0,2,0,0,11,3
4,0,1,1,1,24,0,0,0,1,1,1,1,0,2,0,0,11,5
5,0,1,1,1,25,1,0,0,1,1,1,1,0,2,0,1,10,6
6,0,1,0,1,30,1,0,0,0,0,0,1,0,3,0,0,9,6
7,0,1,1,1,25,1,0,0,1,0,1,1,0,3,1,0,11,4
8,0,1,0,1,23,0,0,0,0,1,1,1,0,3,0,1,7,5
9,0,0,0,1,24,0,0,0,0,0,1,1,0,2,0,1,8,4



number of rows after remove outliers: 
375



## Data Transformation: Discretization

We discretize the 'BMI' values into a small number of bins for easier interpretation and analysis, which facilitates pattern recognition and comparison between different BMI ranges.

In [6]:

# Discretization settings: the column to bin, the number of equal-width bins,
# and the name of the column that will hold the bin index.
columns_to_Discretize='BMI'
binsN=6
disCol='Discretized_'+ columns_to_Discretize

# Replace the raw column with its bin index (labels=False makes pd.cut return
# integer bin codes instead of interval labels).
# The guard makes the cell safe to re-run: once the raw column has been
# replaced there is nothing left to discretize, so the step is skipped instead
# of raising a KeyError. The frame is rebuilt rather than mutated in place.
if columns_to_Discretize in df_no_outlier.columns:
    df_no_outlier = (
        df_no_outlier
        .assign(**{disCol: pd.cut(df_no_outlier[columns_to_Discretize], bins=binsN, labels=False)})
        .drop(columns=columns_to_Discretize)
    )

# Show the raw values next to their discretized counterparts
print("Original Data:\n")
display(df[[columns_to_Discretize]].head())

print ("\nDiscretized Data:\n")
display(df_no_outlier[[disCol]].head())

Original Data:



Unnamed: 0,BMI
0,40
1,25
2,28
3,27
4,24



Discretized Data:



Unnamed: 0,Discretized_BMI
0,5
1,1
2,2
3,2
4,1


## Data Transformation: Normalization

We normalize columns to bring their values to a similar scale, making comparisons and analyses fairer and more accurate, especially for machine learning models that are sensitive to the scale of features.

In [7]:

# Min-max normalization of every column except the first one; the first
# column (the target) is left untouched.

# Columns to rescale and the corresponding slice of the frame
columns_to_normalize = df_no_outlier.columns[1:]
data_to_normalize = df_no_outlier[columns_to_normalize]

# Min-Max scaling maps each selected column onto the [0, 1] range
normalized_data_minmax = MinMaxScaler().fit_transform(data_to_normalize)

# Write the scaled values back into the working frame
df_no_outlier[columns_to_normalize] = normalized_data_minmax

# The label names all normalized columns: the display below shows every one of
# them, and the raw BMI column no longer exists at this point.
print ("Columns after normalization:  \n ")
display (df_no_outlier[columns_to_normalize].head())

BMI column after normalize:  
 


Unnamed: 0,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Discretized_BMI
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.6,0.333333,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.4,1.0,0.2
2,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.6,0.333333,0.4
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.8,0.0,0.4
4,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.8,0.666667,0.2


## Integration: Correlation 

 Remove the highly correlated attributes to prevent multicollinearity issues and enhance the efficiency of predictive models by reducing redundancy in the dataset.


In [8]:

# Drop one attribute out of every pair whose absolute correlation reaches the
# threshold.

# Pairwise correlation matrix of the working frame
correlation_matrix = df_no_outlier.corr()

# Pairs with |correlation| at or above this value count as highly correlated
correlation_threshold = 0.75

# Only the strict upper triangle (i < j) is inspected: this skips the diagonal
# (every column correlates perfectly with itself) and visits each unordered
# pair exactly once, so a symmetric (i, j) / (j, i) pair cannot cause both
# attributes to be removed.
strong_pairs_mask = np.triu(
    np.abs(correlation_matrix.to_numpy()) >= correlation_threshold, k=1
)
highly_correlated_pairs = np.where(strong_pairs_mask)

attributes_to_remove = set()

for i, j in zip(*highly_correlated_pairs):
    # Resolve positions to names using the correlation matrix's own labels, so
    # the names always match the frame the matrix was computed from.
    attribute_i = correlation_matrix.columns[i]
    attribute_j = correlation_matrix.columns[j]
    # Skip pairs where one side is already scheduled for removal: dropping
    # that side already breaks the pair.
    if attribute_i not in attributes_to_remove and attribute_j not in attributes_to_remove:
        attributes_to_remove.add(attribute_j)

# Remove the highly correlated attributes (sorted list gives a stable order)
dfCorr = df_no_outlier.drop(columns=sorted(attributes_to_remove))

# The frames differ exactly when at least one attribute was removed
if attributes_to_remove:
    print("Original DataFrame:")
    display(df_no_outlier)
    print("\nDataFrame after removing highly correlated attributes:")
    display(dfCorr)
else:
    print ("\nno highly correlated attributes \n")


no highly correlated attributes 



## Feature Selection

We identify the most relevant features, i.e. those with the strongest statistical relationship to the target variable (scored here with the ANOVA F-test), to help improve model performance.

In [11]:
 from sklearn.feature_selection import SelectKBest,f_classif

# Univariate feature selection: every feature is scored against the target
# with f_classif (ANOVA F-test) and the two best-scoring ones are kept.
# NOTE(review): this cell reads the raw `data` frame, not the preprocessed
# one — confirm that this is intended.

# First column is the target; all remaining columns are candidate features
y, X = data.iloc[:, 0], data.iloc[:, 1:]

# Fit the selector (top 2 features), then project X onto the kept features
selector = SelectKBest(f_classif, k=2)
X_new = selector.fit(X, y).transform(X)

# Column names of the features the selector kept
kept_mask = selector.get_support()
selected_features = X.columns[kept_mask]

print("the features with the highest correlation with the target variable: \nSelected Features:", selected_features)



the features with the highest correlation with the target variable: 
Selected Features: Index(['HighBP', 'GenHlth'], dtype='object')


In [None]:
# Write the preprocessed frame to a new CSV file, without the row index
dfCorr.to_csv(path_or_buf='preprocessed_Diabetes.csv', index=False)