<a href="https://colab.research.google.com/github/shafiqulrehman/AlzheimerCode/blob/main/Alzheimer_Dara_Pre_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


**Data pre-processing pipeline**



1.   Handling null values by replacing them with the mode of their column.
2.   Detecting outliers using the IQR rule and replacing them with the mode of their column.
3. Performing Z-score normalization to bring the features onto a single scale.
4. Removing highly correlated features (absolute correlation > 0.90).





In [None]:
### Replacing null values with the mode of their column
import pandas as pd

# Step 1: Read the CSV file
input_file = '/content/data.csv'
df = pd.read_csv(input_file)

# Step 2: Delete the first column
df = df.iloc[:, 1:]

# Step 3: Check for null values and replace them with the mode
null_columns = df.columns[df.isnull().any()]
for column in null_columns:
    # mode() ignores nulls, so it is empty when the column has no values at all
    modes = df[column].mode()
    if modes.empty:
        print(f"Column {column!r} has no non-null values; left unchanged")
        continue
    # Assign the filled column back to the frame. Calling fillna(inplace=True)
    # on df[column] acts on a temporary Series and is not guaranteed to
    # update df itself.
    df[column] = df[column].fillna(modes.iloc[0])

# Step 4: Store the filtered dataset in a new CSV file
output_file = 'data_free_null.csv'
df.to_csv(output_file, index=False)

print(f"Filtered data saved to {output_file}")


Filtered data saved to data_free_null.csv


In [None]:
### Detecting the outliers in each feature and replacing them by the mode value of that feature
import pandas as pd
import numpy as np

# Suppressing the warning
# (this option is global: it silences pandas' chained-assignment warning
# for every later cell run in the same kernel)
pd.set_option('mode.chained_assignment', None)

# Step 1: Read the CSV file
# Relative path, so the file is read from the same place the null-handling
# cell writes it ('data_free_null.csv' in the working directory).
input_file = 'data_free_null.csv'
df = pd.read_csv(input_file)
# Step 2: Define a function to replace outliers with the mode of the column
def replace_outliers_with_mode(column):
    """Return a copy of ``column`` with IQR outliers replaced by the column's mode.

    A value counts as an outlier when it is below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where ``Q1``/``Q3`` are the 25th/75th percentiles and
    ``IQR = Q3 - Q1``. The mode is computed on the column as passed in
    (outliers included); if several values tie, the first one returned by
    ``Series.mode()`` is used.

    Parameters
    ----------
    column : pandas.Series
        Numeric column to clean. It is not modified.

    Returns
    -------
    pandas.Series
        New Series with the same index as ``column``. If the column has no
        mode (empty or entirely null), an unchanged copy is returned.
    """
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # mode() drops nulls, so it is empty when there is nothing to count
    modes = column.mode()
    if modes.empty:
        return column.copy()

    # mask() builds a new Series instead of writing into the caller's data
    is_outlier = (column < lower_bound) | (column > upper_bound)
    return column.mask(is_outlier, modes.iloc[0])

# Step 3: Clean every numeric feature column; the last column is left out of
# the selection so it is never passed through the outlier replacement.
feature_frame = df.iloc[:, :-1]
numeric_columns = feature_frame.select_dtypes(include=[np.number]).columns
for feature_name in numeric_columns:
    cleaned_feature = replace_outliers_with_mode(df[feature_name])
    df[feature_name] = cleaned_feature

# Step 4: Write the cleaned dataset to a new CSV file (row index not saved)
output_file = 'data_free_null_outlier.csv'
df.to_csv(output_file, index=False)

print(f"Outliers replaced and filtered data saved to {output_file}")


Outliers replaced and filtered data saved to data_free_null_outlier.csv


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset written by the outlier-replacement stage
input_file = 'data_free_null_outlier.csv'
df = pd.read_csv(input_file)

# Split the frame: every column but the last is a feature, the last column
# is kept aside untouched
data = df.iloc[:, :-1]
class_label = df.iloc[:, -1]

# Fit the scaler on the feature columns and transform them in one pass
scaler = StandardScaler()
normalized_data = scaler.fit_transform(data)

# Rebuild a labelled frame from the scaled array, then append the last
# column on the right
normalized_df = pd.concat(
    [pd.DataFrame(normalized_data, columns=data.columns), class_label],
    axis=1,
)

# Persist the standardized dataset (row index not saved)
output_file = 'data_free_null_outlier_normalized.csv'
normalized_df.to_csv(output_file, index=False)

print(f"Z-score normalized data saved to {output_file}")


Z-score normalized data saved to data_free_null_outlier_normalized.csv


In [None]:
import pandas as pd
import numpy as np

# Step 1: Read the CSV file
# Relative path, so the file is read from the same place the normalization
# cell writes it.
csv_file = 'data_free_null_outlier_normalized.csv'  # Replace with your CSV file path
df = pd.read_csv(csv_file)

# Step 2: Compute the correlation matrix, ignoring the last column (class label)
correlation_matrix = df.iloc[:, :-1].corr()

# Step 3: Find highly correlated columns
# Only the strict upper triangle is inspected, so every pair is looked at once
# and a column is flagged only for its correlation with an EARLIER column.
# That keeps the first member of each correlated pair and drops the later one,
# instead of discarding both.
correlation_threshold = 0.90
abs_correlation = correlation_matrix.abs()
earlier_column_mask = np.triu(np.ones(abs_correlation.shape, dtype=bool), k=1)
pairwise_correlation = abs_correlation.where(earlier_column_mask)
highly_correlated = {
    column
    for column in pairwise_correlation.columns
    if (pairwise_correlation[column] > correlation_threshold).any()
}

# Step 4: Remove one of the columns from each highly correlated pair
df_filtered = df.drop(columns=list(highly_correlated))

# Step 5: Save the filtered DataFrame to a new CSV file
filtered_csv_file = 'data_null_outlier_free_normalized_filtered_data.csv'  # Replace with your desired output file path
df_filtered.to_csv(filtered_csv_file, index=False)

# Display the filtered DataFrame
print("Filtered DataFrame:")
print(df_filtered.head())


Filtered DataFrame:
   air_time1  disp_index1  gmrt_in_air1  max_x_extension1  max_y_extension1  \
0   1.046526     1.145182     -0.982342         -1.352980         -0.270435   
1  -1.166959     2.515942     -1.023595          0.111876         -0.020048   
2  -0.160401     0.283561     -0.161713          1.381949         -0.774361   
3  -0.381985     0.283561      0.887062          0.235107          0.712190   
4  -0.297123    -1.063701      0.049315         -1.293352         -1.449206   

   mean_acc_on_paper1  mean_gmrt1  mean_jerk_on_paper1  mean_speed_in_air1  \
0            1.833678   -1.177858             0.760912           -0.960257   
1           -0.375818   -1.224259            -0.738144           -0.967027   
2            0.734186   -0.159928             0.304085            0.055691   
3            0.221535    0.622423             0.633182            1.172486   
4           -0.368987   -0.334366            -0.189763            0.335069   

   num_of_pendown1  ...  max_x_exten

In [None]:
import pandas as pd

# Path of the filtered dataset. Relative, so the file is read from the same
# place the correlation-filter cell writes it.
file_path = 'data_null_outlier_free_normalized_filtered_data.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the number of rows and columns
num_rows, num_columns = df.shape
print("Dataset after removing the highly correlated features (>90%)")
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")


Dataset after removing the highly correlated features (>90%)
Number of rows: 174
Number of columns: 338
