<a href="https://colab.research.google.com/github/ranadeepbhuyan/cancer-mri-analysis/blob/main/clinical/ideal_BMI_of_patients.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import necessary libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used by this notebook live under it
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Necessary Paths

In [None]:
# Training data: read into `data_train` (only the age, gender, weight and BMI columns are kept)
train_path = r"/content/drive/MyDrive/body weight prediction/heart_data_with_BMI - heart_data.csv"

# UPENN-GBM clinical data: read into `data_predict`, the frame the model predicts on
test_path = r"/content/drive/MyDrive/body weight prediction/UPENN-GBM_clinical_info_v1.1_modified_REAL - UPENN-GBM_clinical_info_v1.1_modified_with_weight).csv"

# UCSF-PDGM metadata. NOTE(review): no cell visible in this notebook reads this path - confirm how it is used
test_path2 = r"/content/drive/MyDrive/body weight prediction/UCSF-PDGM-metadata_v3_modeified - UCSF-PDGM-metadata_v2_modeified.csv"

## Load the training data into a DataFrame and extract the required columns

In [None]:
# Read the training CSV; read_csv already returns a DataFrame, so no re-wrapping is needed
train_raw = pd.read_csv(train_path)

# Keep only the columns at these positions (age, gender, weight, BMI in the
# preview of this cell); every other column is not relevant here.
# .copy() gives an independent frame so the column assignments below are unambiguous.
TRAIN_COLUMN_POSITIONS = [2, 3, 5, 6]
data_train = train_raw.iloc[:, TRAIN_COLUMN_POSITIONS].copy()

# Age is given in days: convert it to whole years. Weight is rounded to whole numbers.
data_train['age'] = (data_train['age'] / 365).round().astype(int)
data_train['weight'] = data_train['weight'].round().astype(int)

data_train.head(20)


Unnamed: 0,age,gender,weight,BMI
0,50,2,62,22.0
1,55,1,85,34.9
2,52,1,64,23.5
3,48,2,82,28.7
4,48,1,56,23.0
5,60,1,67,29.4
6,61,1,93,37.7
7,62,2,95,30.0
8,48,1,71,28.4
9,54,1,68,25.3


## Taking the average weight of each age and each gender

## Each age contains two genders, so we take the average weight of each (age, gender) group to generalise our data set.

## For each age, we collect the row indices of each gender, compute the average weight of that group, and write it back to those rows of the training dataset.

In [None]:
#function to find people of each age
def grouping( from_, to_, data=None):
  """Return the row labels, split by gender, of rows whose age is in [from_, to_].

  Args:
    from_: lowest age to include (inclusive).
    to_: highest age to include (inclusive).
    data: DataFrame with 'age' and 'gender' columns. Defaults to the
      notebook-level ``data_train`` when omitted.

  Returns:
    A tuple ``(list_1, list_2)`` of index labels, in row order: rows with
    gender == 1 and rows with gender == 2 respectively.
  """
  if data is None:
    data = data_train
  # One vectorised mask instead of a Python loop doing scalar Series lookups per row
  in_range = data['age'].between(from_, to_)
  gender_1_mask = (in_range & (data['gender'] == 1)).to_numpy()
  gender_2_mask = (in_range & (data['gender'] == 2)).to_numpy()
  # Return real index labels: they are what the label-based lookups downstream expect
  list_1 = data.index[gender_1_mask].tolist()
  list_2 = data.index[gender_2_mask].tolist()
  return list_1, list_2

#function to calculate the average weight
def average(list):
  """Return the mean 'weight' of the ``data_train`` rows with the given labels.

  Args:
    list: row labels of ``data_train``. The name shadows the builtin but is
      kept so the signature stays unchanged for existing callers.

  Returns:
    The sum of the selected weights divided by the number of labels, or
    None when ``list`` is empty.
  """
  if len(list) == 0:
    # An empty group has no average
    return None
  # Single label-based selection instead of one scalar Series lookup per row
  selected_weights = data_train['weight'].loc[list].to_numpy()
  return selected_weights.sum() / len(list)

#function to replace the original weights with average weights
def replacement(list, weight_avg, data):
  """Overwrite 'weight' on the given rows of ``data`` with the rounded average.

  Args:
    list: row labels of ``data`` to update. The name shadows the builtin but
      is kept so the signature stays unchanged for existing callers.
    weight_avg: average weight of the group; not read when ``list`` is empty.
    data: DataFrame with a 'weight' column. It is modified in place.

  Returns:
    None.
  """
  if len(list) == 0:
    # Nothing to update; weight_avg is not touched (it may be None here)
    return
  # One vectorised assignment instead of a separate .loc write per row
  data.loc[list, 'weight'] = int(round(weight_avg))

# For every age from 30 to 65, overwrite each row's weight with the rounded
# mean weight of its gender group at that age
list3 = []
for age in range(30, 66):
  rows_gender_1, rows_gender_2 = grouping(age, age)
  # The two groups are disjoint, so updating one cannot change the other's mean
  replacement(rows_gender_1, average(rows_gender_1), data_train)
  replacement(rows_gender_2, average(rows_gender_2), data_train)


In [None]:
# Show the last rows of data_train after the weight-averaging loop
data_train.tail()

Unnamed: 0,age,gender,weight,BMI
69995,53,2,77,26.9
69996,62,1,73,50.5
69997,52,2,77,31.4
69998,61,1,74,27.1
69999,56,1,73,24.9


## Load and prepare the UPENN-GBM clinical data to predict on

In [None]:
# Read the UPENN-GBM clinical CSV; read_csv already returns a DataFrame
predict_raw = pd.read_csv(test_path)

# Keep only the columns at these positions (age at scan, gender, weight)
PREDICT_COLUMN_POSITIONS = [1, 2, 3]
predict_selected = predict_raw.iloc[:, PREDICT_COLUMN_POSITIONS]

# Move the age column to the front, keeping the other columns in file order
age_first = ['Age_at_scan_years'] + [
    name for name in predict_selected.columns if name != 'Age_at_scan_years'
]

data_predict = (
    predict_selected[age_first]
    # Encode gender as numbers: F -> 2, M -> 1; any other value is left as it is
    .assign(Gender=lambda frame: frame['Gender'].replace({'F': 2, 'M': 1}))
    # Use the same column names as the training frame
    .rename(columns={
        'Age_at_scan_years': 'age',
        'Gender': 'gender',
        'weight(kg)': 'weight',
    })
)
data_predict.head()


Unnamed: 0,age,gender,weight
0,52,2,77
1,61,2,77
2,43,1,71
3,33,1,69
4,53,1,73


In [None]:
# Count the missing values in each column of the training dataset
print(data_train.isnull().sum())


age       0
gender    0
weight    0
BMI       0
dtype: int64


## Splitting data into training and testing sets

In [None]:
# Cast every column to float before building the feature matrix
data_train = data_train.astype(float)

# BMI is the target; all remaining columns are the features
y = data_train['BMI']
X = data_train.drop(columns=['BMI'])
y_array = y.to_numpy()
X_array = X.to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_array,
    y_array,
    test_size=0.2,
    random_state=42,
)


## Using a random forest regressor to predict the BMI

In [None]:
# Fit a random forest regressor on the training split (fixed seed for reproducibility)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Report the mean squared error: a regression error (lower is better), not an accuracy
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Predict the unseen data. The model was fitted on a plain array without feature
# names, so pass a plain float array as well, with the columns explicitly put in the
# training feature order. Passing the DataFrame itself depends on its column order
# happening to match and makes scikit-learn warn about unexpected feature names.
predict_features = data_predict[list(X.columns)].to_numpy(dtype=float)
y_predict = model.predict(predict_features)

Mean Squared Error: 35.76667959635361




In [None]:
# Smallest BMI predicted for the unseen data
y_predict.min()

24.24872481017931

In [None]:
# Wrap the predicted BMI values (the model's target is BMI, not weight) in a
# DataFrame; the single column gets the default label 0
dataframe = pd.DataFrame(y_predict)

## Save the predicted BMI values so they can be added to the UPENN clinical dataset

In [None]:
# Write the predictions, together with the default integer index, to BMI_upenn.csv
dataframe.to_csv('BMI_upenn.csv')

In [None]:
# NOTE(review): `dataframe` is the same object that is written to BMI_upenn.csv. No
# visible cell reads test_path2 or re-runs the prediction for the UCSF data, so on a
# fresh Run All this file holds the UPENN predictions - confirm whether the earlier
# cells are meant to be re-run with the UCSF file before this cell.
dataframe.to_csv('BMI_ucsf.csv')