<a href="https://colab.research.google.com/github/ranadeepbhuyan/cancer-mri-analysis/blob/main/clinical/ideal_weight_of_patients.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import necessary libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

## Mount Google Drive

In [None]:
# Colab-only step: mount Google Drive at /content/drive, the prefix used by
# the CSV paths defined in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Necessary Paths

In [None]:
# Training CSV; the loading cell reads its age, gender and weight columns.
train_path = r"/content/drive/MyDrive/body weight prediction/heart_data.csv"

# UPENN-GBM clinical info CSV.
# NOTE(review): this path is not read by any cell visible in this notebook — confirm whether it should be.
test_path = r"/content/drive/MyDrive/body weight prediction/UPENN-GBM_clinical_info_v1.1_modified_REAL - UPENN-GBM_clinical_info_v1.1_modified.csv"

# UCSF-PDGM metadata CSV; this is the file the prediction-data cell loads.
test_path2 = r"/content/drive/MyDrive/body weight prediction/UCSF-PDGM-metadata_v3_modeified - UCSF-PDGM-metadata_v2_modeified.csv"

## Load the training data into a DataFrame and extract the required columns

In [None]:
# Load only the columns the rest of the notebook uses. They are selected by
# name instead of by position, so a reordered or extended CSV cannot silently
# feed the wrong columns into the model.
TRAIN_COLUMNS = ['age', 'gender', 'weight']
data_train = pd.read_csv(train_path, usecols=TRAIN_COLUMNS)[TRAIN_COLUMNS].copy()

# Age is stored in days: convert it to whole years. Weight is rounded to a
# whole number as well.
data_train['age'] = (data_train['age'] / 365).round().astype(int)
data_train['weight'] = data_train['weight'].round().astype(int)

data_train.head(20)


Unnamed: 0,age,gender,weight
0,50,2,62
1,55,1,85
2,52,1,64
3,48,2,82
4,48,1,56
5,60,1,67
6,61,1,93
7,62,2,95
8,48,1,71
9,54,1,68


## Taking the average weight of each age and each gender

## Each age contains both genders, so we take the average weight of every (age, gender) group to generalise our dataset.

## Our test set covers ages 19 to 88, so we have to insert some extra rows (random weights with matching ages and genders) into the training dataset to cover the ages of the unseen data.

## Now we collect the row indices of each gender at each age, compute the average weight of each group, and write that average back to the corresponding rows of the training dataset.

In [None]:
def grouping(from_, to_, data=None):
  """Find the rows of people whose age is in [from_, to_], split by gender.

  Args:
    from_: Lower age bound (inclusive).
    to_: Upper age bound (inclusive).
    data: DataFrame with 'age' and 'gender' columns. Defaults to the
      notebook-level `data_train`.

  Returns:
    A tuple `(list_1, list_2)` of index labels: the rows with gender == 1 and
    the rows with gender == 2. Rows with any other gender value appear in
    neither list.
  """
  if data is None:
    data = data_train
  # Boolean masks replace the per-row Python loop; `between` is inclusive on
  # both ends, matching the original `>= from_ and <= to_` test.
  in_range = data['age'].between(from_, to_)
  list_1 = data.index[in_range & (data['gender'] == 1)].tolist()
  list_2 = data.index[in_range & (data['gender'] == 2)].tolist()
  return list_1, list_2

def average(list, data=None):
  """Return the mean 'weight' of the rows whose index labels are in `list`.

  Args:
    list: Index labels of the rows to average. The parameter name shadows the
      builtin and is kept only for backward compatibility.
    data: DataFrame with a 'weight' column. Defaults to the notebook-level
      `data_train`.

  Returns:
    The mean weight as a float, or None when `list` is empty.
  """
  if len(list) == 0:
    return None
  if data is None:
    data = data_train
  # One label-based selection instead of a scalar lookup per row.
  return data.loc[list, 'weight'].mean()

def replacement(list, weight_avg, data):
  """Overwrite the 'weight' of the given rows with the rounded average weight.

  Args:
    list: Index labels of the rows to update. The parameter name shadows the
      builtin and is kept only for backward compatibility.
    weight_avg: Average weight to write; it is rounded to the nearest integer
      with Python's `round`. Not read when `list` is empty, so None is
      acceptable in that case.
    data: DataFrame with a 'weight' column; modified in place.
  """
  # Nothing to update, and `weight_avg` may be None for an empty group.
  if len(list) == 0:
    return
  # Single vectorised assignment instead of one `.loc` write per row.
  data.loc[list, 'weight'] = int(round(weight_avg))

# For every age from 30 to 65 (inclusive), replace each person's weight with
# the rounded mean weight of their gender group at that age.
for age in range(30, 66):
  list1, list2 = grouping(age, age)
  replacement(list1, average(list1), data_train)
  replacement(list2, average(list2), data_train)


In [None]:
# Smallest weight currently in the training frame.
data_train['weight'].min()

59

In [None]:
# Largest weight currently in the training frame.
data_train['weight'].max()

92

## Adding new rows

In [None]:
# Extra rows at ages 19 and 88 for both genders.
new_rows = [{'age': 19, 'gender': 1, 'weight': 60},{'age': 88, 'gender': 1, 'weight': 63},{'age': 19, 'gender': 2, 'weight': 54},{'age': 88, 'gender': 2, 'weight': 68},
            {'age': 19, 'gender': 1, 'weight': 64},{'age': 88, 'gender': 1, 'weight': 66},{'age': 19, 'gender': 2, 'weight': 58},{'age': 88, 'gender': 2, 'weight': 48},
            {'age': 19, 'gender': 1, 'weight': 65},{'age': 88, 'gender': 1, 'weight': 56},{'age': 19, 'gender': 2, 'weight': 52},{'age': 88, 'gender': 2, 'weight': 55},
            {'age': 19, 'gender': 1, 'weight': 70},{'age': 88, 'gender': 1, 'weight': 79},{'age': 19, 'gender': 2, 'weight': 90},{'age': 88, 'gender': 2, 'weight': 50}]
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is
# the supported equivalent.
# Note: re-running this cell appends the rows again, because it reassigns
# data_train from itself.
data_train = pd.concat([data_train, pd.DataFrame(new_rows)], ignore_index=True)
data_train.tail()

  data_train = data_train.append(new_rows, ignore_index=True)


Unnamed: 0,age,gender,weight
70011,88,2,55
70012,19,1,70
70013,88,1,79
70014,19,2,90
70015,88,2,50


In [None]:
# Load the prediction inputs, keeping only age and gender. Columns are chosen
# by name (age first) instead of by position, so the frame does not depend on
# the column order of the CSV.
PREDICT_COLUMNS = ['Age_at_scan_years', 'Gender']
data_predict = pd.read_csv(test_path2, usecols=PREDICT_COLUMNS)[PREDICT_COLUMNS].copy()

# Encode gender as integers: 'F' -> 2, 'M' -> 1. The explicit cast pins the
# dtype instead of relying on `replace` implicitly downcasting an object
# column, and fails loudly if an unexpected gender value is present.
data_predict['Gender'] = data_predict['Gender'].replace({'F': 2, 'M': 1}).astype(int)

# Rename the columns to the names used by the training frame.
data_predict = data_predict.rename(columns={'Age_at_scan_years': 'age', 'Gender': 'gender'})

data_predict.head()


Unnamed: 0,age,gender
0,66,1
1,80,2
2,70,1
3,70,1
4,68,2


In [None]:
# Check for missing values in the training dataset: prints the number of nulls per column.
print(data_train.isnull().sum())


age       0
gender    0
weight    0
dtype: int64


## Splitting data into training and testing

In [None]:
# Target is the 'weight' column; the features are every other column.
y = data_train['weight']
X = data_train.drop(columns='weight')
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

## Using a decision tree classifier to predict the weight

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Model training. random_state is fixed because the tree permutes features at
# each split, so equally good splits can otherwise be resolved differently
# from run to run.
model = DecisionTreeClassifier(max_depth=5, min_samples_leaf=10, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = model.predict(X_test)

# Calculate accuracy
# NOTE(review): for ages 30-65 the target was overwritten with the per-(age, gender)
# mean weight, so this mostly measures how well the tree reproduces that table.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Predict the unseen data. Columns are selected in the training feature order
# so the inputs line up with what the model was fitted on.
y_predict = model.predict(data_predict[X_train.columns])

Accuracy: 0.9342330762639246


In [None]:
# Largest weight predicted for the unseen data.
y_predict.max()

78

In [None]:
# Convert the predicted weights into a DataFrame so they can be written to CSV.
dataframe = pd.DataFrame(y_predict)

## Save the weights and add them to the UPenn clinical dataset

In [None]:
# NOTE(review): `dataframe` holds predictions for the file at test_path2 (UCSF); no cell
# visible in this notebook loads test_path (UPENN) — confirm this file receives the intended predictions.
dataframe.to_csv('weight_upenn.csv')

In [None]:
# Write the predicted weights to weight_ucsf.csv in the current working directory.
dataframe.to_csv('weight_ucsf.csv')