In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/global-cancer-patients-2015-2024/global_cancer_patients_2015_2024.csv


In [2]:
# 1. Import relevant libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor 
from sklearn.ensemble import RandomForestRegressor # Benchmarking bagging model for comparison of performance
from sklearn.preprocessing import LabelEncoder

# 2. Read the .csv file 

df_cancer = pd.read_csv('/kaggle/input/global-cancer-patients-2015-2024/global_cancer_patients_2015_2024.csv') 

print(df_cancer.head(10)) # Print the first 10 rows in the data frame

print(df_cancer.describe()) # Print high level summary stats: Mean, Std, Min, Max, Quartiles

# 3. Formulate the key hypotheses: 
# 1. Which of the factors contribute the most to Cancer (Target_Severity_Score) ?
# 2. What combination of factors give us the best accuracy of Cancer prediction ?

# 4. Define X and y variables

X = df_cancer[['Age','Year','Genetic_Risk','Air_Pollution','Alcohol_Use','Smoking','Obesity_Level']]
              # 'Treatment_Cost_USD','Survival_Years','Cancer_Type','Cancer_Stage'

y = df_cancer[['Target_Severity_Score']]

# Initial input variable understanding: As there are no categorical variables,XGBoost & LightGBM will be most suited algorithms

# 5. Introduce categorical variables: Alcohol use (1/0) | Smoking (1/0) | Age groups/ Demographics (20-30|30-40|40-50|50+)


     # Create categorical variables for age groups: 20-30, 30-40, 50+ years | Make sure the categories are mutually exclusive!

def categorize_age(age_series):
    age_group = []
    for age in age_series:
        if 20 <= age <= 30:
            age_group.append('20-30')
        elif 31 <= age <= 40:
            age_group.append('30-40')
        elif 41 <= age <= 50:
            age_group.append('40-50')
        elif 51 <= age <= 100:
            age_group.append('50+')
        else:
            age_group.append('Other') #Handle erroneous data or Null values
    return age_group

df_cancer['Age Group'] = categorize_age(df_cancer['Age'])

print(df_cancer.head(10))


    # Create categorical variables for last 2years, 2-5 yrs, and 5+ years | Make sure the categories are mutually exclusive

def categorize_year(year_series):
    year_group = []
    for year in year_series:
        if 2022 <= year <= 2024:
            year_group.append('Last 2 years')
        elif 2019 <= year < 2022:
            year_group.append('2-5 years')
        elif year < 2019:
            year_group.append('5+ years')
        else:
            year_group.append('Other') #Handle erroneous data or Null values
    return year_group

df_cancer['Year Group'] = categorize_year(df_cancer['Year'])

print(df_cancer[['Year','Year Group']].head(20))


# 6. Replace null values with mean for numerical variables so the distribution of the data is not changing. 

    # By inspecting the distribution closely with the help of 25%, 50%, and 75% quartiles, it is a symmetrical distribution

for col in df_cancer.select_dtypes(include=['number']).columns:
    df_cancer[col]=df_cancer[col].fillna(df_cancer[col].mean())

# 7. Replace null values with mean for categorical variables with a string: 'unknown'

for col in df_cancer.select_dtypes(include=['string']).columns:
    df_cancer[col]=df_cancer[col].fillna('unknown')

print(df_cancer.head(10))

# 8. Separate X variables in to numerical and categorical variables

X_numerical = df_cancer[['Genetic_Risk','Air_Pollution','Alcohol_Use','Smoking','Obesity_Level']]

# 'Genetic_Risk','Air_Pollution','Alcohol_Use','Smoking','Obesity_Level']]

X_categorical = df_cancer[['Age Group']]

y = df_cancer[['Target_Severity_Score']] # Predictor variable can be Cancer stage/ Severity whichever makes best sense for the business use case


# 9. Apply Label Encoding to the categorical variables

label_encoders = {} # Initiating the Encoder
for feature in X_categorical:
    X_categorical[feature] = LabelEncoder().fit_transform(X_categorical[feature])


# 10. Combine numerical and categorical variables

X = pd.concat([X_numerical, X_categorical], axis=1)

# Split the datasets in to train and test samples: 80% train | 20% test

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)


# 11. Initiate and train the Extreme Gradient Boost Regressor model

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', # train method
                       n_estimators= 100, # no. of iterations
                       learning_rate= 0.1,
                        max_depth= 3,
                        random_state=42)

xgb_model.fit(X_train,y_train)


#       #Initiate and train the light Gradient Boost Regressor model

# lgbm_model = lgb.LGBMRegressor(objective='regression_l2', # train method
#                         n_estimators= 100, # no. of iterations
#                         learning_rate= 0.1,
#                         max_depth= 3,
#                         random_state=42)
# lgbm_model.fit(X_train,y_train)


     #Initiate and train the Cat Gradient Boost Regressor model

# catboost_model = CatBoostRegressor(objective='RMSE', # train method
#                         iterations= 100, # no. of iterations
#                         learning_rate= 0.1,
#                         depth= 3, # equivalent to max depth
#                         random_seed=42, # equivalent to random state
#                         verbose=0)
# catboost_model.fit(X_train,y_train)


# 12. Model prediction on test sample

y_pred=xgb_model.predict(X_test)

# 13. Model evaluation to understand the best performing out of lightGBM, XGBoost, CatBoost

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Root Mean Squared Error:{rmse}")

mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error:{mae}")

r_squared = r2_score(y_test, y_pred)
print(f"R squared:{r_squared}")


  Patient_ID  Age  Gender Country_Region  Year  Genetic_Risk  Air_Pollution  \
0  PT0000000   71    Male             UK  2021           6.4            2.8   
1  PT0000001   34    Male          China  2021           1.3            4.5   
2  PT0000002   80    Male       Pakistan  2023           7.4            7.9   
3  PT0000003   40    Male             UK  2015           1.7            2.9   
4  PT0000004   43  Female         Brazil  2017           5.1            2.8   
5  PT0000005   22    Male        Germany  2018           9.5            6.4   
6  PT0000006   41    Male         Canada  2021           5.1            8.2   
7  PT0000007   72  Female         Canada  2018           6.0            8.2   
8  PT0000008   21    Male            USA  2022           4.3            3.8   
9  PT0000009   49  Female         Canada  2016           8.1            0.8   

   Alcohol_Use  Smoking  Obesity_Level Cancer_Type Cancer_Stage  \
0          9.5      0.9            8.7        Lung    Stage III

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_categorical[feature] = LabelEncoder().fit_transform(X_categorical[feature])
