# Data Analysis of Survival Outcomes, Risk Factors, and Treatment Effectiveness for Pancreatic Cancer Patients

### Summary: 
#### This dataset offers a rich resource for researchers investigating pancreatic cancer. 
#### It enables the development and evaluation of predictive models for survival outcomes, 
#### the exploration of risk factors and treatment effectiveness, and the assessment of healthcare disparities.

### Load Python libraries

In [4]:
from pathlib import Path

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [8]:
# Locate the dataset without tying the notebook to a single machine:
# use a copy in the current working directory when one exists, otherwise
# fall back to the original absolute location so the existing setup keeps working.
DATA_FILENAME = 'pancreatic_cancer_prediction_sample.csv'
LEGACY_DATA_DIR = Path('/Users/osmanrahman/Documents/GitHub/Pancreatic-Cancer-Prediction')

local_csv = Path(DATA_FILENAME)
DATA_PATH = local_csv if local_csv.exists() else LEGACY_DATA_DIR / DATA_FILENAME

# If neither location holds the file, read_csv raises FileNotFoundError for the legacy path.
df = pd.read_csv(DATA_PATH)

In [12]:
# Preview the first five rows to sanity-check the load and eyeball the columns.
df.head(n=5)

Unnamed: 0,Country,Age,Gender,Smoking_History,Obesity,Diabetes,Chronic_Pancreatitis,Family_History,Hereditary_Condition,Jaundice,...,Stage_at_Diagnosis,Survival_Time_Months,Treatment_Type,Survival_Status,Alcohol_Consumption,Physical_Activity_Level,Diet_Processed_Food,Access_to_Healthcare,Urban_vs_Rural,Economic_Status
0,Canada,64,Female,0,0,0,0,0,0,0,...,Stage III,13,Surgery,0,0,Medium,Low,High,Urban,Low
1,South Africa,77,Male,1,1,0,0,0,0,0,...,Stage III,13,Chemotherapy,0,1,Medium,Medium,Medium,Urban,Low
2,India,71,Female,0,0,0,0,0,0,0,...,Stage IV,3,Chemotherapy,1,0,Medium,High,Low,Rural,Middle
3,Germany,56,Male,0,0,0,0,1,0,1,...,Stage IV,6,Radiation,0,1,Low,Low,Medium,Rural,Middle
4,United States,82,Female,0,0,0,0,1,0,0,...,Stage IV,9,Chemotherapy,1,0,Low,Medium,Medium,Rural,Low


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Age,Smoking_History,Obesity,Diabetes,Chronic_Pancreatitis,Family_History,Hereditary_Condition,Jaundice,Abdominal_Discomfort,Back_Pain,Weight_Loss,Development_of_Type2_Diabetes,Survival_Time_Months,Survival_Status,Alcohol_Consumption
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,64.54094,0.29954,0.24826,0.19998,0.0993,0.15168,0.04944,0.19922,0.2965,0.25286,0.34998,0.19622,13.89804,0.12844,0.30346
std,9.973847,0.458061,0.432008,0.399989,0.299067,0.358714,0.216787,0.399418,0.456719,0.434656,0.476968,0.397141,11.272151,0.334582,0.459757
min,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0
50%,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0
75%,71.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,19.0,0.0,1.0
max,90.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,59.0,1.0,1.0


In [14]:
# Print the frame's structure: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Country                        50000 non-null  object
 1   Age                            50000 non-null  int64 
 2   Gender                         50000 non-null  object
 3   Smoking_History                50000 non-null  int64 
 4   Obesity                        50000 non-null  int64 
 5   Diabetes                       50000 non-null  int64 
 6   Chronic_Pancreatitis           50000 non-null  int64 
 7   Family_History                 50000 non-null  int64 
 8   Hereditary_Condition           50000 non-null  int64 
 9   Jaundice                       50000 non-null  int64 
 10  Abdominal_Discomfort           50000 non-null  int64 
 11  Back_Pain                      50000 non-null  int64 
 12  Weight_Loss                    50000 non-null  int64 
 13  D

In [22]:
# Names of the text-valued (object dtype) columns; these are the categorical features.
categorical_columns = df.select_dtypes(include=['object']).columns
print(categorical_columns)

Index(['Country', 'Gender', 'Stage_at_Diagnosis', 'Treatment_Type',
       'Physical_Activity_Level', 'Diet_Processed_Food',
       'Access_to_Healthcare', 'Urban_vs_Rural', 'Economic_Status'],
      dtype='object')


In [42]:
# Names of all numeric columns. Selecting on the generic 'number' dtype rather than
# the hard-coded 'int64' keeps float or other-width integer columns from being
# silently dropped; for this dataset (all numeric columns are int64) the result is unchanged.
numerical_columns = df.select_dtypes(include='number').columns
print(numerical_columns)

Index(['Age', 'Smoking_History', 'Obesity', 'Diabetes', 'Chronic_Pancreatitis',
       'Family_History', 'Hereditary_Condition', 'Jaundice',
       'Abdominal_Discomfort', 'Back_Pain', 'Weight_Loss',
       'Development_of_Type2_Diabetes', 'Survival_Time_Months',
       'Survival_Status', 'Alcohol_Consumption'],
      dtype='object')


In [34]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

Country                          0
Age                              0
Gender                           0
Smoking_History                  0
Obesity                          0
Diabetes                         0
Chronic_Pancreatitis             0
Family_History                   0
Hereditary_Condition             0
Jaundice                         0
Abdominal_Discomfort             0
Back_Pain                        0
Weight_Loss                      0
Development_of_Type2_Diabetes    0
Stage_at_Diagnosis               0
Survival_Time_Months             0
Treatment_Type                   0
Survival_Status                  0
Alcohol_Consumption              0
Physical_Activity_Level          0
Diet_Processed_Food              0
Access_to_Healthcare             0
Urban_vs_Rural                   0
Economic_Status                  0
dtype: int64

In [40]:
# Show the frequency of every level in each categorical column, one table per
# column separated by a blank line, to check class balance and spot odd labels.
for col in categorical_columns:
    print(f'{df[col].value_counts()}\n')

Country
United States     17608
India              7542
Germany            4996
United Kingdom     4970
China              4952
Brazil             2507
Australia          2480
Canada             2475
South Africa       2470
Name: count, dtype: int64

Gender
Male      25962
Female    24038
Name: count, dtype: int64

Stage_at_Diagnosis
Stage IV     19922
Stage III    14968
Stage II     10173
Stage I       4937
Name: count, dtype: int64

Treatment_Type
Chemotherapy    24910
Radiation       15130
Surgery          9960
Name: count, dtype: int64

Physical_Activity_Level
Medium    20038
Low       20001
High       9961
Name: count, dtype: int64

Diet_Processed_Food
Medium    20122
Low       15002
High      14876
Name: count, dtype: int64

Access_to_Healthcare
Medium    25268
High      14839
Low        9893
Name: count, dtype: int64

Urban_vs_Rural
Urban    35003
Rural    14997
Name: count, dtype: int64

Economic_Status
Middle    24881
Low       15127
High       9992
Name: count, dtype: int64



In [None]:
figsize = plt.