# Lung Cancer Dataset:

-------------------------------------------------------

## Opening the Data:

In [13]:
# import statement:

from pathlib import Path

import numpy as np
import pandas as pd

In [14]:
# load the synthesized lung cancer dataset into a DataFrame:

csv_path = "../0.Data/lung_cancer_synthesized.csv"
df = pd.read_csv(csv_path)

In [15]:
# looking at a random sample; the seed is fixed so that the same five rows
# are displayed every time the notebook is re-run from a fresh kernel:

df.sample(5, random_state=42)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,beginning_of_treatment_date,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
2858075,2858075,2858075,2184060,38.0,Male,France,2023-08-08,Stage II,2023-08-09,Yes,Current Smoker,29.5,177,1,0,0,0,Radiation,2024-11-27,0
831804,831804,831804,314002,58.0,Male,Spain,2017-03-02,Stage IV,2017-03-06,Yes,Current Smoker,16.4,171,1,0,0,0,Chemotherapy,2017-11-05,0
1447662,1447662,1447662,1361647,57.0,Male,Netherlands,2019-03-10,Stage I,2019-03-26,No,Passive Smoker,37.7,283,1,0,0,0,Combined,2020-05-10,1
795160,795160,795160,2321783,44.0,Female,Lithuania,2017-01-16,Stage IV,2017-01-21,No,Former Smoker,32.1,270,1,0,0,0,Chemotherapy,2018-04-12,0
1852140,1852140,1852140,2343756,56.0,Female,Spain,2020-06-23,Stage IV,2020-06-28,Yes,Never Smoked,20.6,157,0,0,0,0,Combined,2021-03-10,0


## General Information:

In [16]:
# dataset dimensions, returned as a (number of rows, number of columns) tuple:

df.shape

(3123586, 20)

In [17]:
# printing a summary of the frame: index range, column names and their dtypes

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3123586 entries, 0 to 3123585
Data columns (total 20 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Unnamed: 0.1                 int64  
 1   Unnamed: 0                   int64  
 2   id                           int64  
 3   age                          float64
 4   gender                       object 
 5   country                      object 
 6   diagnosis_date               object 
 7   cancer_stage                 object 
 8   beginning_of_treatment_date  object 
 9   family_history               object 
 10  smoking_status               object 
 11  bmi                          float64
 12  cholesterol_level            int64  
 13  hypertension                 int64  
 14  asthma                       int64  
 15  cirrhosis                    int64  
 16  other_cancer                 int64  
 17  treatment_type               object 
 18  end_treatment_date           object 
 19  

## Cleaning the Data:

Let us check whether the dataset contains any duplicated rows or null values:

In [18]:
# counting rows that are exact copies of an earlier row (all columns compared).
# NOTE(review): 'id' and the index-like 'Unnamed: *' columns are still part of
# the frame at this point, so a full-row comparison presumably can never match;
# confirm whether the check should be restricted to a subset of columns.

duplicate_mask = df.duplicated()
duplicate_mask.sum()

0

In [19]:
# number of missing values in each column:

df.isnull().sum()

Unnamed: 0.1                   0
Unnamed: 0                     0
id                             0
age                            0
gender                         0
country                        0
diagnosis_date                 0
cancer_stage                   0
beginning_of_treatment_date    0
family_history                 0
smoking_status                 0
bmi                            0
cholesterol_level              0
hypertension                   0
asthma                         0
cirrhosis                      0
other_cancer                   0
treatment_type                 0
end_treatment_date             0
survived                       0
dtype: int64

## Fast Transformations:

In [20]:
# parse the diagnosis date, derive the diagnosis year from it and count
# how many records fall into each year:

diagnosis_dates = pd.to_datetime(df['diagnosis_date'])
df['diagnosis_date'] = diagnosis_dates
df['year'] = diagnosis_dates.dt.year
df['year'].value_counts()

year
2022    325498
2021    324990
2023    324467
2019    317969
2015    310774
2018    307012
2020    305953
2016    304854
2017    298011
2014    167221
2024    136837
Name: count, dtype: int64

In [21]:
# build a "YYYY-MM" label from the diagnosis date
# (the column must already hold datetimes for the .dt accessor to work):
year_month_labels = df['diagnosis_date'].dt.strftime('%Y-%m')
df['year_month'] = year_month_labels
df['year_month']

0          2014-06
1          2014-06
2          2014-06
3          2014-06
4          2014-06
            ...   
3123581    2024-06
3123582    2024-06
3123583    2024-06
3123584    2024-06
3123585    2024-06
Name: year_month, Length: 3123586, dtype: object

In [22]:
# parse the end-of-treatment date, derive its year and count the records per year:
end_treatment_dates = pd.to_datetime(df['end_treatment_date'])
df['end_treatment_date'] = end_treatment_dates
df['year_etd'] = end_treatment_dates.dt.year
df['year_etd'].value_counts()

year_etd
2023    325004
2024    324850
2022    319605
2020    314421
2021    310183
2017    305616
2019    304774
2018    300294
2016    291609
2025    204928
2015    107426
2026     14443
2014       433
Name: count, dtype: int64

In [26]:
# listing the column labels currently present in the frame:
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'age', 'gender', 'country',
       'diagnosis_date', 'cancer_stage', 'beginning_of_treatment_date',
       'family_history', 'smoking_status', 'bmi', 'cholesterol_level',
       'hypertension', 'asthma', 'cirrhosis', 'other_cancer', 'treatment_type',
       'end_treatment_date', 'survived', 'year', 'year_month', 'year_etd'],
      dtype='object')

In [27]:
# keep only the columns that are exported, in this order; the 'Unnamed: *'
# columns and the helper columns 'year_month' and 'year_etd' are left out:
columns_to_keep = [
    'id',
    'age',
    'gender',
    'country',
    'diagnosis_date',
    'cancer_stage',
    'beginning_of_treatment_date',
    'family_history',
    'smoking_status',
    'bmi',
    'cholesterol_level',
    'hypertension',
    'asthma',
    'cirrhosis',
    'other_cancer',
    'treatment_type',
    'end_treatment_date',
    'survived',
    'year',
]
df = df[columns_to_keep]

In [28]:
# number of columns left after the selection (second entry of the shape tuple):
df.shape[1]

19

In [23]:
# df.to_csv("lung_cancer.csv")

I will partition the data by diagnosis year so that it can be loaded into the warehouse incrementally!

In [29]:
# Write one CSV per diagnosis year (cancer_<year>.csv) for incremental loading.
#
# - The target folder is built with pathlib, so the cell works on Windows,
#   Linux and macOS alike; a hard-coded "..\\" path only resolves on Windows.
# - Grouping by 'year' covers every year present in the data, so no row is
#   silently left out when the dataset grows beyond a fixed list of years.
# - The folder is created when missing instead of failing on the first write.
output_dir = Path("..") / "0.1.Data Used" / "cancer data"
output_dir.mkdir(parents=True, exist_ok=True)

for year, year_df in df.groupby('year'):
    # index=False keeps the positional index out of the exported file
    year_df.to_csv(output_dir / f"cancer_{year}.csv", index=False)