In [2]:
import pandas as pd
from sqlalchemy import create_engine
import sqlalchemy as sa
import os

In [3]:
connection_url = f"mysql+pymysql://{os.environ['TEST_DB_USERNAME']}:{os.environ['TEST_DB_PASSWORD']}@{os.environ['TEST_DB_HOSTNAME']}/{os.environ['TEST_DB_DATABASE_NAME']}"
db_engine = create_engine(connection_url)

In [4]:
try:
    with db_engine.connect() as connection:
        print("Connection to MySQL database successful!")
except Exception as e:
    print(f"Error: {e}")

Connection to MySQL database successful!


In [38]:
df = pd.read_sql("SELECT * FROM lung_cancer", con=db_engine)
df.shape

(838216, 19)

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 838216 entries, 0 to 838215
Data columns (total 19 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           838216 non-null  int64  
 1   age                          838216 non-null  float64
 2   gender                       838216 non-null  object 
 3   country                      838216 non-null  object 
 4   diagnosis_date               838216 non-null  object 
 5   cancer_stage                 838216 non-null  object 
 6   beginning_of_treatment_date  838216 non-null  object 
 7   family_history               838216 non-null  object 
 8   smoking_status               838216 non-null  object 
 9   bmi                          838216 non-null  float64
 10  cholesterol_level            838216 non-null  int64  
 11  hypertension                 838216 non-null  int64  
 12  asthma                       838216 non-null  int64  
 13 

In [40]:
wh_lung_1 = df.loc[df['year'] == 2014]

In [41]:
wh_lung_1['year'].value_counts()

year
2014    187482
Name: count, dtype: int64

In [42]:
wh_lung_1 = wh_lung_1.drop(columns=['beginning_of_treatment_date', 'family_history', 'smoking_status', 'bmi', 'cholesterol_level', 'other_cancer', 'year'])

In [43]:
wh_lung_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 187482 entries, 0 to 187481
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  187482 non-null  int64  
 1   age                 187482 non-null  float64
 2   gender              187482 non-null  object 
 3   country             187482 non-null  object 
 4   diagnosis_date      187482 non-null  object 
 5   cancer_stage        187482 non-null  object 
 6   hypertension        187482 non-null  int64  
 7   asthma              187482 non-null  int64  
 8   cirrhosis           187482 non-null  int64  
 9   treatment_type      187482 non-null  object 
 10  end_treatment_date  187482 non-null  object 
 11  survived            187482 non-null  int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 18.6+ MB


In [44]:
wh_lung_1['diagnosis_date'] = pd.to_datetime(wh_lung_1['diagnosis_date'])
wh_lung_1['end_treatment_date'] = pd.to_datetime(wh_lung_1['end_treatment_date'])

wh_lung_1['diagnosis_date'] = wh_lung_1['diagnosis_date'].dt.strftime('%Y-%m')
wh_lung_1['end_treatment_date'] = wh_lung_1['end_treatment_date'].dt.strftime('%Y-%m')

wh_lung_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 187482 entries, 0 to 187481
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  187482 non-null  int64  
 1   age                 187482 non-null  float64
 2   gender              187482 non-null  object 
 3   country             187482 non-null  object 
 4   diagnosis_date      187482 non-null  object 
 5   cancer_stage        187482 non-null  object 
 6   hypertension        187482 non-null  int64  
 7   asthma              187482 non-null  int64  
 8   cirrhosis           187482 non-null  int64  
 9   treatment_type      187482 non-null  object 
 10  end_treatment_date  187482 non-null  object 
 11  survived            187482 non-null  int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 18.6+ MB


In [45]:
wh_lung_1['age'] = wh_lung_1['age'].astype(int)

In [46]:
country_codes = {
    "Latvia": "LV",
    "Malta": "MT",
    "Cyprus": "CY",
    "Denmark": "DK",
    "Greece": "GR",
    "Italy": "IT",
    "Belgium": "BE",
    "Czech Republic": "CZ",
    "Croatia": "HR",
    "Sweden": "SE",
    "Estonia": "EE",
    "Germany": "DE",
    "Finland": "FI",
    "Lithuania": "LT",
    "Spain": "ES",
    "Luxembourg": "LU",
    "Bulgaria": "BG",
    "Poland": "PL",
    "Romania": "RO",
    "Austria": "AT",
    "Slovakia": "SK",
    "Netherlands": "NL",
    "Ireland": "IE",
    "France": "FR",
    "Hungary": "HU",
    "Portugal": "PT",
    "Slovenia": "SI"
}

wh_lung_1['CountryCode'] = wh_lung_1['country'].map(country_codes)

In [47]:
wh_lung_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 187482 entries, 0 to 187481
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   id                  187482 non-null  int64 
 1   age                 187482 non-null  int32 
 2   gender              187482 non-null  object
 3   country             187482 non-null  object
 4   diagnosis_date      187482 non-null  object
 5   cancer_stage        187482 non-null  object
 6   hypertension        187482 non-null  int64 
 7   asthma              187482 non-null  int64 
 8   cirrhosis           187482 non-null  int64 
 9   treatment_type      187482 non-null  object
 10  end_treatment_date  187482 non-null  object
 11  survived            187482 non-null  int64 
 12  CountryCode         187482 non-null  object
dtypes: int32(1), int64(5), object(7)
memory usage: 19.3+ MB


------------------------------------------

In [5]:
air_data = pd.read_parquet("..\\..\\0.Data\\European Enviroment Agency Air Data\\per month mean\\2014.parquet")
air_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 838216 entries, 0 to 838215
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   id                  838216 non-null  object
 1   age                 838216 non-null  object
 2   gender              838216 non-null  object
 3   country             838216 non-null  object
 4   diagnosis_date      838216 non-null  object
 5   end_treatment_date  838216 non-null  object
 6   cancer_stage        838216 non-null  object
 7   hypertension        838216 non-null  object
 8   asthma              838216 non-null  object
 9   cirrhosis           838216 non-null  object
 10  treatment_type      838216 non-null  object
 11  survived            838216 non-null  object
dtypes: object(12)
memory usage: 76.7+ MB


In [6]:
air_data['diagnosis_date'][0]

'diagnosis_date'

In [49]:
pivot_df_2014 = air_data.pivot_table(index=['CountryCode', 'year_month'], columns='PollutantName', values='Value').reset_index()
pivot_df_2014.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324 entries, 0 to 323
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype    
---  ------       --------------  -----    
 0   CountryCode  324 non-null    object   
 1   year_month   324 non-null    period[M]
 2   NO2          282 non-null    float64  
 3   O3           255 non-null    float64  
 4   PM10         312 non-null    float64  
 5   PM2.5        299 non-null    float64  
 6   SO2          276 non-null    float64  
dtypes: float64(5), object(1), period[M](1)
memory usage: 17.8+ KB


In [50]:
if 'BaP' in pivot_df_2014.columns:
    pivot_df_2014 = pivot_df_2014.drop(columns=['BaP'])
else:
    print('Column BaP not present already')

Column BaP not present already


In [51]:
def calculate_aqi(concentration, breakpoints):
    for bp in breakpoints:
        if bp['low_conc'] <= concentration <= bp['high_conc']:
            aqi = ((bp['high_aqi'] - bp['low_aqi']) / (bp['high_conc'] - bp['low_conc'])) * (concentration - bp['low_conc']) + bp['low_aqi']
            return aqi
    return None  # If concentration is outside defined ranges

# Define the European AQI breakpoints for each pollutant

# O3 (8-hour maximum) - European Breakpoints
o3_breakpoints = [
    {'low_conc': 0, 'high_conc': 60, 'low_aqi': 0, 'high_aqi': 50},
    {'low_conc': 61, 'high_conc': 120, 'low_aqi': 51, 'high_aqi': 100},
    {'low_conc': 121, 'high_conc': 180, 'low_aqi': 101, 'high_aqi': 150},
    {'low_conc': 181, 'high_conc': 240, 'low_aqi': 151, 'high_aqi': 200},
    {'low_conc': 241, 'high_conc': 300, 'low_aqi': 201, 'high_aqi': 300},
    # Add more breakpoints as needed
]

# PM10 (24-hour) - European Breakpoints
pm10_breakpoints = [
    {'low_conc': 0, 'high_conc': 20, 'low_aqi': 0, 'high_aqi': 50},
    {'low_conc': 21, 'high_conc': 35, 'low_aqi': 51, 'high_aqi': 100},
    {'low_conc': 36, 'high_conc': 50, 'low_aqi': 101, 'high_aqi': 150},
    {'low_conc': 51, 'high_conc': 100, 'low_aqi': 151, 'high_aqi': 200},
    # Add more breakpoints as needed
]

# PM2.5 (24-hour) - European Breakpoints
pm25_breakpoints = [
    {'low_conc': 0, 'high_conc': 10, 'low_aqi': 0, 'high_aqi': 50},
    {'low_conc': 11, 'high_conc': 20, 'low_aqi': 51, 'high_aqi': 100},
    {'low_conc': 21, 'high_conc': 25, 'low_aqi': 101, 'high_aqi': 150},
    {'low_conc': 26, 'high_conc': 50, 'low_aqi': 151, 'high_aqi': 200},
    # Add more breakpoints as needed
]

# NO2 (1-hour) - European Breakpoints
no2_breakpoints = [
    {'low_conc': 0, 'high_conc': 40, 'low_aqi': 0, 'high_aqi': 50},
    {'low_conc': 41, 'high_conc': 90, 'low_aqi': 51, 'high_aqi': 100},
    {'low_conc': 91, 'high_conc': 120, 'low_aqi': 101, 'high_aqi': 150},
    {'low_conc': 121, 'high_conc': 230, 'low_aqi': 151, 'high_aqi': 200},
    # Add more breakpoints as needed
]

# SO2 (1-hour) - European Breakpoints
so2_breakpoints = [
    {'low_conc': 0, 'high_conc': 100, 'low_aqi': 0, 'high_aqi': 50},
    {'low_conc': 101, 'high_conc': 200, 'low_aqi': 51, 'high_aqi': 100},
    {'low_conc': 201, 'high_conc': 350, 'low_aqi': 101, 'high_aqi': 150},
    {'low_conc': 351, 'high_conc': 500, 'low_aqi': 151, 'high_aqi': 200},
    # Add more breakpoints as needed
]


# Calculate AQI for each pollutant
pivot_df_2014['AQI_O3'] = pivot_df_2014['O3'].apply(calculate_aqi, breakpoints=o3_breakpoints)
pivot_df_2014['AQI_PM10'] = pivot_df_2014['PM10'].apply(calculate_aqi, breakpoints=pm10_breakpoints)
pivot_df_2014['AQI_PM2.5'] = pivot_df_2014['PM2.5'].apply(calculate_aqi, breakpoints=pm25_breakpoints)
pivot_df_2014['AQI_NO2'] = pivot_df_2014['NO2'].apply(calculate_aqi, breakpoints=no2_breakpoints)
pivot_df_2014['AQI_SO2'] = pivot_df_2014['SO2'].apply(calculate_aqi, breakpoints=so2_breakpoints)

# Get the maximum AQI for each row
pivot_df_2014['Max_AQI'] = pivot_df_2014[['AQI_O3', 'AQI_PM10', 'AQI_PM2.5', 'AQI_NO2', 'AQI_SO2']].max(axis=1)

pivot_df_2014

PollutantName,CountryCode,year_month,NO2,O3,PM10,PM2.5,SO2,AQI_O3,AQI_PM10,AQI_PM2.5,AQI_NO2,AQI_SO2,Max_AQI
0,AT,2014-01,29.231075,25.134155,25.188877,22.421975,4.114651,20.945129,65.661070,118.419188,36.538843,2.057326,118.419188
1,AT,2014-02,27.898785,41.782436,21.319890,16.914925,3.967511,34.818697,52.119614,83.203481,34.873481,1.983755,83.203481
2,AT,2014-03,25.692867,58.045856,25.386487,21.047230,3.838447,48.371547,66.352705,101.578570,32.116084,1.919224,101.578570
3,AT,2014-04,19.954027,63.748873,19.402508,14.717699,2.820726,53.282962,48.506271,71.240808,24.942533,1.410363,71.240808
4,AT,2014-05,14.772853,71.470304,12.695772,7.020212,2.283651,59.695676,31.739430,35.101062,18.466067,1.141826,59.695676
...,...,...,...,...,...,...,...,...,...,...,...,...,...
319,SK,2014-08,23.830122,65.405811,17.571161,11.341726,5.865641,54.659064,43.927903,52.860508,29.787653,2.932821,54.659064
320,SK,2014-09,30.501350,56.172629,25.769819,16.631649,6.628430,46.810524,67.694368,81.661198,38.126688,3.314215,81.661198
321,SK,2014-10,24.890726,42.558965,29.673672,20.395317,7.817794,35.465804,81.357853,,31.113408,3.908897,81.357853
322,SK,2014-11,18.138012,37.293543,36.716843,29.018406,8.584744,31.077953,103.508949,157.162579,22.672515,4.292372,157.162579


In [52]:
def classify_air_quality(aqi):
    if aqi <= 50:
        return 'Good'
    elif 51 <= aqi <= 100:
        return 'Moderate'
    elif 101 <= aqi <= 150:
        return 'Unhealthy for Sensitive Groups'
    elif 151 <= aqi <= 200:
        return 'Unhealthy'
    elif 201 <= aqi <= 300:
        return 'Very Unhealthy'
    else:
        return 'Hazardous'
    
pivot_df_2014['Air_Quality'] = pivot_df_2014['Max_AQI'].apply(classify_air_quality)
pivot_df_2014

PollutantName,CountryCode,year_month,NO2,O3,PM10,PM2.5,SO2,AQI_O3,AQI_PM10,AQI_PM2.5,AQI_NO2,AQI_SO2,Max_AQI,Air_Quality
0,AT,2014-01,29.231075,25.134155,25.188877,22.421975,4.114651,20.945129,65.661070,118.419188,36.538843,2.057326,118.419188,Unhealthy for Sensitive Groups
1,AT,2014-02,27.898785,41.782436,21.319890,16.914925,3.967511,34.818697,52.119614,83.203481,34.873481,1.983755,83.203481,Moderate
2,AT,2014-03,25.692867,58.045856,25.386487,21.047230,3.838447,48.371547,66.352705,101.578570,32.116084,1.919224,101.578570,Unhealthy for Sensitive Groups
3,AT,2014-04,19.954027,63.748873,19.402508,14.717699,2.820726,53.282962,48.506271,71.240808,24.942533,1.410363,71.240808,Moderate
4,AT,2014-05,14.772853,71.470304,12.695772,7.020212,2.283651,59.695676,31.739430,35.101062,18.466067,1.141826,59.695676,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319,SK,2014-08,23.830122,65.405811,17.571161,11.341726,5.865641,54.659064,43.927903,52.860508,29.787653,2.932821,54.659064,Moderate
320,SK,2014-09,30.501350,56.172629,25.769819,16.631649,6.628430,46.810524,67.694368,81.661198,38.126688,3.314215,81.661198,Moderate
321,SK,2014-10,24.890726,42.558965,29.673672,20.395317,7.817794,35.465804,81.357853,,31.113408,3.908897,81.357853,Moderate
322,SK,2014-11,18.138012,37.293543,36.716843,29.018406,8.584744,31.077953,103.508949,157.162579,22.672515,4.292372,157.162579,Unhealthy


In [63]:
pivot_df_2014 = pivot_df_2014.drop(['NO2', 'O3', 'PM10', 'PM2.5', 'SO2'], axis=1)

PollutantName,CountryCode,year_month,AQI_O3,AQI_PM10,AQI_PM2.5,AQI_NO2,AQI_SO2,Max_AQI,Air_Quality
0,AT,2014-01,20.945129,65.661070,118.419188,36.538843,2.057326,118.419188,Unhealthy for Sensitive Groups
1,AT,2014-02,34.818697,52.119614,83.203481,34.873481,1.983755,83.203481,Moderate
2,AT,2014-03,48.371547,66.352705,101.578570,32.116084,1.919224,101.578570,Unhealthy for Sensitive Groups
3,AT,2014-04,53.282962,48.506271,71.240808,24.942533,1.410363,71.240808,Moderate
4,AT,2014-05,59.695676,31.739430,35.101062,18.466067,1.141826,59.695676,Moderate
...,...,...,...,...,...,...,...,...,...
319,SK,2014-08,54.659064,43.927903,52.860508,29.787653,2.932821,54.659064,Moderate
320,SK,2014-09,46.810524,67.694368,81.661198,38.126688,3.314215,81.661198,Moderate
321,SK,2014-10,35.465804,81.357853,,31.113408,3.908897,81.357853,Moderate
322,SK,2014-11,31.077953,103.508949,157.162579,22.672515,4.292372,157.162579,Unhealthy


In [66]:
pivot_df_2014['year_month'] = pivot_df_2014['year_month'].astype(str)

In [68]:
merged_df = wh_lung_1.merge(pivot_df_2014, left_on=['CountryCode', 'diagnosis_date'], right_on=['CountryCode', 'year_month'], how='inner')
merged_df

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,hypertension,asthma,cirrhosis,treatment_type,...,survived,CountryCode,year_month,AQI_O3,AQI_PM10,AQI_PM2.5,AQI_NO2,AQI_SO2,Max_AQI,Air_Quality
0,20,56,Male,Lithuania,2014-07,Stage IV,1,0,0,Combined,...,0,LT,2014-07,44.699282,70.292333,39.800000,22.557303,0.936812,70.292333,Moderate
1,30,36,Male,Italy,2014-07,Stage IV,1,0,0,Combined,...,1,IT,2014-07,59.572171,44.930715,46.968776,22.093366,2.030453,59.572171,Moderate
2,33,61,Male,Denmark,2014-10,Stage I,1,0,0,Radiation,...,0,DK,2014-10,,62.570372,77.522894,27.127633,0.851263,77.522894,Moderate
3,46,56,Male,Sweden,2014-08,Stage III,0,0,0,Combined,...,0,SE,2014-08,48.417723,31.988215,31.541102,27.102650,0.610605,48.417723,Good
4,54,51,Male,Austria,2014-08,Stage I,1,1,0,Combined,...,0,AT,2014-08,45.687438,27.843433,29.641690,18.575604,1.154375,45.687438,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187477,3249964,65,Female,Hungary,2014-10,Stage II,0,0,0,Combined,...,0,HU,2014-10,,80.019868,164.854167,5.228226,0.760274,164.854167,Unhealthy
187478,3249970,43,Male,Netherlands,2014-10,Stage III,1,1,0,Surgery,...,0,NL,2014-10,19.011403,49.288220,49.662521,45.004958,2.968721,49.662521,Good
187479,3249972,66,Female,Czech Republic,2014-10,Stage III,0,1,0,Combined,...,0,CZ,2014-10,22.786742,71.710212,130.945211,27.717802,2.848693,130.945211,Unhealthy for Sensitive Groups
187480,3249976,48,Male,Cyprus,2014-07,Stage IV,0,0,0,Surgery,...,1,CY,2014-07,,102.840323,99.622401,,,102.840323,Unhealthy for Sensitive Groups


In [73]:
final_table = merged_df[['id', 'age', 'gender', 'country', 'diagnosis_date', 'end_treatment_date', 'cancer_stage', 'hypertension', 'asthma', 'cirrhosis', 'treatment_type', 'survived', 'AQI_PM2.5', 'AQI_PM10',  'AQI_NO2', 'AQI_SO2', 'AQI_O3', 'Max_AQI', 'Air_Quality']]

In [74]:
final_table

Unnamed: 0,id,age,gender,country,diagnosis_date,end_treatment_date,cancer_stage,hypertension,asthma,cirrhosis,treatment_type,survived,AQI_PM2.5,AQI_PM10,AQI_NO2,AQI_SO2,AQI_O3,Max_AQI,Air_Quality
0,20,56,Male,Lithuania,2014-07,2015-08,Stage IV,1,0,0,Combined,0,39.800000,70.292333,22.557303,0.936812,44.699282,70.292333,Moderate
1,30,36,Male,Italy,2014-07,2015-12,Stage IV,1,0,0,Combined,1,46.968776,44.930715,22.093366,2.030453,59.572171,59.572171,Moderate
2,33,61,Male,Denmark,2014-10,2015-10,Stage I,1,0,0,Radiation,0,77.522894,62.570372,27.127633,0.851263,,77.522894,Moderate
3,46,56,Male,Sweden,2014-08,2016-03,Stage III,0,0,0,Combined,0,31.541102,31.988215,27.102650,0.610605,48.417723,48.417723,Good
4,54,51,Male,Austria,2014-08,2016-01,Stage I,1,1,0,Combined,0,29.641690,27.843433,18.575604,1.154375,45.687438,45.687438,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187477,3249964,65,Female,Hungary,2014-10,2016-01,Stage II,0,0,0,Combined,0,164.854167,80.019868,5.228226,0.760274,,164.854167,Unhealthy
187478,3249970,43,Male,Netherlands,2014-10,2015-07,Stage III,1,1,0,Surgery,0,49.662521,49.288220,45.004958,2.968721,19.011403,49.662521,Good
187479,3249972,66,Female,Czech Republic,2014-10,2016-02,Stage III,0,1,0,Combined,0,130.945211,71.710212,27.717802,2.848693,22.786742,130.945211,Unhealthy for Sensitive Groups
187480,3249976,48,Male,Cyprus,2014-07,2015-01,Stage IV,0,0,0,Surgery,1,99.622401,102.840323,,,,102.840323,Unhealthy for Sensitive Groups


In [75]:
final_table = final_table.rename({'id': 'patient_id', 
                                  'age':'patient_age', 
                                  'gender' : 'patient_gender',
                                  'AQI_PM2.5': 'pm25_index',
                                  'AQI_PM10': 'pm10_index',
                                  'AQI_NO2': 'no2_index',
                                  'AQI_SO2': 'so2_index',
                                  'AQI_O3': 'o3_index',
                                  'Max_AQI': 'air_quality_index',
                                  'Air_Quality': 'air_quality'
                                  })

In [76]:
final_table.info()

<class 'pandas.core.frame.DataFrame'>
Index: 187482 entries, 0 to 187481
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  187482 non-null  int64  
 1   age                 187482 non-null  int32  
 2   gender              187482 non-null  object 
 3   country             187482 non-null  object 
 4   diagnosis_date      187482 non-null  object 
 5   end_treatment_date  187482 non-null  object 
 6   cancer_stage        187482 non-null  object 
 7   hypertension        187482 non-null  int64  
 8   asthma              187482 non-null  int64  
 9   cirrhosis           187482 non-null  int64  
 10  treatment_type      187482 non-null  object 
 11  survived            187482 non-null  int64  
 12  AQI_PM2.5           155926 non-null  float64
 13  AQI_PM10            169693 non-null  float64
 14  AQI_NO2             163347 non-null  float64
 15  AQI_SO2             157744 non-null  fl