In [1]:
import pandas as pd
from sqlalchemy import create_engine
import sqlalchemy as sa
import os

In [2]:
connection_url = f"mysql+pymysql://{os.environ['TEST_DB_USERNAME']}:{os.environ['TEST_DB_PASSWORD']}@{os.environ['TEST_DB_HOSTNAME']}/{os.environ['TEST_DB_DATABASE_NAME']}"
db_engine = create_engine(connection_url)

try:
    with db_engine.connect() as connection:
        print("Connection to MySQL database successful!")
except Exception as e:
    print(f"Error: {e}")

Connection to MySQL database successful!


In [3]:
df = pd.read_sql("SELECT * FROM lung_cancer", db_engine)

In [4]:
df.shape

(838216, 19)

-------------

In [5]:
treatments = df['treatment_type'].unique().tolist()
treatment_df = pd.DataFrame(treatments, columns=['treatment_type'])

In [6]:
treatment_df

Unnamed: 0,treatment_type
0,Combined
1,Radiation
2,Surgery
3,Chemotherapy


In [7]:
treatment_df['treatment_type'].value_counts()

treatment_type
Combined        1
Radiation       1
Surgery         1
Chemotherapy    1
Name: count, dtype: int64

-------

In [8]:
cancer_stage = df[['id', 'cancer_stage']]

In [9]:
stage_ids = {
    'Stage I': 's1',
    'Stage II': 's2',
    'Stage III': 's3',
    'Stage IV': 's4',
}

cancer_stage['cancer_stage_id'] = cancer_stage['cancer_stage'].map(stage_ids)

cancer_stage

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cancer_stage['cancer_stage_id'] = cancer_stage['cancer_stage'].map(stage_ids)


Unnamed: 0,id,cancer_stage,cancer_stage_id
0,20,Stage IV,s4
1,30,Stage IV,s4
2,33,Stage I,s1
3,46,Stage III,s3
4,54,Stage I,s1
...,...,...,...
838211,3249934,Stage IV,s4
838212,3249952,Stage I,s1
838213,3249962,Stage III,s3
838214,3249987,Stage I,s1


------------

In [10]:
diagnosis_date = pd.read_sql("SELECT DISTINCT diagnosis_date FROM lung_cancer", con=db_engine)
end_treatment_date = pd.read_sql("SELECT DISTINCT end_treatment_date FROM lung_cancer", con=db_engine)

In [11]:
diagnosis_date['diagnosis_date'] = pd.to_datetime(diagnosis_date['diagnosis_date'])
end_treatment_date['end_treatment_date'] = pd.to_datetime(end_treatment_date['end_treatment_date'])

In [12]:
diagnosis_date['date'] = diagnosis_date['diagnosis_date'].dt.strftime('%Y-%m')
end_treatment_date['date'] = end_treatment_date['end_treatment_date'].dt.strftime('%Y-%m')

In [16]:
diagnosis_date['diagnosis_date']

0     2014-07-01
1     2014-07-26
2     2014-10-11
3     2014-08-20
4     2014-08-27
         ...    
937   2016-10-02
938   2016-03-17
939   2016-03-01
940   2016-03-28
941   2016-09-25
Name: diagnosis_date, Length: 942, dtype: datetime64[ns]

In [13]:
dates = pd.concat([diagnosis_date['date'],end_treatment_date['date']], axis=0)

In [14]:
unique_dates = dates.unique().tolist()
dates_df = pd.DataFrame(unique_dates, columns=['dates'])

In [15]:
dates_df

Unnamed: 0,dates
0,2014-07
1,2014-10
2,2014-08
3,2014-11
4,2014-12
5,2014-09
6,2014-06
7,2015-08
8,2015-07
9,2015-11
