**_<div style="text-align: center"> - - -   TEAM 4 | SMU Data Science Bootcamp   - - -</div>_**

In [1]:
# Import Dependencies
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy import inspect

### Extract CSVs into DataFrames 

In [2]:
# Read the overdose CSV into a DataFrame and preview its first rows
overdose_file = "overdoses.csv"
overdose_file_df = pd.read_csv(filepath_or_buffer=overdose_file)
overdose_file_df.head(n=5)

Unnamed: 0,State,Population,Deaths,Abbrev
0,Alabama,4833722,723,AL
1,Alaska,735132,124,AK
2,Arizona,6626624,1211,AZ
3,Arkansas,2959373,356,AR
4,California,38332521,4521,CA


In [3]:
# Read the prescriber CSV into a DataFrame and preview its first rows
prescriber_info_file = "prescriber-info.csv"
prescriber_info_file_df_org = pd.read_csv(filepath_or_buffer=prescriber_info_file)
prescriber_info_file_df_org.head(n=5)

Unnamed: 0,NPI,Gender,State,Credentials,Specialty,ABILIFY,ACETAMINOPHEN.CODEINE,ACYCLOVIR,ADVAIR.DISKUS,AGGRENOX,...,VERAPAMIL.ER,VESICARE,VOLTAREN,VYTORIN,WARFARIN.SODIUM,XARELTO,ZETIA,ZIPRASIDONE.HCL,ZOLPIDEM.TARTRATE,Opioid.Prescriber
0,1710982582,M,TX,DDS,Dentist,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1245278100,F,AL,MD,General Surgery,0,0,0,0,0,...,0,0,0,0,0,0,0,0,35,1
2,1427182161,F,NY,M.D.,General Practice,0,0,0,0,0,...,0,0,0,0,0,0,0,0,25,0
3,1669567541,M,AZ,MD,Internal Medicine,0,43,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1679650949,M,NV,M.D.,Hematology/Oncology,0,0,0,0,0,...,0,0,0,0,17,28,0,0,0,1


### Clean DataFrames
#### Select the columns 
All the columns in `overdose_file_df` are kept; only `prescriber_info_file_df_org` is reduced to a subset of its columns.

In [4]:
# Keep only the state, gender and specialty columns of the prescriber data
prescriber_info_file_df = prescriber_info_file_df_org.loc[
    :, ['State', 'Gender', 'Specialty']
]
prescriber_info_file_df.head(n=5)

Unnamed: 0,State,Gender,Specialty
0,TX,M,Dentist
1,AL,F,General Surgery
2,NY,F,General Practice
3,AZ,M,Internal Medicine
4,NV,M,Hematology/Oncology


#### Check for duplicate values in the DataFrames

In [5]:
# Check the shape of the overdose DataFrame as (rows, columns);
# the row count is the baseline before duplicates are dropped below.
overdose_file_df.shape

(50, 4)

In [6]:
# Check the shape of the column-reduced prescriber DataFrame as (rows, columns)
prescriber_info_file_df.shape

(25000, 3)

In [7]:
# Drop rows that are exact duplicates across all columns, keeping the first
# occurrence of each (these are the drop_duplicates defaults).
overdose_file_df = overdose_file_df.drop_duplicates()

#### Check for null values in the DataFrames

In [8]:
# True if any cell of the overdose DataFrame is missing, False otherwise
overdose_file_df.isna().values.any()

False

In [9]:
# True if any cell of the prescriber DataFrame is missing, False otherwise
prescriber_info_file_df.isna().values.any()

False

In [10]:
# Preview the overdose DataFrame after the duplicate and null checks
overdose_file_df.head()

Unnamed: 0,State,Population,Deaths,Abbrev
0,Alabama,4833722,723,AL
1,Alaska,735132,124,AK
2,Arizona,6626624,1211,AZ
3,Arkansas,2959373,356,AR
4,California,38332521,4521,CA


In [11]:
# Preview the prescriber DataFrame after the null check
prescriber_info_file_df.head()

Unnamed: 0,State,Gender,Specialty
0,TX,M,Dentist
1,AL,F,General Surgery
2,NY,F,General Practice
3,AZ,M,Internal Medicine
4,NV,M,Hematology/Oncology


#### Change Column names for the DataFrames
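The columns are renamed to lowercase, and "State" becomes "states", because "State" is a keyword in SQL and causes an error when used as a column name.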

In [12]:
# Rename the overdose columns to the lowercase names used for the database load
overdose_file_df_renamed = overdose_file_df.rename(
    columns={
        'State': 'states',
        'Population': 'population',
        'Deaths': 'deaths',
        'Abbrev': 'abbrev',
    }
)

In [13]:
# Rename the prescriber columns to the lowercase names used for the database load
prescriber_info_file_df_renamed = prescriber_info_file_df.rename(
    columns={
        'State': 'states',
        'Gender': 'gender',
        'Specialty': 'specialty',
    }
)

### Connect to local database

In [14]:
# Import the PostgreSQL password from the config module so that it is not
# hard-coded in the notebook. Store the raw (un-encoded) password there.
from config import postgres_pass

# Use your own username and password in the following code accordingly:
# rds_connection_string = "<insert user name>:<insert password>@localhost:5432/Team4_ETL_db"

# Percent-encode the password before embedding it in the URL. Characters such
# as "@", ":" or "/" would otherwise be parsed as URL delimiters and the
# connection would fail. Passwords without such characters are unchanged.
rds_connection_string = f"postgres:{quote_plus(postgres_pass)}@localhost:5432/Team4_ETL_db"

# Engine used by every database read and write below
engine = create_engine(f'postgresql://{rds_connection_string}')

### Check for tables

In [15]:
# List the tables that already exist in the connected database.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list and works on old and new versions.
inspect(engine).get_table_names()

['overdose', 'prescriber']

### Use pandas to load csv converted DataFrame into database

In [16]:
# Insert the overdose rows into the existing 'overdose' table.
# NOTE: if_exists='append' adds to whatever is already stored, so running
# this cell again inserts the same rows a second time.
overdose_file_df_renamed.to_sql(
    name='overdose',
    con=engine,
    if_exists='append',
    index=False,
)

In [17]:
# Display the renamed prescriber DataFrame (pandas truncates the rendered rows)
prescriber_info_file_df_renamed

Unnamed: 0,states,gender,specialty
0,TX,M,Dentist
1,AL,F,General Surgery
2,NY,F,General Practice
3,AZ,M,Internal Medicine
4,NV,M,Hematology/Oncology
...,...,...,...
24995,WA,F,Family Practice
24996,MI,F,Internal Medicine
24997,AZ,M,Emergency Medicine
24998,IN,F,Family Practice


In [18]:
# Insert the prescriber rows into the existing 'prescriber' table.
# NOTE: if_exists='append' adds to whatever is already stored, so running
# this cell again inserts the same rows a second time.
prescriber_info_file_df_renamed.to_sql(
    name='prescriber',
    con=engine,
    if_exists='append',
    index=False,
)

### Confirm data has been added by querying the overdose and prescriber tables

In [19]:
# Read the 'overdose' table back from the database and preview the first rows
pd.read_sql_query(
    sql='select * from overdose',
    con=engine,
).head()

Unnamed: 0,states,population,deaths,abbrev
0,Alabama,4833722,723,AL
1,Alaska,735132,124,AK
2,Arizona,6626624,1211,AZ
3,Arkansas,2959373,356,AR
4,California,38332521,4521,CA


In [20]:
# Read the 'prescriber' table back from the database and preview the first rows
pd.read_sql_query(
    sql='select * from prescriber',
    con=engine,
).head()

Unnamed: 0,states,gender,specialty
0,TX,M,Dentist
1,AL,F,General Surgery
2,NY,F,General Practice
3,AZ,M,Internal Medicine
4,NV,M,Hematology/Oncology
