In [1]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account

# Setup your Google BigQuery variables

In [2]:
data = pd.read_csv('income_per_person_inflation_adjusted.csv')

In [3]:
# CHANGE THIS TO YOUR FILE PATH 
key_path = r''

In [4]:
# run this cell without changing anything to setup your credentials
credentials = service_account.Credentials.from_service_account_file(key_path,
                                                                    scopes=["https://www.googleapis.com/auth/cloud-platform"],)
bigquery_client = bigquery.Client(credentials = credentials,
                                 project = credentials.project_id)

print(f"bigquery client name is: {bigquery_client}")
print(f"bigquery client data type is: {type(bigquery_client)}")

bigquery client name is: <google.cloud.bigquery.client.Client object at 0x0000017D360A4250>
bigquery client data type is: <class 'google.cloud.bigquery.client.Client'>


In [5]:
dataset_id = ''   # PASTE THIS DATASET ID FROM ABOVE STEPS

dataset_id = dataset_id.replace(':', '.')
print(f"your dataset_id is: {dataset_id}")

your dataset_id is: cis-9440-361019.gdp


# Data Profiling

In [6]:
# create and run a function to ceate data profiling dataframe

def create_data_profiling_df(data):
    
    # create an empty dataframe to gather information about each column
    data_profiling_df = pd.DataFrame(columns = ["column_name",
                                                "column_type",
                                                "unique_values",
                                                "duplicate_values",
                                                "null_values",
                                                "non_null_values"])

    # loop through each column to add rows to the data_profiling_df dataframe
    for column in data.columns:

        info_dict = {}

        try:
            info_dict["column_name"] = column
            info_dict["column_type"] = data[column].dtypes
            info_dict["unique_values"] = len(data[column].unique())
            info_dict["duplicate_values"] = data[column].count() - len(data[column].dropna().unique())
            info_dict["null_values"] = data[column].isna().sum()
            info_dict["non_null_values"] = data[column].count()

        except:
            print(f"unable to read column: {column}, you may want to drop this column")

        data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)

    data_profiling_df.sort_values(by = ['unique_values', "non_null_values"],
                                  ascending = [False, False],
                                  inplace=True)
    
    return data_profiling_df

In [7]:
# view your data profiling dataframe
data_profiling_df = create_data_profiling_df(data = data)
data_profiling_df

  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)

Unnamed: 0,column_name,column_type,unique_values,duplicate_values,null_values,non_null_values
0,country,object,193,0,0,193
195,1994,int64,185,8,0,193
229,2028,int64,185,8,0,193
197,1996,int64,184,9,0,193
230,2029,int64,184,9,0,193
...,...,...,...,...,...,...
111,1910,int64,160,33,0,193
127,1926,int64,160,33,0,193
151,1950,int64,160,33,0,193
63,1862,int64,159,34,0,193


<strong>The data looks good, no need to do data cleaning</strong>

# Create Country Dimension

In [8]:
#rename country column to country_name
data.rename(columns = {'country':'country_name'}, inplace = True)

In [9]:
# first, copy the entire table
CountryDim = data.copy()

In [10]:
# second, subset for only the wanted columns in the dimension
CountryDim = CountryDim[["country_name"]]

In [11]:
#third, add country id
CountryDim.insert(0, 'country_id', range(1, 1 + len(CountryDim)))

In [12]:
CountryDim.dtypes

country_id       int64
country_name    object
dtype: object

In [13]:
CountryDim

Unnamed: 0,country_id,country_name
0,1,Afghanistan
1,2,Albania
2,3,Algeria
3,4,Andorra
4,5,Angola
...,...,...
188,189,Venezuela
189,190,Vietnam
190,191,Yemen
191,192,Zambia


#another way to megre country id
<br>
a = data.merge(country_dim,
                  left_on = ["country"],
                  right_on=['country'],
                  how = 'left')

In [14]:
# then, add the country_id to the data table
data = pd.merge(CountryDim,data,on='country_name')
data.head(5)

Unnamed: 0,country_id,country_name,1800,1801,1802,1803,1804,1805,1806,1807,...,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040
0,1,Afghanistan,603,603,603,603,603,603,603,603,...,2550,2600,2660,2710,2770,2820,2880,2940,3000,3060
1,2,Albania,667,667,667,667,667,668,668,668,...,19400,19800,20200,20600,21000,21500,21900,22300,22800,23300
2,3,Algeria,715,716,717,718,719,720,721,722,...,14300,14600,14900,15200,15500,15800,16100,16500,16800,17100
3,4,Andorra,1200,1200,1200,1200,1210,1210,1210,1210,...,73600,75100,76700,78300,79900,81500,83100,84800,86500,88300
4,5,Angola,618,620,623,626,628,631,634,637,...,6110,6230,6350,6480,6610,6750,6880,7020,7170,7310


# Create Date Dimension

In [15]:
# first, copy the entire table
date = data.copy()

In [16]:
# save the columns as year_list
year_list=date.columns.tolist()

In [17]:
#remove country_id and country which are not related to date
del year_list[0:2]

In [18]:
#change list to dataframe
DateDim = pd.DataFrame (year_list, columns = ['year'])

In [19]:
#change the data type of year column  as integer
DateDim['year'] = DateDim['year'].astype(np.int64)

In [20]:
#create date_id
DateDim.insert(0, 'date_id', range(1800, 1800 + len(DateDim)))

In [21]:
#calcuate decade by using year-(year%10)
DateDim['decade']=DateDim.apply(lambda x: x['year']-(x['year']%10), axis=1)

In [22]:
DateDim

Unnamed: 0,date_id,year,decade
0,1800,1800,1800
1,1801,1801,1800
2,1802,1802,1800
3,1803,1803,1800
4,1804,1804,1800
...,...,...,...
236,2036,2036,2030
237,2037,2037,2030
238,2038,2038,2030
239,2039,2039,2030


In [23]:
DateDim.dtypes

date_id    int64
year       int64
decade     int64
dtype: object

# Change Fact Table

In [24]:
#use melt function to reshape data from  wide format to long format
GDP_fact=data.melt(id_vars=["country_id", "country_name"], 
        var_name="date_id", 
        value_name="income_per_person")

In [25]:
#change the data type of date_id column as integer
GDP_fact['date_id'] = GDP_fact['date_id'].astype(np.int64)

In [26]:
#drop the country_name column
GDP_fact=GDP_fact.drop("country_name",axis=1)

In [27]:
GDP_fact

Unnamed: 0,country_id,date_id,income_per_person
0,1,1800,603
1,2,1800,667
2,3,1800,715
3,4,1800,1200
4,5,1800,618
...,...,...,...
46508,189,2040,9880
46509,190,2040,14400
46510,191,2040,3870
46511,192,2040,4180


In [28]:
GDP_fact.dtypes

country_id           int64
date_id              int64
income_per_person    int64
dtype: object

# Deliver Facts and Dimensions to Data Warehouse (BigQuery)

In [29]:
# create a function to load dataframes to BigQuery

def load_table_to_bigquery(df,
                          table_name,
                          dataset_id):

    dataset_id = dataset_id #change 301800 to match your project id

    dataset_ref = bigquery_client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.write_disposition = "WRITE_TRUNCATE"

    upload_table_name = f"{dataset_id}.{table_name}"
    
    load_job = bigquery_client.load_table_from_dataframe(df,
                                                upload_table_name,
                                                job_config = job_config)
        
    print(f"completed job {load_job}")

In [30]:
#load GDP_fact to bigquery and name it as gdp_fact in BigQuery
load_table_to_bigquery(df = GDP_fact,
                       table_name = "gdp_fact",
                       dataset_id = dataset_id)

completed job LoadJob<project=cis-9440-361019, location=US, id=77b649db-d24e-4d8a-9d7e-ddc1f4043e73>


In [31]:
#load DateDim to bigquery and name it as date_dim in BigQuery
load_table_to_bigquery(df = DateDim,
                       table_name = "date_dim",
                       dataset_id = dataset_id)

completed job LoadJob<project=cis-9440-361019, location=US, id=99d0db90-4804-4371-b0cd-6ef3baf9206d>


In [32]:
#load CountryDim to bigquery and name it as country_dim in BigQuery
load_table_to_bigquery(df = CountryDim,
                       table_name = "country_dim",
                       dataset_id = dataset_id)

completed job LoadJob<project=cis-9440-361019, location=US, id=c6cfd507-ddbc-4a71-ad04-f3b90003497c>


In [33]:
GDP_fact.to_csv("GDP_fact.csv",index=False)

In [34]:
DateDim.to_csv("DateDim.csv",index=False)

In [35]:
CountryDim.to_csv("CountryDim.csv",index=False)