In [1]:
# Importing dependencies
! pip install google-cloud-bigquery
! pip install psycopg2-binary
import re
from urllib.parse import quote

import numpy as np
import pandas as pd
from google.cloud import bigquery
from sqlalchemy import create_engine

from config_db import db_password



In [2]:
# Register the credentials file path in this process's environment
import os
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="data.json")

In [3]:
# Create the BigQuery client that the later cells use to look up the dataset and run queries.
# NOTE(review): presumably authenticates through the GOOGLE_APPLICATION_CREDENTIALS path
# set in the previous cell — confirm.
client = bigquery.Client()

In [4]:
# StackOverflow public dataset from BigQuery.
# Client.dataset() is deprecated in google-cloud-bigquery, so the reference is built
# directly with DatasetReference(project, dataset_id) and then resolved by the client.
hn_dataset_ref = bigquery.DatasetReference('bigquery-public-data', 'stackoverflow')
hn_dset = client.get_dataset(hn_dataset_ref)

In [5]:
# List the ids of the tables available in the StackOverflow dataset
stackoverflow_tables = client.list_tables(hn_dset)
[table.table_id for table in stackoverflow_tables]

['badges',
 'comments',
 'post_history',
 'post_links',
 'posts_answers',
 'posts_moderator_nomination',
 'posts_orphaned_tag_wiki',
 'posts_privilege_wiki',
 'posts_questions',
 'posts_tag_wiki',
 'posts_tag_wiki_excerpt',
 'posts_wiki_placeholder',
 'stackoverflow_posts',
 'tags',
 'users',
 'votes']

In [6]:
# Fetch only the answer columns relevant to the ML model (id, creation_date, parent_id)
# for answers created after 2021-01-01, ordered by answer id.
POSTS_ANSWERS_SQL = """
        SELECT
        id, creation_date, parent_id
        FROM `bigquery-public-data.stackoverflow.posts_answers`
        WHERE creation_date > '2021-01-01'
        ORDER BY id"""
query_job = client.query(POSTS_ANSWERS_SQL)

# result() waits for the query job to complete before returning the rows.
results = query_job.result()

# Materialise every row, then label the columns for the answers frame:
# id -> a_id, creation_date -> a_creation_date, parent_id -> q_id.
results_list = list(results)
cols = ['a_id', 'a_creation_date', 'q_id']
posts_answers_df = pd.DataFrame.from_records(data=results_list, columns=cols)
posts_answers_df

Unnamed: 0,a_id,a_creation_date,q_id
0,65526388,2021-01-01 00:00:01.653000+00:00,65525964
1,65526389,2021-01-01 00:00:11.187000+00:00,54217345
2,65526391,2021-01-01 00:00:41.780000+00:00,65526354
3,65526393,2021-01-01 00:01:25.723000+00:00,48184969
4,65526394,2021-01-01 00:01:33.950000+00:00,65525938
...,...,...,...
1427485,69061022,2021-09-05 06:21:38.487000+00:00,69060415
1427486,69061025,2021-09-05 06:22:13.693000+00:00,69059807
1427487,69061028,2021-09-05 06:22:52.787000+00:00,69059980
1427488,69061029,2021-09-05 06:23:42.780000+00:00,69060590


In [7]:
# Dimensions of the answers frame as (rows, columns).
posts_answers_df.shape

(1427490, 3)

In [8]:
# Number of distinct values in each column of the answers frame.
posts_answers_df.nunique()

a_id               1427490
a_creation_date    1427309
q_id               1092483
dtype: int64

In [9]:
# Promote the answer id to the frame's index.
# Guarded so the cell is idempotent: once a_id is the index it is no longer a column,
# and an unguarded set_index("a_id") would raise KeyError on a second run.
if "a_id" in posts_answers_df.columns:
    posts_answers_df = posts_answers_df.set_index("a_id")

In [10]:
# Preview the first rows of the frame after a_id was set as the index.
posts_answers_df.head()

Unnamed: 0_level_0,a_creation_date,q_id
a_id,Unnamed: 1_level_1,Unnamed: 2_level_1
65526388,2021-01-01 00:00:01.653000+00:00,65525964
65526389,2021-01-01 00:00:11.187000+00:00,54217345
65526391,2021-01-01 00:00:41.780000+00:00,65526354
65526393,2021-01-01 00:01:25.723000+00:00,48184969
65526394,2021-01-01 00:01:33.950000+00:00,65525938


In [11]:
# Connection URL for the Postgres database.
# The password is percent-encoded (safe='' so that '/' is encoded too) because reserved
# characters such as '@', ':', '/' or '%' in a raw password would corrupt the URL structure.
# A password made only of unreserved characters is left unchanged by quote().
db_string = f"postgresql://acospynamiyoti:{quote(db_password, safe='')}@ec2-44-198-223-154.compute-1.amazonaws.com:5432/d443pqekji2r98"

In [12]:
# Build the SQLAlchemy engine from the connection URL held in db_string.
engine = create_engine(db_string)

In [13]:
# Export posts_answers_df to the SQL table `posts_answers` (the a_id index is written as a column).
#  - if_exists='replace': the default ('fail') raises once the table exists, so the notebook
#    could not be re-run from a fresh kernel; 'replace' drops and recreates the table instead.
#  - chunksize + method='multi': the frame held ~1.4M rows in the recorded run; writing it in
#    batches of multi-row INSERTs bounds memory use and cuts round-trips to the remote database.
posts_answers_df.to_sql(
    name='posts_answers',
    con=engine,
    if_exists='replace',
    chunksize=10_000,
    method='multi',
)