In [None]:
import json
from urllib.parse import quote_plus

import boto3
import mysql.connector
import pandas as pd
import sqlalchemy

In [None]:
def get_secret(secret_name, region_name="us-east-1"):
    """Fetch a secret from AWS Secrets Manager and decode it as JSON.

    Args:
        secret_name: Identifier passed as ``SecretId`` to ``get_secret_value``.
        region_name: AWS region used to build the Secrets Manager client.

    Returns:
        The object produced by ``json.loads`` on the response's
        ``SecretString`` field.
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    response = secrets_client.get_secret_value(SecretId=secret_name)
    return json.loads(response['SecretString'])

In [None]:
# Pull the RDS MySQL credentials out of the "wysde" secret.
creds = get_secret("wysde")
USERNAME, PASSWORD, HOST = (
    creds[key]
    for key in ("RDS_MYSQL_USERNAME", "RDS_MYSQL_PASSWORD", "RDS_MYSQL_HOST")
)
# Database name used in the connection URL and by the mysqlimport commands.
DATABASE = 'sparsh'

In [None]:
# Build the SQLAlchemy URL for the MySQL instance. The user name and password
# are percent-encoded so that characters such as "@", ":" or "/" inside the
# credentials cannot be misread as URL delimiters.
conn_str = 'mysql+mysqlconnector://{0}:{1}@{2}/{3}'.format(
    quote_plus(USERNAME), quote_plus(PASSWORD), HOST, DATABASE)
# `conn` is a SQLAlchemy Engine; later cells pass it as `con=` to pandas.
conn = sqlalchemy.create_engine(conn_str)

In [None]:
%config SqlMagic.autopandas=True
%config SqlMagic.displaycon=False
%config SqlMagic.feedback=False
%reload_ext sql
%sql {conn_str}
# %sql GRANT SESSION_VARIABLES_ADMIN ON *.* TO {USERNAME};

---

## us_housing_units

In [None]:
# Load the housing-units CSV and preview only the first rows instead of
# rendering the whole frame.
df_us_housing_units = pd.read_csv("./data/us_housing_units.csv")
df_us_housing_units.head()

In [None]:
# Print the CREATE TABLE statement pandas generates for this frame;
# reset_index() turns the index into a regular column first.
print(
    pd.io.sql.get_schema(
        frame=df_us_housing_units.reset_index(),
        name='us_housing_units',
        con=conn,
    )
)

In [None]:
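# Disabled alternative: write the frame with pandas directly
# (if_exists='replace' would drop and recreate the table).
# df_us_housing_units.to_sql('us_housing_units', conn, if_exists='replace')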

In [None]:
%%sql

CREATE TABLE IF NOT EXISTS us_housing_units (
    -- Keep columns in CSV order because mysqlimport is run without a column list
    year FLOAT(53),
    month FLOAT(53),
    month_name TEXT,
    south FLOAT(53),
    west FLOAT(53),
    midwest FLOAT(53),
    northeast FLOAT(53)
)

Follow the instructions at <https://go.aws/3CbS9Z4>.

In [None]:
# Run mysqlimport against {DATABASE} on {HOST} with data/us_housing_units.csv,
# treating "," as the field separator and skipping the first (header) line.
# NOTE(review): `--password` is given without a value, so the client will
# presumably prompt for it -- confirm this works from a notebook cell.
!mysqlimport --local \
    --compress \
    --user={USERNAME} \
    --password \
    --host={HOST} \
    --ignore-lines=1 \
    --fields-terminated-by=',' {DATABASE} data/us_housing_units.csv 

In [None]:
%sql select * from us_housing_units limit 10;

---

## billboard_top_100_year_end

In [None]:
# Name shared by the source CSV under ./data and the table-related cells below.
TABLE = "billboard_top_100_year_end"

# Load the CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer=f"./data/{TABLE}.csv")
df.head()

In [None]:
# Print the CREATE TABLE statement pandas generates for the loaded frame.
print(pd.io.sql.get_schema(frame=df, name=TABLE, con=conn))

In [None]:
%%sql

CREATE TABLE IF NOT EXISTS billboard_top_100_year_end (
    -- Created only if absent so this cell can be re-run without error
    -- Keep columns in CSV order because mysqlimport is run without a column list
    year FLOAT(53),
    year_rank FLOAT(53),
    `group` TEXT,
    artist TEXT,
    song_name TEXT,
    id BIGINT
)

In [None]:
# Print (via echo) the mysqlimport command for data/{TABLE}.csv rather than
# executing it; the quotes around the comma are escaped so they show up in
# the printed command.
!echo mysqlimport --local \
    --compress \
    --user={USERNAME} \
    --password \
    --host={HOST} \
    --ignore-lines=1 \
    --fields-terminated-by=\',\' {DATABASE} data/{TABLE}.csv 

---

## aapl_historical_stock_price

In [None]:
# Name shared by the source CSV under ./data and the table-related cells below.
TABLE = "aapl_historical_stock_price"

# Load the CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer=f"./data/{TABLE}.csv")
df.head()

In [None]:
# Print the CREATE TABLE statement pandas generates for the loaded frame.
print(pd.io.sql.get_schema(frame=df, name=TABLE, con=conn))

In [None]:
%%sql

CREATE TABLE IF NOT EXISTS aapl_historical_stock_price (
    -- Created only if absent so this cell can be re-run without error
    -- Keep columns in CSV order because mysqlimport is run without a column list
    date TEXT,
    year FLOAT(53),
    month FLOAT(53),
    open FLOAT(53),
    high FLOAT(53),
    low FLOAT(53),
    close FLOAT(53),
    volume FLOAT(53),
    id BIGINT
)

In [None]:
# Print (via echo) the mysqlimport command for data/{TABLE}.csv rather than
# executing it; the quotes around the comma are escaped so they show up in
# the printed command.
!echo mysqlimport --local \
    --compress \
    --user={USERNAME} \
    --password \
    --host={HOST} \
    --ignore-lines=1 \
    --fields-terminated-by=\',\' {DATABASE} data/{TABLE}.csv 

---

## college_football_players

In [None]:
# Name shared by the source CSV under ./data and the table-related cells below.
TABLE = "college_football_players"

# Load the CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer=f"./data/{TABLE}.csv")
df.head()

In [None]:
# Print the CREATE TABLE statement pandas generates for the loaded frame.
print(pd.io.sql.get_schema(frame=df, name=TABLE, con=conn))

In [None]:
%%sql

CREATE TABLE IF NOT EXISTS college_football_players (
    -- Created only if absent so this cell can be re-run without error
    -- Keep columns in CSV order because mysqlimport is run without a column list
    full_school_name TEXT,
    school_name TEXT,
    player_name TEXT,
    position TEXT,
    height FLOAT(53),
    weight FLOAT(53),
    year TEXT,
    hometown TEXT,
    state TEXT,
    id BIGINT
)

In [None]:
# Print (via echo) the mysqlimport command for data/{TABLE}.csv rather than
# executing it; the quotes around the comma are escaped so they show up in
# the printed command.
!echo mysqlimport --local \
    --compress \
    --user={USERNAME} \
    --password \
    --host={HOST} \
    --ignore-lines=1 \
    --fields-terminated-by=\',\' {DATABASE} data/{TABLE}.csv 

---

## college_football_teams

In [None]:
# Name shared by the source CSV under ./data and the table-related cells below.
TABLE = "college_football_teams"

# Load the CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer=f"./data/{TABLE}.csv")
df.head()

In [None]:
# Print the CREATE TABLE statement pandas generates for the loaded frame.
print(pd.io.sql.get_schema(frame=df, name=TABLE, con=conn))

In [None]:
%%sql

CREATE TABLE IF NOT EXISTS college_football_teams (
    -- Created only if absent so this cell can be re-run without error
    -- Keep columns in CSV order because mysqlimport is run without a column list
    division TEXT,
    conference TEXT,
    school_name TEXT,
    roster_url TEXT,
    id BIGINT
)

In [None]:
# Print (via echo) the mysqlimport command for data/{TABLE}.csv rather than
# executing it; the quotes around the comma are escaped so they show up in
# the printed command.
!echo mysqlimport --local \
    --compress \
    --user={USERNAME} \
    --password \
    --host={HOST} \
    --ignore-lines=1 \
    --fields-terminated-by=\',\' {DATABASE} data/{TABLE}.csv 

---

## crunchbase_companies

In [None]:
# Name shared by the source CSV under ./data and the table-related cells below.
TABLE = "crunchbase_companies"

# Load the CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer=f"./data/{TABLE}.csv")
df.head()

In [None]:
# Print the CREATE TABLE statement pandas generates for the loaded frame.
print(pd.io.sql.get_schema(frame=df, name=TABLE, con=conn))

In [None]:
%%sql

CREATE TABLE IF NOT EXISTS crunchbase_companies (
    -- Created only if absent so this cell can be re-run without error
    -- Keep columns in CSV order because mysqlimport is run without a column list
    permalink TEXT,
    name TEXT,
    homepage_url TEXT,
    category_code TEXT,
    funding_total_usd FLOAT(53),
    status TEXT,
    country_code TEXT,
    state_code TEXT,
    region TEXT,
    city TEXT,
    funding_rounds FLOAT(53),
    founded_at TEXT,
    founded_month TEXT,
    founded_quarter TEXT,
    founded_year FLOAT(53),
    first_funding_at TEXT,
    last_funding_at TEXT,
    last_milestone_at TEXT,
    id BIGINT
)

In [None]:
# Print (via echo) the mysqlimport command for data/{TABLE}.csv rather than
# executing it; the quotes around the comma are escaped so they show up in
# the printed command.
!echo mysqlimport --local \
    --compress \
    --user={USERNAME} \
    --password \
    --host={HOST} \
    --ignore-lines=1 \
    --fields-terminated-by=\',\' {DATABASE} data/{TABLE}.csv 

---

## crunchbase_acquisitions

In [None]:
# Name shared by the source CSV under ./data and the table-related cells below.
TABLE = "crunchbase_acquisitions"

# Load the CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer=f"./data/{TABLE}.csv")
df.head()

In [None]:
# Print the CREATE TABLE statement pandas generates for the loaded frame.
print(pd.io.sql.get_schema(frame=df, name=TABLE, con=conn))

In [None]:
%%sql

CREATE TABLE IF NOT EXISTS crunchbase_acquisitions (
    -- Created only if absent so this cell can be re-run without error
    -- Keep columns in CSV order because mysqlimport is run without a column list
    company_permalink TEXT,
    company_name TEXT,
    company_category_code TEXT,
    company_country_code TEXT,
    company_state_code TEXT,
    company_region TEXT,
    company_city TEXT,
    acquirer_permalink TEXT,
    acquirer_name TEXT,
    acquirer_category_code TEXT,
    acquirer_country_code TEXT,
    acquirer_state_code TEXT,
    acquirer_region TEXT,
    acquirer_city TEXT,
    acquired_at TEXT,
    acquired_month TEXT,
    acquired_quarter TEXT,
    acquired_year FLOAT(53),
    price_amount FLOAT(53),
    price_currency_code TEXT,
    id BIGINT
)

In [None]:
# Print (via echo) the mysqlimport command for data/{TABLE}.csv rather than
# executing it; the quotes around the comma are escaped so they show up in
# the printed command.
!echo mysqlimport --local \
    --compress \
    --user={USERNAME} \
    --password \
    --host={HOST} \
    --ignore-lines=1 \
    --fields-terminated-by=\',\' {DATABASE} data/{TABLE}.csv 

---

## crunchbase_investments_part1

In [None]:
# Name shared by the source CSV under ./data and the table-related cells below.
TABLE = "crunchbase_investments_part1"

# Load the CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer=f"./data/{TABLE}.csv")
df.head()

In [None]:
# Print the CREATE TABLE statement pandas generates for the loaded frame.
print(pd.io.sql.get_schema(frame=df, name=TABLE, con=conn))

In [None]:
%%sql

CREATE TABLE IF NOT EXISTS crunchbase_investments_part1 (
    -- Created only if absent so this cell can be re-run without error
    -- Keep columns in CSV order because mysqlimport is run without a column list
    company_permalink TEXT,
    company_name TEXT,
    company_category_code TEXT,
    company_country_code TEXT,
    company_state_code TEXT,
    company_region TEXT,
    company_city TEXT,
    investor_permalink TEXT,
    investor_name TEXT,
    investor_category_code TEXT,
    investor_country_code TEXT,
    investor_state_code TEXT,
    investor_region TEXT,
    investor_city TEXT,
    funding_round_type TEXT,
    funded_at TEXT,
    funded_month TEXT,
    funded_quarter TEXT,
    funded_year FLOAT(53),
    raised_amount_usd FLOAT(53),
    id BIGINT
)

In [None]:
# Print (via echo) the mysqlimport command for data/{TABLE}.csv rather than
# executing it; the quotes around the comma are escaped so they show up in
# the printed command.
!echo mysqlimport --local \
    --compress \
    --user={USERNAME} \
    --password \
    --host={HOST} \
    --ignore-lines=1 \
    --fields-terminated-by=\',\' {DATABASE} data/{TABLE}.csv 

---

## crunchbase_companies_clean_date

In [None]:
# Source CSV name, and the name used for the preprocessed copy written later.
TABLE = "crunchbase_companies_clean_date"
TABLE_NEW = TABLE + "_preprocessed"

# Load the raw CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer=f"./data/{TABLE}.csv")
df.head()

In [None]:
# Show the frame's column dtypes and non-null counts as loaded from the CSV.
df.info()

In [None]:
# Parse founded_at_clean into datetime values, then re-check the dtypes.
df = df.assign(founded_at_clean=pd.to_datetime(df['founded_at_clean']))
df.info()

In [None]:
# Print the CREATE TABLE statement pandas generates for the preprocessed frame.
print(pd.io.sql.get_schema(frame=df, name=TABLE_NEW, con=conn))

In [None]:
%%sql

CREATE TABLE IF NOT EXISTS crunchbase_companies_clean_date_preprocessed (
    -- Created only if absent so this cell can be re-run without error
    -- Keep columns in CSV order because mysqlimport is run without a column list
    permalink TEXT,
    name TEXT,
    homepage_url TEXT,
    category_code TEXT,
    funding_total_usd FLOAT(53),
    status TEXT,
    country_code TEXT,
    state_code TEXT,
    region TEXT,
    city TEXT,
    funding_rounds FLOAT(53),
    founded_at TEXT,
    founded_at_clean DATETIME,
    id BIGINT
)

In [None]:
# Write the preprocessed frame to ./data/<TABLE_NEW>.csv without the index column.
df.to_csv(path_or_buf=f"./data/{TABLE_NEW}.csv", index=False)

In [None]:
# Print (via echo) the mysqlimport command for data/{TABLE_NEW}.csv rather
# than executing it; the quotes around the comma are escaped so they show up
# in the printed command.
!echo mysqlimport --local \
    --compress \
    --user={USERNAME} \
    --password \
    --host={HOST} \
    --ignore-lines=1 \
    --fields-terminated-by=\',\' {DATABASE} data/{TABLE_NEW}.csv 

---

## crunchbase_acquisitions_clean_date

In [None]:
# Source CSV name, and the name used for the preprocessed copy written later.
TABLE = "crunchbase_acquisitions_clean_date"
TABLE_NEW = TABLE + "_preprocessed"

# Load the raw CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer=f"./data/{TABLE}.csv")
df.head()

In [None]:
# Show the frame's column dtypes and non-null counts as loaded from the CSV.
df.info()

In [None]:
# Parse acquired_at_cleaned into datetime values, then re-check the dtypes.
df = df.assign(acquired_at_cleaned=pd.to_datetime(df['acquired_at_cleaned']))
df.info()

In [None]:
# Print the CREATE TABLE statement pandas generates for the preprocessed frame.
print(pd.io.sql.get_schema(frame=df, name=TABLE_NEW, con=conn))

In [None]:
%%sql

CREATE TABLE IF NOT EXISTS crunchbase_acquisitions_clean_date_preprocessed (
    -- Created only if absent so this cell can be re-run without error
    -- Keep columns in CSV order because mysqlimport is run without a column list
    company_permalink TEXT,
    acquirer_permalink TEXT,
    acquirer_name TEXT,
    acquirer_category_code TEXT,
    acquirer_country_code TEXT,
    acquirer_state_code TEXT,
    acquirer_region TEXT,
    acquirer_city TEXT,
    price_amount FLOAT(53),
    price_currency_code TEXT,
    acquired_at TEXT,
    acquired_at_cleaned DATETIME,
    id BIGINT
)

In [None]:
# Write the preprocessed frame to ./data/<TABLE_NEW>.csv without the index column.
df.to_csv(path_or_buf=f"./data/{TABLE_NEW}.csv", index=False)

In [None]:
# Print (via echo) the mysqlimport command for data/{TABLE_NEW}.csv rather
# than executing it; the quotes around the comma are escaped so they show up
# in the printed command.
!echo mysqlimport --local \
    --compress \
    --user={USERNAME} \
    --password \
    --host={HOST} \
    --ignore-lines=1 \
    --fields-terminated-by=\',\' {DATABASE} data/{TABLE_NEW}.csv

---

## sf_crime_incidents_2014_01

In [None]:
# Source CSV name, and the name used for the preprocessed copy written later.
TABLE = "sf_crime_incidents_2014_01"
TABLE_NEW = TABLE + "_preprocessed"

# Load the raw CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer=f"./data/{TABLE}.csv")
df.head()

In [None]:
# Parse the date column into datetime values.
df = df.assign(date=pd.to_datetime(df['date']))

In [None]:
# Print the CREATE TABLE statement pandas generates for the preprocessed frame.
print(pd.io.sql.get_schema(frame=df, name=TABLE_NEW, con=conn))

In [None]:
%%sql

CREATE TABLE IF NOT EXISTS sf_crime_incidents_2014_01_preprocessed (
    -- Created only if absent so this cell can be re-run without error
    -- Keep columns in CSV order because mysqlimport is run without a column list
    incidnt_num FLOAT(53),
    category TEXT,
    descript TEXT,
    day_of_week TEXT,
    date TIMESTAMP NULL,
    time TEXT,
    pd_district TEXT,
    resolution TEXT,
    address TEXT,
    lon FLOAT(53),
    lat FLOAT(53),
    location TEXT,
    id BIGINT
)

In [None]:
# Write the preprocessed frame to ./data/<TABLE_NEW>.csv without the index column.
df.to_csv(path_or_buf=f"./data/{TABLE_NEW}.csv", index=False)

In [None]:
# Print (via echo) the mysqlimport command for data/{TABLE_NEW}.csv rather
# than executing it; the quotes around the comma are escaped so they show up
# in the printed command.
!echo mysqlimport --local \
    --compress \
    --user={USERNAME} \
    --password \
    --host={HOST} \
    --ignore-lines=1 \
    --fields-terminated-by=\',\' {DATABASE} data/{TABLE_NEW}.csv

---

## dc_bikeshare_q1_2012

In [None]:
# Source CSV name, and the name used for the preprocessed copy written later.
TABLE = "dc_bikeshare_q1_2012"
TABLE_NEW = TABLE + "_preprocessed"

# Load the raw CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer=f"./data/{TABLE}.csv")
df.head()

In [None]:
# Parse the trip start and end timestamps into datetime values.
df = df.assign(
    start_time=pd.to_datetime(df['start_time']),
    end_time=pd.to_datetime(df['end_time']),
)

In [None]:
# Print the CREATE TABLE statement pandas generates for the preprocessed frame.
print(pd.io.sql.get_schema(frame=df, name=TABLE_NEW, con=conn))

In [None]:
%%sql

CREATE TABLE IF NOT EXISTS dc_bikeshare_q1_2012_preprocessed (
    -- Created only if absent so this cell can be re-run without error
    -- Keep columns in CSV order because mysqlimport is run without a column list
    duration TEXT,
    duration_seconds FLOAT(53),
    start_time DATETIME,
    start_station TEXT,
    start_terminal FLOAT(53),
    end_time DATETIME,
    end_station TEXT,
    end_terminal FLOAT(53),
    bike_number TEXT,
    rider_type TEXT,
    id BIGINT
)

In [None]:
# Write the preprocessed frame to ./data/<TABLE_NEW>.csv without the index column.
df.to_csv(path_or_buf=f"./data/{TABLE_NEW}.csv", index=False)

In [None]:
# Print (via echo) the mysqlimport command for data/{TABLE_NEW}.csv rather
# than executing it; the quotes around the comma are escaped so they show up
# in the printed command.
!echo mysqlimport --local \
    --compress \
    --user={USERNAME} \
    --password \
    --host={HOST} \
    --ignore-lines=1 \
    --fields-terminated-by=\',\' {DATABASE} data/{TABLE_NEW}.csv