# Uploading data into PostgreSQL

**Goal: Load prepared training sets for each campaign section containing meta features, the normalized text, and additional data from the Web Robots database. Next, append the training sets to their respective PostgreSQL databases.**

In [1]:
# Load required libraries
import os
from getpass import getpass
from urllib.parse import quote

import pandas as pd
import psycopg2
from sklearn.externals import joblib
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

We'll be adding data into two distinct databases, `section1_db` and `section2_db`&mdash;one for each campaign section.

In [2]:
# Set Postgres credentials
db_name1 = 'section1_db'
db_name2 = 'section2_db'
usernm = 'redwan'
host = 'localhost'
port = '5432'

# The password is never stored in the notebook: it is read from the
# PGPASSWORD environment variable, and the user is prompted for it when that
# variable is unset or empty. Later cells need `pwd` to build the engine URLs
# and to open the psycopg2 connection.
pwd = os.environ.get('PGPASSWORD') or getpass('Postgres password: ')

Next, let's create a SQLAlchemy engine for both database connections.

In [3]:
# Create engines for both databases.
# The password is percent-encoded before it is placed in the URL; characters
# such as '@', ':', '/' or '%' would otherwise be read as URL delimiters and
# produce a wrong or unparsable connection string.
url_template = 'postgresql://{}:{}@{}:{}/{}'
encoded_pwd = quote(pwd, safe='')

engine1 = create_engine(
    url_template.format(usernm, encoded_pwd, host, port, db_name1)
)

engine2 = create_engine(
    url_template.format(usernm, encoded_pwd, host, port, db_name2)
)

We'll create a database only if it doesn't yet exist.

In [4]:
# Make sure each section's database exists, creating any that are missing
for section_engine in (engine1, engine2):
    if not database_exists(section_engine.url):
        create_database(section_engine.url)

# Report whether each database now exists
print(database_exists(engine1.url), database_exists(engine2.url))

True True


Let's load the prepared training sets for each campaign section.

In [5]:
# Read the pickled training-set DataFrame for each campaign section
section1_path = 'data/extracted_data/section1_all_features_20000-24558.pkl'
section2_path = 'data/extracted_data/section2_all_features_20000-24558.pkl'

section1_df = joblib.load(section1_path)
section2_df = joblib.load(section2_path)

Next, let's upload and append the tables containing the training sets into their respective SQL databases.

In [6]:
# Write each section's training set to the table named after its database,
# adding the rows to that table when it already exists
append_options = {'if_exists': 'append'}

section1_df.to_sql(name=db_name1, con=engine1, **append_options)
section2_df.to_sql(name=db_name2, con=engine2, **append_options)

Next, let's check whether the upload succeeded by querying the full table for one campaign section.

In [12]:
# Connect to the section 1 database, reusing the host and port settings that
# the SQLAlchemy engines were built with so both connections stay in sync
con1 = psycopg2.connect(
    database=db_name1,
    host=host,
    port=port,
    user=usernm,
    password=pwd
)

# Define a SQL query to load a section; the table carries the database's
# name because the upload step passed db_name1 as the table name
sql_query = """
SELECT * 
  FROM section1_db;
"""

# Perform SQL query and store results in a DataFrame
test_data_from_sql = pd.read_sql_query(sql_query, con1)

# Display the last five rows
test_data_from_sql.tail()

Unnamed: 0,level_0,index,num_sents,num_words,num_all_caps,percent_all_caps,num_exclms,percent_exclms,num_apple_words,percent_apple_words,...,percent_bolded,normalized_text,name,category,hyperlink,currency,pledged,goal,location,funded
24553,4554,160083,,,,,,,,,...,,,The Life After (PTSD Poetry),Poetry,https://www.kickstarter.com/projects/clay12b/p...,USD,200.0,10000.0,"Cincinnati, OH",False
24554,4555,83744,3.0,67.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,Biggest obstacle is people being skeptical of ...,Protein Pacs. SAY GOOD-BYE to Annoying TUBS!!!!!,Drinks,https://www.kickstarter.com/projects/519818862...,USD,2.0,12000.0,"San Marcos, TX",False
24555,4556,92726,5.0,97.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,"The largest, and almost single entity, that is...",Billionaire Chronicles: A Family Affair That N...,Mobile Games,https://www.kickstarter.com/projects/171479360...,USD,956.0,30000.0,"Roseville, CA",False
24556,4557,126291,13.0,212.0,0.0,0.0,2.0,0.009434,0.0,0.0,...,0.0,"We have a great event location, a really solid...",Preserving Jazz...,Jazz,https://www.kickstarter.com/projects/347067171...,USD,0.0,5000.0,"Birmingham, AL",False
24557,4558,128592,,,,,,,,,...,,,Bucket of Honey's First Full Length Album,Pop,https://www.kickstarter.com/projects/213894853...,USD,5342.32,5000.0,"Seattle, WA",True


We can read in the data from the database. Let's see how many entries this table has.

In [8]:
# Count the rows that were read back from the database
len(test_data_from_sql.index)

24558

Finally, let's examine the column types and non-null counts of the DataFrame we read back.

In [9]:
# Summarize the DataFrame read back from SQL: index range, column names,
# non-null counts per column, and column dtypes
test_data_from_sql.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24558 entries, 0 to 24557
Data columns (total 30 columns):
level_0                    24558 non-null int64
index                      24558 non-null int64
num_sents                  24527 non-null float64
num_words                  24444 non-null float64
num_all_caps               24527 non-null float64
percent_all_caps           24444 non-null float64
num_exclms                 24527 non-null float64
percent_exclms             24444 non-null float64
num_apple_words            24527 non-null float64
percent_apple_words        24444 non-null float64
avg_words_per_sent         24447 non-null float64
num_paragraphs             24527 non-null float64
avg_sents_per_paragraph    24116 non-null float64
avg_words_per_paragraph    24116 non-null float64
num_images                 24527 non-null float64
num_videos                 24527 non-null float64
num_youtubes               24527 non-null float64
num_gifs                   24527 non-null flo