In [50]:
# Import the necessary library we need for our data preparation: pandas package:
import pandas as pd

In [51]:
# Read the consumer survey data from a CSV file into a dataframe.
# The path is relative, so it is resolved against the current working directory:
survey = pd.read_csv('data/consumer_survey.csv')

In [52]:
# Display the dataframe to get a first look at its rows and columns:
survey

Unnamed: 0,question,group,item,agreement_percent
0,Current frequency of buying organic foods,Age 14-29,Exclusively,3
1,Current frequency of buying organic foods,Age 14-29,Frequently,30
2,Current frequency of buying organic foods,Age 14-29,Occasionally,51
3,Current frequency of buying organic foods,Age 14-29,Never,16
4,Current frequency of buying organic foods,Age 30-49,Exclusively,3
...,...,...,...,...
103,Reasons for buying organic foods,Total,No use of genetically changed organisms,74
104,Reasons for buying organic foods,Total,Products as natural as possible,90
105,Reasons for buying organic foods,Total,Regional origin and support for regional firms,88
106,Reasons for buying organic foods,Total,Social standards and fair income for producers,79


In [53]:
# Show dataframe info (number of entries, column names, non-null counts and dtypes):
survey.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   question           108 non-null    object
 1   group              108 non-null    object
 2   item               108 non-null    object
 3   agreement_percent  108 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 3.5+ KB


In [54]:
# We have no null values in our dataframe.

In [55]:
# Add variable for item category

# Define function that assigns item category:
import numpy as np
def get_item_category(item):
    """Map a survey item label to its broader item category.

    Parameters
    ----------
    item : the item label to classify (compared by equality against the
        known labels).

    Returns
    -------
    The category name as a string, or NaN when the label belongs to no
    known category (for example the purchase-frequency answers such as
    'Exclusively').
    """
    # Each entry pairs a category with the item labels that belong to it.
    categories = (
        ('Animal ethics concerns', ('Animal welfare',)),
        ('Health concerns', (
            'Avoidance of pesticide residues',
            'Fewer additive substances',
            'Healthy diet',
            'No use of genetically changed organisms',
            'Products as natural as possible',
        )),
        ('Environmental concerns', ('Contribution to biodiversity',)),
        ('Preference for regional production', ('Regional origin and support for regional firms',)),
        ('Social concerns', ('Social standards and fair income for producers',)),
        ('Taste preferences', ('Taste',)),
    )

    # Return the first category whose label list contains the item.
    for category, labels in categories:
        if item in labels:
            return category

    # No category matched.
    return np.nan
    
# Derive each row's item category from its item label and attach it as a new column:
item_categories = survey['item'].apply(get_item_category)
survey = survey.assign(item_category=item_categories)

# Inspect the dataframe with the new column:
survey

Unnamed: 0,question,group,item,agreement_percent,item_category
0,Current frequency of buying organic foods,Age 14-29,Exclusively,3,
1,Current frequency of buying organic foods,Age 14-29,Frequently,30,
2,Current frequency of buying organic foods,Age 14-29,Occasionally,51,
3,Current frequency of buying organic foods,Age 14-29,Never,16,
4,Current frequency of buying organic foods,Age 30-49,Exclusively,3,
...,...,...,...,...,...
103,Reasons for buying organic foods,Total,No use of genetically changed organisms,74,Health concerns
104,Reasons for buying organic foods,Total,Products as natural as possible,90,Health concerns
105,Reasons for buying organic foods,Total,Regional origin and support for regional firms,88,Preference for regional production
106,Reasons for buying organic foods,Total,Social standards and fair income for producers,79,Social concerns


In [56]:
# We now want to upload the dataframe to the database on the server.

In [57]:
# Import sql_functions.py because we need some functions from that module:
import sql_functions as sqlf

# We need to restart the kernel and rerun at this point if we changed the module since we first imported it.

In [58]:
# Create the database engine via get_engine() from the sql_functions module.
# The upload cell further down checks whether engine is None before using it:
engine = sqlf.get_engine()

In [59]:
# Database schema that the table will be written to:
schema = 'capstone_organicfood'

# Name of the database table that will hold the survey dataframe:
table_name = 'consumer_survey'

In [60]:
# We need psycopg2 so that we can catch its DatabaseError if the upload fails:
import psycopg2

In [61]:
# Write records stored in the dataframe to SQL database.
# If no engine is available (engine is None), the upload is skipped with a message.
if engine is not None:  # identity check is the idiomatic way to test for None
    try:
        survey.to_sql(
            name=table_name,      # name of the target SQL table
            con=engine,           # engine or connection
            schema=schema,        # target database schema
            if_exists='replace',  # Drop the table before inserting new values
            index=False,          # Do NOT write the DataFrame index as a column
            chunksize=5000,       # Number of rows in each batch written at a time
            method='multi',       # Pass multiple values in a single INSERT clause
        )
        print(f"The {table_name} table was imported successfully.")
    # Error handling: print the problem instead of aborting the notebook.
    # Note: Exception already covers psycopg2.DatabaseError, so every error
    # raised by the upload ends up in this handler.
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        # Reset the engine; a rerun of this cell then takes the 'No engine' branch.
        engine = None
else:
    print('No engine')

The consumer_survey table was imported successfully.


In [62]:
# Test: query the newly created table to count the rows (we know from above that the dataframe has 108 cases).
# The table name is taken from table_name so the check always targets the table written above:
sqlf.get_dataframe(f'SELECT COUNT(*) FROM {schema}.{table_name};')

Unnamed: 0,count
0,108


In [63]:
# Worked!