## Get going with Postgresql

In [181]:
import psycopg2
import pandas as pd 

#### Load a DB object with `connect` and `cursor`  
first create a Database connection object 

In [187]:
def create_connection(database, password, host="localhost",port="5432", user="postgres"):
    """"
    Description: This function create a postgresql connection object
    @ input params: database name,  password, and default values
    @ output params: create a connection
    """

    connection = None
    try:
        print('Connecting to the postgreSQL database ...')
        connection = psycopg2.connect(
            host=host, 
            port=port,
            database=database,
            user=user,
            password=password
        )
    except(Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if connection is not None:
            # connection.close()
            print('Database connection created.')
    
    return connection

In [188]:
conn = create_connection("users", "postgres")

Connecting to the postgreSQL database ...
Database connection created.


In [191]:
def load_data(connection, sql_query, values=None):
    """"
    Description: This function runs the data pull from postgresql and save it as a dataframe 
    @ input params: postgresql connection object, sql query to run 
    @ output params: return a dataframe
    """
    try:
        # create a cursor 
        cursor = connection.cursor()
        cursor.execute(sql_query, values) # None
        df = pd.DataFrame(cursor.fetchall())
        df.columns = [x[0] for x in cursor.description]
        cursor.close()
        return df
    except(Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if connection is not None:
            connection.commit()
            connection.close()
            print('Database connection terminated.')

    # return df 

In [206]:
schema = "public"
table_name = "aws"
sql_select = "SELECT COUNT(*) FROM {}.{}".format(schema, table_name) # public.aws
conn = create_connection("users", "postgres") ## just create the connection to postgresql 
psg_df = load_data(connection=conn, sql_query=sql_select)

Connecting to the postgreSQL database ...
Database connection created.
Database connection terminated.


In [209]:
psg_df['count'][0]

14

#### SQL `SELECT`

In [193]:
schema = "public"
table_name = "aws"
sql_select = "SELECT * FROM {}.{}".format(schema, table_name) # public.aws
values = None 
conn = create_connection("users", "postgres") ## just create the connection to postgresql 
psg_df = load_data(connection=conn, sql_query=sql_select, values=values) # this loads the data using connection and sql query 

Connecting to the postgreSQL database ...
Database connection created.
Database connection terminated.


In [194]:
print("Shape: ", psg_df.shape)
psg_df.head(10)

Shape:  (13, 4)


Unnamed: 0,article_id,author_id,viewer_id,view_date
0,1,3,5,2019-08-01
1,1,3,6,2019-08-02
2,2,7,6,2019-08-02
3,3,4,4,2019-07-21
4,3,4,4,2019-07-21
5,5,5,5,2019-07-24
6,1,1,3,2019-08-01
7,1,1,3,2019-08-01
8,10,11,12,2022-08-07
9,10,11,12,2022-08-07


#### SQL `INSERT`

In [201]:
schema = "public"
table_name = "aws"
sql_insert = """
    INSERT INTO {}.{} (article_id, author_id, viewer_id, view_date) 
        VALUES (%s,%s,%s,%s)""".format(schema,table_name)
values = (47,11, 23,'2022-08-01')
conn = create_connection("users", "postgres")
psg_df = load_data(connection=conn, sql_query=sql_insert, values=values)

Connecting to the postgreSQL database ...
Database connection created.
no results to fetch
Database connection terminated.


In [196]:
## this will throw error -- why???
print("Shape: ", psg_df.shape)
psg_df.head(10)

AttributeError: 'NoneType' object has no attribute 'shape'

#### SQL `DELETE`

In [198]:
schema = "public"
table_name = "aws"
sql_delete = "DELETE FROM {}.{} WHERE viewer_id = 23".format(schema, table_name)
values = None
conn = create_connection("users", "postgres")
psg_df = load_data(connection=conn, sql_query=sql_delete, values=values)

Connecting to the postgreSQL database ...
Database connection created.
no results to fetch
Database connection terminated.


In [None]:
## this will throw error -- why???
print("Shape: ", psg_df)
psg_df.head(10)

comming back to the sql select

In [None]:
sql_select = "SELECT * FROM {}.{}".format(schema, table_name)
conn = create_connection("users", "postgres")
psg_df = load_data(connection=conn, sql_query=sql_select)

this will not error though

In [None]:
print("Shape:, ", psg_df.shape)
psg_df.head(15)

#### comming to loading the sql data to pandas... 

In [210]:
schema = "public"
table_name = "customer"
sql_select = "SELECT * FROM {}.{}".format(schema, table_name)
values = None 
conn = create_connection("users", "postgres")
psg_df = load_data(connection=conn, sql_query=sql_select, values=values)

Connecting to the postgreSQL database ...
Database connection created.
Database connection terminated.


In [None]:
schema = "public"
table_name = "customer"
sql_select = "SELECT * FROM {}.{}".format(schema, table_name)
values = None 
conn = create_connection("users", "postgres")
psg_df = load_data(connection=conn, sql_query=sql_select, values=values)

In [None]:
schema = "public"
table_name = "customer"
sql_select = "SELECT * FROM {}.{}".format(schema, table_name)
values = None 
conn = create_connection("users", "postgres")
psg_df = load_data(connection=conn, sql_query=sql_select, values=values)

In [211]:
print("Shape: ", psg_df.shape)
psg_df.head()

Shape:  (599, 10)


Unnamed: 0,customer_id,store_id,first_name,last_name,email,address_id,activebool,create_date,last_update,active
0,524,1,Jared,Ely,jared.ely@sakilacustomer.org,530,True,2006-02-14,2013-05-26 14:49:45.738,1
1,1,1,Mary,Smith,mary.smith@sakilacustomer.org,5,True,2006-02-14,2013-05-26 14:49:45.738,1
2,2,1,Patricia,Johnson,patricia.johnson@sakilacustomer.org,6,True,2006-02-14,2013-05-26 14:49:45.738,1
3,3,1,Linda,Williams,linda.williams@sakilacustomer.org,7,True,2006-02-14,2013-05-26 14:49:45.738,1
4,4,2,Barbara,Jones,barbara.jones@sakilacustomer.org,8,True,2006-02-14,2013-05-26 14:49:45.738,1


In [212]:
# the sql kind
psg_df.query("first_name == 'Jared' and last_name == 'Ely'")

Unnamed: 0,customer_id,store_id,first_name,last_name,email,address_id,activebool,create_date,last_update,active
0,524,1,Jared,Ely,jared.ely@sakilacustomer.org,530,True,2006-02-14,2013-05-26 14:49:45.738,1


In [213]:
# pandas way 
psg_df[(psg_df['first_name'] == 'Jared') & (psg_df['last_name'] == 'Ely')]

Unnamed: 0,customer_id,store_id,first_name,last_name,email,address_id,activebool,create_date,last_update,active
0,524,1,Jared,Ely,jared.ely@sakilacustomer.org,530,True,2006-02-14,2013-05-26 14:49:45.738,1


In [214]:
# or
psg_df[(psg_df.first_name == 'Jared') & (psg_df.last_name == 'Ely')] # AND == & , OR == | 

Unnamed: 0,customer_id,store_id,first_name,last_name,email,address_id,activebool,create_date,last_update,active
0,524,1,Jared,Ely,jared.ely@sakilacustomer.org,530,True,2006-02-14,2013-05-26 14:49:45.738,1


now let us rename the column `email` to `emial_id`

In [217]:
psg_df.rename(columns={"email": "email_id"}, inplace=True)
psg_df.head()
# (row, column) (0,1 )

Unnamed: 0,customer_id,store_id,first_name,last_name,email_id,address_id,activebool,create_date,last_update,active
0,524,1,Jared,Ely,jared.ely@sakilacustomer.org,530,True,2006-02-14,2013-05-26 14:49:45.738,1
1,1,1,Mary,Smith,mary.smith@sakilacustomer.org,5,True,2006-02-14,2013-05-26 14:49:45.738,1
2,2,1,Patricia,Johnson,patricia.johnson@sakilacustomer.org,6,True,2006-02-14,2013-05-26 14:49:45.738,1
3,3,1,Linda,Williams,linda.williams@sakilacustomer.org,7,True,2006-02-14,2013-05-26 14:49:45.738,1
4,4,2,Barbara,Jones,barbara.jones@sakilacustomer.org,8,True,2006-02-14,2013-05-26 14:49:45.738,1


In [219]:
psg_df.tail(5)

Unnamed: 0,customer_id,store_id,first_name,last_name,email_id,address_id,activebool,create_date,last_update,active
594,595,1,Terrence,Gunderson,terrence.gunderson@sakilacustomer.org,601,True,2006-02-14,2013-05-26 14:49:45.738,1
595,596,1,Enrique,Forsythe,enrique.forsythe@sakilacustomer.org,602,True,2006-02-14,2013-05-26 14:49:45.738,1
596,597,1,Freddie,Duggan,freddie.duggan@sakilacustomer.org,603,True,2006-02-14,2013-05-26 14:49:45.738,1
597,598,1,Wade,Delvalle,wade.delvalle@sakilacustomer.org,604,True,2006-02-14,2013-05-26 14:49:45.738,1
598,599,2,Austin,Cintron,austin.cintron@sakilacustomer.org,605,True,2006-02-14,2013-05-26 14:49:45.738,1


let us check how man active and inactive users we have

In [218]:
psg_df.active.value_counts()

1    584
0     15
Name: active, dtype: int64

let's create a new column called `activitiy_status` and asssing `active` or `inactive` based on previous active column

In [220]:
psg_df['activity_status'] = psg_df['active'].map(lambda x: 'Active' if x==1 else 'Inactive')

In [221]:
psg_df.head()

Unnamed: 0,customer_id,store_id,first_name,last_name,email_id,address_id,activebool,create_date,last_update,active,activity_status
0,524,1,Jared,Ely,jared.ely@sakilacustomer.org,530,True,2006-02-14,2013-05-26 14:49:45.738,1,Active
1,1,1,Mary,Smith,mary.smith@sakilacustomer.org,5,True,2006-02-14,2013-05-26 14:49:45.738,1,Active
2,2,1,Patricia,Johnson,patricia.johnson@sakilacustomer.org,6,True,2006-02-14,2013-05-26 14:49:45.738,1,Active
3,3,1,Linda,Williams,linda.williams@sakilacustomer.org,7,True,2006-02-14,2013-05-26 14:49:45.738,1,Active
4,4,2,Barbara,Jones,barbara.jones@sakilacustomer.org,8,True,2006-02-14,2013-05-26 14:49:45.738,1,Active


In [222]:
psg_df.activity_status.value_counts()

Active      584
Inactive     15
Name: activity_status, dtype: int64

In [223]:
# or
psg_df.groupby('activity_status').groups

{'Active': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ...], 'Inactive': [16, 64, 124, 169, 241, 271, 315, 368, 406, 446, 482, 510, 533, 557, 591]}

In [224]:
psg_df.groupby('activity_status').groups.keys()

dict_keys(['Active', 'Inactive'])

let us now create a new column called `full_name` and combine first and last name

In [225]:
# first way
psg_df['full_name'] = psg_df['first_name'] + ' ' + psg_df['last_name']
psg_df.head() 

Unnamed: 0,customer_id,store_id,first_name,last_name,email_id,address_id,activebool,create_date,last_update,active,activity_status,full_name
0,524,1,Jared,Ely,jared.ely@sakilacustomer.org,530,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Jared Ely
1,1,1,Mary,Smith,mary.smith@sakilacustomer.org,5,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Mary Smith
2,2,1,Patricia,Johnson,patricia.johnson@sakilacustomer.org,6,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Patricia Johnson
3,3,1,Linda,Williams,linda.williams@sakilacustomer.org,7,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Linda Williams
4,4,2,Barbara,Jones,barbara.jones@sakilacustomer.org,8,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Barbara Jones


In [229]:
# second way
psg_df['full_name2'] = psg_df[['first_name','last_name']].apply(lambda x: ' '.join(x), axis=1)
psg_df.head() 

Unnamed: 0,customer_id,store_id,first_name,last_name,email_id,address_id,activebool,create_date,last_update,active,activity_status,full_name,full_name2
0,524,1,Jared,Ely,jared.ely@sakilacustomer.org,530,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Jared Ely,Jared Ely
1,1,1,Mary,Smith,mary.smith@sakilacustomer.org,5,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Mary Smith,Mary Smith
2,2,1,Patricia,Johnson,patricia.johnson@sakilacustomer.org,6,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Patricia Johnson,Patricia Johnson
3,3,1,Linda,Williams,linda.williams@sakilacustomer.org,7,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Linda Williams,Linda Williams
4,4,2,Barbara,Jones,barbara.jones@sakilacustomer.org,8,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Barbara Jones,Barbara Jones


In [230]:
# third way -- this can be used for multiple columns
psg_df['full_name3'] = psg_df[['first_name', 'last_name']].agg(' '.join, axis=1) 
psg_df.head()

Unnamed: 0,customer_id,store_id,first_name,last_name,email_id,address_id,activebool,create_date,last_update,active,activity_status,full_name,full_name2,full_name3
0,524,1,Jared,Ely,jared.ely@sakilacustomer.org,530,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Jared Ely,Jared Ely,Jared Ely
1,1,1,Mary,Smith,mary.smith@sakilacustomer.org,5,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Mary Smith,Mary Smith,Mary Smith
2,2,1,Patricia,Johnson,patricia.johnson@sakilacustomer.org,6,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Patricia Johnson,Patricia Johnson,Patricia Johnson
3,3,1,Linda,Williams,linda.williams@sakilacustomer.org,7,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Linda Williams,Linda Williams,Linda Williams
4,4,2,Barbara,Jones,barbara.jones@sakilacustomer.org,8,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Barbara Jones,Barbara Jones,Barbara Jones


In [255]:
psg_df.store_id.value_counts()

1    326
2    273
Name: store_id, dtype: int64

## Pandas - Split Column by Delimiter 

Now let us split the email_id column based on `@` and see the different domain names

first let us drop the two duplicated columns we created above 

In [231]:
psg_df.drop(columns=['full_name2', 'full_name3','address_id'], inplace=True)
psg_df.head()

Unnamed: 0,customer_id,store_id,first_name,last_name,email_id,activebool,create_date,last_update,active,activity_status,full_name
0,524,1,Jared,Ely,jared.ely@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Jared Ely
1,1,1,Mary,Smith,mary.smith@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Mary Smith
2,2,1,Patricia,Johnson,patricia.johnson@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Patricia Johnson
3,3,1,Linda,Williams,linda.williams@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Linda Williams
4,4,2,Barbara,Jones,barbara.jones@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Barbara Jones


In [232]:
psg_df.dtypes

customer_id                 int64
store_id                    int64
first_name                 object
last_name                  object
email_id                   object
activebool                   bool
create_date                object
last_update        datetime64[ns]
active                      int64
activity_status            object
full_name                  object
dtype: object

In [233]:
# default parameters pandas Series.str.split() function
psg_df['email_id'].str.split('@', n=-1, expand=False).head()

0           [jared.ely, sakilacustomer.org]
1          [mary.smith, sakilacustomer.org]
2    [patricia.johnson, sakilacustomer.org]
3      [linda.williams, sakilacustomer.org]
4       [barbara.jones, sakilacustomer.org]
Name: email_id, dtype: object

In [234]:
# # to split into multiple columns by delimiter
psg_df['email_id'].str.split("@", expand=True).head()

Unnamed: 0,0,1
0,jared.ely,sakilacustomer.org
1,mary.smith,sakilacustomer.org
2,patricia.johnson,sakilacustomer.org
3,linda.williams,sakilacustomer.org
4,barbara.jones,sakilacustomer.org


but what we are trying to do is to split it and add them as column

In [235]:
# split column and add new columns to df
psg_df[['emial_prefix', 'domain_name']] = psg_df['email_id'].str.split('@', expand=True)
# display the dataframe
print("Shape: ", psg_df.shape)
psg_df.head()

Shape:  (599, 13)


Unnamed: 0,customer_id,store_id,first_name,last_name,email_id,activebool,create_date,last_update,active,activity_status,full_name,emial_prefix,domain_name
0,524,1,Jared,Ely,jared.ely@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Jared Ely,jared.ely,sakilacustomer.org
1,1,1,Mary,Smith,mary.smith@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Mary Smith,mary.smith,sakilacustomer.org
2,2,1,Patricia,Johnson,patricia.johnson@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Patricia Johnson,patricia.johnson,sakilacustomer.org
3,3,1,Linda,Williams,linda.williams@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Linda Williams,linda.williams,sakilacustomer.org
4,4,2,Barbara,Jones,barbara.jones@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Barbara Jones,barbara.jones,sakilacustomer.org


In [238]:
# let's try 
# what is the distinct count by domain_name
len(psg_df.groupby('domain_name').groups['sakilacustomer.org'])

599

In [241]:
psg_df.domain_name.value_counts()

sakilacustomer.org    599
Name: domain_name, dtype: int64

In [242]:
psg_df.dtypes

customer_id                 int64
store_id                    int64
first_name                 object
last_name                  object
email_id                   object
activebool                   bool
create_date                object
last_update        datetime64[ns]
active                      int64
activity_status            object
full_name                  object
emial_prefix               object
domain_name                object
dtype: object

In [243]:
# let us convert create_date to datetime 
psg_df['create_dt_date'] = pd.to_datetime(psg_df.create_date)
psg_df.head()

Unnamed: 0,customer_id,store_id,first_name,last_name,email_id,activebool,create_date,last_update,active,activity_status,full_name,emial_prefix,domain_name,create_dt_date
0,524,1,Jared,Ely,jared.ely@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Jared Ely,jared.ely,sakilacustomer.org,2006-02-14
1,1,1,Mary,Smith,mary.smith@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Mary Smith,mary.smith,sakilacustomer.org,2006-02-14
2,2,1,Patricia,Johnson,patricia.johnson@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Patricia Johnson,patricia.johnson,sakilacustomer.org,2006-02-14
3,3,1,Linda,Williams,linda.williams@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Linda Williams,linda.williams,sakilacustomer.org,2006-02-14
4,4,2,Barbara,Jones,barbara.jones@sakilacustomer.org,True,2006-02-14,2013-05-26 14:49:45.738,1,Active,Barbara Jones,barbara.jones,sakilacustomer.org,2006-02-14


In [244]:
psg_df.dtypes

customer_id                 int64
store_id                    int64
first_name                 object
last_name                  object
email_id                   object
activebool                   bool
create_date                object
last_update        datetime64[ns]
active                      int64
activity_status            object
full_name                  object
emial_prefix               object
domain_name                object
create_dt_date     datetime64[ns]
dtype: object

#### add current date

In [245]:
# work with only a couple of fields for now 
cols = ["customer_id","first_name","last_name","email_id","create_date","last_update","activity_status","create_dt_date"]
test_df = pd.DataFrame(psg_df, columns=cols)
test_df.head()

Unnamed: 0,customer_id,first_name,last_name,email_id,create_date,last_update,activity_status,create_dt_date
0,524,Jared,Ely,jared.ely@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14
1,1,Mary,Smith,mary.smith@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14
2,2,Patricia,Johnson,patricia.johnson@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14
3,3,Linda,Williams,linda.williams@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14
4,4,Barbara,Jones,barbara.jones@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14


In [246]:
test_df.dtypes

customer_id                 int64
first_name                 object
last_name                  object
email_id                   object
create_date                object
last_update        datetime64[ns]
activity_status            object
create_dt_date     datetime64[ns]
dtype: object

In [247]:
# add todays date 
dt_format = "%Y-%m-%d %H:%M:%S"
from datetime import datetime
test_df['today_date'] = pd.to_datetime(datetime.today().strftime(dt_format))

In [248]:
test_df.head()

Unnamed: 0,customer_id,first_name,last_name,email_id,create_date,last_update,activity_status,create_dt_date,today_date
0,524,Jared,Ely,jared.ely@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16
1,1,Mary,Smith,mary.smith@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16
2,2,Patricia,Johnson,patricia.johnson@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16
3,3,Linda,Williams,linda.williams@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16
4,4,Barbara,Jones,barbara.jones@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16


In [249]:
test_df.dtypes

customer_id                 int64
first_name                 object
last_name                  object
email_id                   object
create_date                object
last_update        datetime64[ns]
activity_status            object
create_dt_date     datetime64[ns]
today_date         datetime64[ns]
dtype: object

In [250]:
# calcuate the difference between two dates 
test_df['diff_days'] = test_df['today_date'] - test_df['last_update']
test_df.head()

Unnamed: 0,customer_id,first_name,last_name,email_id,create_date,last_update,activity_status,create_dt_date,today_date,diff_days
0,524,Jared,Ely,jared.ely@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16,3359 days 00:41:30.262000
1,1,Mary,Smith,mary.smith@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16,3359 days 00:41:30.262000
2,2,Patricia,Johnson,patricia.johnson@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16,3359 days 00:41:30.262000
3,3,Linda,Williams,linda.williams@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16,3359 days 00:41:30.262000
4,4,Barbara,Jones,barbara.jones@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16,3359 days 00:41:30.262000


In [251]:
# convert the difference in terms of days 
import numpy as np
test_df['diff_days'] = test_df['diff_days'] / np.timedelta64(1,'D')
test_df.head()

Unnamed: 0,customer_id,first_name,last_name,email_id,create_date,last_update,activity_status,create_dt_date,today_date,diff_days
0,524,Jared,Ely,jared.ely@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16,3359.028822
1,1,Mary,Smith,mary.smith@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16,3359.028822
2,2,Patricia,Johnson,patricia.johnson@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16,3359.028822
3,3,Linda,Williams,linda.williams@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16,3359.028822
4,4,Barbara,Jones,barbara.jones@sakilacustomer.org,2006-02-14,2013-05-26 14:49:45.738,Active,2006-02-14,2022-08-06 15:31:16,3359.028822


In [252]:
print(min(test_df.diff_days))
print(max(test_df.diff_days))

3359.028822476852
3359.028822476852


In [253]:
print(min(test_df.create_date))
print(max(test_df.create_date))


2006-02-14
2006-02-14


In [254]:
print(min(test_df.last_update))
print(max(test_df.last_update))

2013-05-26 14:49:45.738000
2013-05-26 14:49:45.738000


#### REFELECTION

In [259]:
schema = "public"
table_name = "actor"
sql_select = "SELECT * FROM {}.{}".format(schema, table_name)
values = None 
conn = create_connection("users", "postgres")
psg_df = load_data(connection=conn, sql_query=sql_select, values=values)

Connecting to the postgreSQL database ...
Database connection created.
Database connection terminated.


```sql
SELECT
	customer_id,
	Total_spend,
	CASE WHEN Total_spend <= 30 THEN 'Bronze'
		 WHEN Total_spend <= 60 THEN 'Silver'
		 WHEN Total_spend <= 90 THEN 'Gold'
		 ELSE  'Diamon'
	END AS Customer_grading
FROM (
	SELECT customer_id, SUM(amount) AS Total_spend
	FROM payment
	GROUP BY customer_id
	ORDER BY 2 ) sub ;
```

In [260]:
psg_df.head()

Unnamed: 0,actor_id,first_name,last_name,last_update
0,1,Penelope,Guiness,2013-05-26 14:47:57.620
1,2,Nick,Wahlberg,2013-05-26 14:47:57.620
2,3,Ed,Chase,2013-05-26 14:47:57.620
3,4,Jennifer,Davis,2013-05-26 14:47:57.620
4,5,Johnny,Lollobrigida,2013-05-26 14:47:57.620


In [269]:
psg_df.shape

(200, 4)

In [270]:
psg_df.groupby(by=['first_name'])['first_name'].sum()

first_name
Adam           AdamAdam
Al                   Al
Alan               Alan
Albert     AlbertAlbert
Alec               Alec
               ...     
Whoopi           Whoopi
Will               Will
William         William
Woody        WoodyWoody
Zero               Zero
Name: first_name, Length: 128, dtype: object