In [1]:
import pandas as pd
import numpy as np
import datetime
import os
import psycopg2
from sqlalchemy import create_engine
from decimal import Decimal

In [2]:
def get_connection():

    user=os.environ['PG_USER']
    password=os.environ['PG_PASSWORD']
    host=os.environ['PG_HOST']
    port=os.environ['PG_PORT']
    database=os.environ['PG_DBNAME']
    
    conn_string = f'postgresql://{user}:{password}@{host}:{port}/{database}'
    db = create_engine(conn_string)
    
    return db.connect()


In [44]:
customers_df = pd.read_csv('datasets/Customers.csv', encoding='unicode_escape')

In [52]:
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15266 entries, 0 to 15265
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   CustomerKey  15266 non-null  int64         
 1   Gender       15266 non-null  object        
 2   Name         15266 non-null  object        
 3   City         15266 non-null  object        
 4   State Code   15266 non-null  object        
 5   State        15266 non-null  object        
 6   Zip Code     15266 non-null  object        
 7   Country      15266 non-null  object        
 8   Continent    15266 non-null  object        
 9   Birthday     15266 non-null  object        
 10  fBirthday    15266 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(9)
memory usage: 1.3+ MB


In [11]:
len(customers_df['CustomerKey'].unique())

15266

In [12]:
customers_df['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [14]:
len(customers_df['City'].unique())

8258

In [20]:
customers_df.isna().sum()

CustomerKey     0
Gender          0
Name            0
City            0
State Code     10
State           0
Zip Code        0
Country         0
Continent       0
Birthday        0
dtype: int64

In [35]:
customers_df[customers_df['State Code'].isna()]

Unnamed: 0,CustomerKey,Gender,Name,City,State Code,State,Zip Code,Country,Continent,Birthday
5304,729681,Female,Rossana Padovesi,Polvica,,Napoli,80035,Italy,Europe,4/18/1981
5316,732289,Male,Indro Piccio,Varcaturo,,Napoli,80014,Italy,Europe,2/24/1949
5372,742042,Male,Amaranto Loggia,Casaferro,,Napoli,80034,Italy,Europe,3/14/1936
5377,742886,Female,Edmonda Capon,Terzigno,,Napoli,80040,Italy,Europe,8/6/1963
5378,743343,Female,Ambra Sagese,Pomigliano D'Arco,,Napoli,80038,Italy,Europe,1/5/1961
5485,759705,Male,Callisto Lo Duca,Casilli,,Napoli,80047,Italy,Europe,8/28/1976
5525,765589,Male,Michelino Lucchesi,Pompei Scavi,,Napoli,80045,Italy,Europe,11/13/1947
5531,766410,Male,Adelmio Beneventi,Licola,,Napoli,80078,Italy,Europe,1/13/1940
5631,781667,Female,Ilda Manna,Napoli,,Napoli,80134,Italy,Europe,5/8/1977
5695,789177,Male,Calogero Folliero,Mariglianella,,Napoli,80030,Italy,Europe,3/3/2000


In [42]:
customers_df.loc[customers_df['State'] == 'Napoli']

Unnamed: 0,CustomerKey,Gender,Name,City,State Code,State,Zip Code,Country,Continent,Birthday


In [38]:
customers_df.loc[customers_df['Country'] == 'Italy']

Unnamed: 0,CustomerKey,Gender,Name,City,State Code,State,Zip Code,Country,Continent,Birthday
5116,700308,Female,Nicoletta Angelo,Natile,RC,Reggio Calabria,89030,Italy,Europe,12/18/1946
5117,700309,Male,Filippo Trevisani,Pietrelcina,BN,Benevento,82020,Italy,Europe,10/1/1995
5118,700354,Female,Elga Sagese,Rettorgole,VI,Vicenza,36030,Italy,Europe,11/17/1957
5119,700619,Male,Flaviano Castiglione,Brazzolo,FE,Ferrara,44035,Italy,Europe,2/28/1945
5120,700909,Male,Davide Siciliani,Tamara,FE,Ferrara,44030,Italy,Europe,1/18/1943
...,...,...,...,...,...,...,...,...,...,...
5756,799323,Male,Domenico Fiorentino,Circello,BN,Benevento,82020,Italy,Europe,8/6/1946
5757,799366,Male,Berardo Onio,San Biagio In Padule,MO,Modena,41038,Italy,Europe,10/21/1944
5758,799620,Female,Ilda Ricci,Bagno Di Gavorrano,GR,Grosseto,58021,Italy,Europe,8/12/1936
5759,799689,Male,Francesco Sabbatini,Savoniero,MO,Modena,41046,Italy,Europe,3/25/1944


In [48]:
customers_df.loc[customers_df['State'] == 'Napoli']

Unnamed: 0,CustomerKey,Gender,Name,City,State Code,State,Zip Code,Country,Continent,Birthday
5304,729681,Female,Rossana Padovesi,Polvica,,Napoli,80035,Italy,Europe,4/18/1981
5316,732289,Male,Indro Piccio,Varcaturo,,Napoli,80014,Italy,Europe,2/24/1949
5372,742042,Male,Amaranto Loggia,Casaferro,,Napoli,80034,Italy,Europe,3/14/1936
5377,742886,Female,Edmonda Capon,Terzigno,,Napoli,80040,Italy,Europe,8/6/1963
5378,743343,Female,Ambra Sagese,Pomigliano D'Arco,,Napoli,80038,Italy,Europe,1/5/1961
5485,759705,Male,Callisto Lo Duca,Casilli,,Napoli,80047,Italy,Europe,8/28/1976
5525,765589,Male,Michelino Lucchesi,Pompei Scavi,,Napoli,80045,Italy,Europe,11/13/1947
5531,766410,Male,Adelmio Beneventi,Licola,,Napoli,80078,Italy,Europe,1/13/1940
5631,781667,Female,Ilda Manna,Napoli,,Napoli,80134,Italy,Europe,5/8/1977
5695,789177,Male,Calogero Folliero,Mariglianella,,Napoli,80030,Italy,Europe,3/3/2000


In [47]:
customers_df.loc[customers_df['State'] == 'Napoli', 'State Code'] = 'NA'

In [50]:
customers_df['fBirthday'] = pd.to_datetime(customers_df['Birthday'], format='mixed', dayfirst=True)

In [51]:
customers_df

Unnamed: 0,CustomerKey,Gender,Name,City,State Code,State,Zip Code,Country,Continent,Birthday,fBirthday
0,301,Female,Lilly Harding,WANDEARAH EAST,SA,South Australia,5523,Australia,Australia,7/3/1939,1939-03-07
1,325,Female,Madison Hull,MOUNT BUDD,WA,Western Australia,6522,Australia,Australia,9/27/1979,1979-09-27
2,554,Female,Claire Ferres,WINJALLOK,VIC,Victoria,3380,Australia,Australia,5/26/1947,1947-05-26
3,786,Male,Jai Poltpalingada,MIDDLE RIVER,SA,South Australia,5223,Australia,Australia,9/17/1957,1957-09-17
4,1042,Male,Aidan Pankhurst,TAWONGA SOUTH,VIC,Victoria,3698,Australia,Australia,11/19/1965,1965-11-19
...,...,...,...,...,...,...,...,...,...,...,...
15261,2099600,Female,Denisa Duková,Houston,TX,Texas,77017,United States,North America,3/25/1936,1936-03-25
15262,2099618,Male,Justin Solórzano,Mclean,VA,Virginia,22101,United States,North America,2/16/1992,1992-02-16
15263,2099758,Male,Svend Petrussen,Wilmington,NC,North Carolina,28405,United States,North America,11/9/1937,1937-09-11
15264,2099862,Female,Lorenza Rush,Riverside,CA,California,92501,United States,North America,10/12/1937,1937-12-10


In [53]:
customers_df['Country'].unique()

array(['Australia', 'Canada', 'Germany', 'France', 'Italy', 'Netherlands',
       'United Kingdom', 'United States'], dtype=object)

In [55]:
customers_df['Continent'].unique()

array(['Australia', 'North America', 'Europe'], dtype=object)

In [56]:
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15266 entries, 0 to 15265
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   CustomerKey  15266 non-null  int64         
 1   Gender       15266 non-null  object        
 2   Name         15266 non-null  object        
 3   City         15266 non-null  object        
 4   State Code   15266 non-null  object        
 5   State        15266 non-null  object        
 6   Zip Code     15266 non-null  object        
 7   Country      15266 non-null  object        
 8   Continent    15266 non-null  object        
 9   Birthday     15266 non-null  object        
 10  fBirthday    15266 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(9)
memory usage: 1.3+ MB


In [57]:
#create table cp2_customers(customer_key integer not null, gender varchar(10) not null, customer_name varchar(255) not null, city varchar(255) not null, state_code varchar(255) not null, customer_state varchar(255) not null, zip_code varchar(255) not null, country varchar(255) not null, continent varchar(255) not null, birthday date not null, primary key(customer_key))

customers_df[['CustomerKey','Gender','Name','City','State Code','State','Zip Code','Country','Continent','fBirthday']].to_sql('cp2_customers', get_connection(), if_exists='replace', index=False)


266

In [59]:
customers_df.duplicated().unique()

array([False])

In [64]:
customers_df.drop(['CustomerKey'], axis=1).duplicated().unique()

array([False])