In [1]:
# Import standard-library helpers
from urllib.parse import quote

# Import Pandas and NumPy
import pandas as pd
import numpy as np

# Import SQL Alchemy
from sqlalchemy import create_engine
import psycopg2

# Import Password
from config import password

In [2]:
# userprofile
# Relative path of the raw user-profile CSV
userprofile = "Resources/userprofile.csv"

# Load the CSV into a DataFrame, decoding the file as ISO-8859-1 text
userprofile_df = pd.read_csv(
    filepath_or_buffer=userprofile,
    encoding="ISO-8859-1",
)

# Preview the first five rows
userprofile_df.head(n=5)

Unnamed: 0,userID,latitude,longitude,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,birth_year,interest,personality,religion,activity,color,weight,budget,height
0,U1001,22.139997,-100.978803,False,abstemious,informal,family,on foot,single,independent,1989,variety,thrifty-protector,none,student,black,69,medium,1.77
1,U1002,22.150087,-100.983325,False,abstemious,informal,family,public,single,independent,1990,technology,hunter-ostentatious,Catholic,student,red,40,low,1.87
2,U1003,22.119847,-100.946527,False,social drinker,formal,family,public,single,independent,1989,none,hard-worker,Catholic,student,blue,60,low,1.69
3,U1004,18.867,-99.183,False,abstemious,informal,family,public,single,independent,1940,variety,hard-worker,none,professional,green,44,medium,1.53
4,U1005,22.183477,-100.959891,False,abstemious,no preference,family,public,single,independent,1992,none,thrifty-protector,Catholic,student,black,65,medium,1.69


Data Cleaning

In [3]:
# Show every column name in the loaded frame, to decide which ones to keep
userprofile_df.columns

Index(['userID', 'latitude', 'longitude', 'smoker', 'drink_level',
       'dress_preference', 'ambience', 'transport', 'marital_status', 'hijos',
       'birth_year', 'interest', 'personality', 'religion', 'activity',
       'color', 'weight', 'budget', 'height'],
      dtype='object')

In [4]:
# Narrow the frame to the eight columns used in the rest of the notebook
userprofile_df = userprofile_df.loc[
    :,
    [
        "userID",
        "latitude",
        "longitude",
        "smoker",
        "drink_level",
        "ambience",
        "transport",
        "religion",
    ],
]
userprofile_df.head(n=5)

Unnamed: 0,userID,latitude,longitude,smoker,drink_level,ambience,transport,religion
0,U1001,22.139997,-100.978803,False,abstemious,family,on foot,none
1,U1002,22.150087,-100.983325,False,abstemious,family,public,Catholic
2,U1003,22.119847,-100.946527,False,social drinker,family,public,Catholic
3,U1004,18.867,-99.183,False,abstemious,family,public,none
4,U1005,22.183477,-100.959891,False,abstemious,family,public,Catholic


In [5]:
# Lower-case the identifier column name: "userID" -> "userid"
userprofile_df = userprofile_df.rename({"userID": "userid"}, axis="columns")

In [6]:
# Swap the "?" placeholder for the explicit label "Not Recorded".
# The replacement is a string, not NaN, so these cells are not reported by isna().
userprofile_df = userprofile_df.replace(to_replace="?", value="Not Recorded")
userprofile_df.head(n=5)

Unnamed: 0,userid,latitude,longitude,smoker,drink_level,ambience,transport,religion
0,U1001,22.139997,-100.978803,False,abstemious,family,on foot,none
1,U1002,22.150087,-100.983325,False,abstemious,family,public,Catholic
2,U1003,22.119847,-100.946527,False,social drinker,family,public,Catholic
3,U1004,18.867,-99.183,False,abstemious,family,public,none
4,U1005,22.183477,-100.959891,False,abstemious,family,public,Catholic


In [7]:
# Spot-check user U1024: its former "?" entries should now read "Not Recorded"
userID = userprofile_df[userprofile_df["userid"] == "U1024"]
userID

Unnamed: 0,userid,latitude,longitude,smoker,drink_level,ambience,transport,religion
23,U1024,22.154021,-100.976028,Not Recorded,abstemious,Not Recorded,Not Recorded,none


In [8]:
# Spot-check a second user, U1039: its former "?" entry should now read "Not Recorded"
userID = userprofile_df[userprofile_df["userid"] == "U1039"]
userID

Unnamed: 0,userid,latitude,longitude,smoker,drink_level,ambience,transport,religion
38,U1039,23.738067,-99.139906,False,social drinker,friends,Not Recorded,none


In [9]:
# Count missing/null values per column.
# Only real nulls are counted here; cells holding the string "Not Recorded" are not.
userprofile_df.isna().sum()

userid         0
latitude       0
longitude      0
smoker         0
drink_level    0
ambience       0
transport      0
religion       0
dtype: int64

In [10]:
# Gather every row that exactly repeats an earlier row
duplicate_rows_df = userprofile_df.loc[userprofile_df.duplicated()]
# The value printed is the (rows, columns) shape, so its first number is the duplicate count
print(f"Number of duplicate rows: {duplicate_rows_df.shape}")

Number of duplicate rows: (0, 8)


Database

In [11]:
# Connect to Postgres.
# The password is percent-encoded before it goes into the URL, so characters
# such as "@", ":", "/" or "%" cannot be mistaken for URL delimiters.
# NOTE(review): assumes config.password holds the raw, un-encoded password.
engine = create_engine(
    f"postgresql://postgres:{quote(str(password), safe='')}@localhost/restaurant_rating_db"
)
# Open one connection, kept in `conn`, for the query cell further down
conn = engine.connect()

In [12]:
# Write the cleaned frame into the user_profile table.
# Rows are appended to the existing table, so running this cell again inserts
# the same rows a second time; the DataFrame index is not written.
userprofile_df.to_sql(
    name="user_profile",
    con=engine,
    if_exists="append",
    index=False,
)

In [13]:
# Read the table back over the open connection to confirm the insert
USER_PROFILE = pd.read_sql(sql="SELECT * FROM USER_PROFILE", con=conn)

# Show the first ten rows returned
USER_PROFILE.head(n=10)

Unnamed: 0,userid,latitude,longitude,smoker,drink_level,ambience,transport,religion
0,U1001,22.139997,-100.978803,False,abstemious,family,on foot,none
1,U1002,22.150087,-100.983325,False,abstemious,family,public,Catholic
2,U1003,22.119847,-100.946527,False,social drinker,family,public,Catholic
3,U1004,18.867,-99.183,False,abstemious,family,public,none
4,U1005,22.183477,-100.959891,False,abstemious,family,public,Catholic
5,U1006,22.15,-100.983,True,social drinker,friends,car owner,none
6,U1007,22.118464,-100.938256,False,casual drinker,solitary,public,Catholic
7,U1008,22.122989,-100.923811,False,social drinker,solitary,public,Catholic
8,U1009,22.159427,-100.990448,False,abstemious,family,on foot,Catholic
9,U1010,22.190889,-100.998669,False,social drinker,friends,car owner,none
