In [1]:
# Import dependencies
from pathlib import Path
from urllib.parse import quote

import pandas as pd
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from sqlalchemy import create_engine, exc, text

# user_credentials.py must be created locally. Initialize variables for username and password in this file.
import user_credentials

In [2]:
# Load the rated-routes CSV and expose its 'name_id' key under the clearer name 'route_id'
routes_csv = Path('data', 'routes_rated.csv')
routes_df = pd.read_csv(routes_csv).rename(columns={'name_id' : 'route_id'})
routes_df.head()

Unnamed: 0,route_id,country,crag,sector,name,tall_recommend_sum,grade_mean,cluster,rating_tot
0,0,and,montserrat,prohibitivo,diagonal de la x,-1,49.25,3,-0.045211
1,1,and,montserrat,prohibitivo,mehir,-1,49.0,2,0.116464
2,2,and,montserrat,prohibitivo,pas de la discordia,0,49.0,2,0.178722
3,3,and,tartareu,bombo suis,tenedor libre,0,44.333333,3,0.158449
4,4,arg,bandurrias,rincon,tendinitis,1,48.5,0,0.075797


In [3]:
# Normalize text casing: country codes in upper case, place and route names in title case
routes_df['country'] = routes_df['country'].str.upper()
for text_column in ('crag', 'sector', 'name'):
    routes_df[text_column] = routes_df[text_column].str.title()
routes_df.head()

Unnamed: 0,route_id,country,crag,sector,name,tall_recommend_sum,grade_mean,cluster,rating_tot
0,0,AND,Montserrat,Prohibitivo,Diagonal De La X,-1,49.25,3,-0.045211
1,1,AND,Montserrat,Prohibitivo,Mehir,-1,49.0,2,0.116464
2,2,AND,Montserrat,Prohibitivo,Pas De La Discordia,0,49.0,2,0.178722
3,3,AND,Tartareu,Bombo Suis,Tenedor Libre,0,44.333333,3,0.158449
4,4,ARG,Bandurrias,Rincon,Tendinitis,1,48.5,0,0.075797


In [4]:
# Create a new column called "style" which indicates if the route is preferred by short or tall climbers
def style(x):
    """Label a route by the sign of its tall-recommendation score.

    Args:
        x: Numeric score (the route's ``tall_recommend_sum`` value).

    Returns:
        'Tall' for a positive score, 'Short' for a negative score and
        'Neutral' otherwise (zero, or a value that compares neither way
        such as NaN).
    """
    if x > 0:
        return 'Tall'
    if x < 0:
        return 'Short'
    return 'Neutral'

# Derive the 'style' label for every route from its tall-recommendation score
tall_scores = routes_df['tall_recommend_sum']
routes_df['style'] = tall_scores.apply(style)
routes_df.head()

Unnamed: 0,route_id,country,crag,sector,name,tall_recommend_sum,grade_mean,cluster,rating_tot,style
0,0,AND,Montserrat,Prohibitivo,Diagonal De La X,-1,49.25,3,-0.045211,Short
1,1,AND,Montserrat,Prohibitivo,Mehir,-1,49.0,2,0.116464,Short
2,2,AND,Montserrat,Prohibitivo,Pas De La Discordia,0,49.0,2,0.178722,Neutral
3,3,AND,Tartareu,Bombo Suis,Tenedor Libre,0,44.333333,3,0.158449,Neutral
4,4,ARG,Bandurrias,Rincon,Tendinitis,1,48.5,0,0.075797,Tall


In [5]:
# The raw score is now captured by the 'style' column, so drop it from the dataframe
routes_df = routes_df.drop(columns='tall_recommend_sum')
routes_df.head()

Unnamed: 0,route_id,country,crag,sector,name,grade_mean,cluster,rating_tot,style
0,0,AND,Montserrat,Prohibitivo,Diagonal De La X,49.25,3,-0.045211,Short
1,1,AND,Montserrat,Prohibitivo,Mehir,49.0,2,0.116464,Short
2,2,AND,Montserrat,Prohibitivo,Pas De La Discordia,49.0,2,0.178722,Neutral
3,3,AND,Tartareu,Bombo Suis,Tenedor Libre,44.333333,3,0.158449,Neutral
4,4,ARG,Bandurrias,Rincon,Tendinitis,48.5,0,0.075797,Tall


In [6]:
# Store grade_mean as an integer to allow for grade conversions.
# Note: astype(int) truncates the fractional part (e.g. 48.5 becomes 48); it does not round.
grade_as_int = routes_df['grade_mean'].astype(int)
routes_df['grade_mean'] = grade_as_int
routes_df.head()

Unnamed: 0,route_id,country,crag,sector,name,grade_mean,cluster,rating_tot,style
0,0,AND,Montserrat,Prohibitivo,Diagonal De La X,49,3,-0.045211,Short
1,1,AND,Montserrat,Prohibitivo,Mehir,49,2,0.116464,Short
2,2,AND,Montserrat,Prohibitivo,Pas De La Discordia,49,2,0.178722,Neutral
3,3,AND,Tartareu,Bombo Suis,Tenedor Libre,44,3,0.158449,Neutral
4,4,ARG,Bandurrias,Rincon,Tendinitis,48,0,0.075797,Tall


In [7]:
# Connect to postgres and create a database
database_name = 'climbing_db'

# Pre-bind both handles so the cleanup below is safe even when connect() itself fails;
# otherwise the finally block would raise NameError and hide the real connection error.
conn = None
cursor = None
try:
    # Keyword arguments let psycopg2 quote the credentials itself, so passwords
    # containing spaces or quotes do not break the connection string.
    conn = psycopg2.connect(user=user_credentials.username, password=user_credentials.password)
    # CREATE DATABASE cannot run inside a transaction block, hence autocommit
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    cursor = conn.cursor()
    cursor.execute(f'CREATE DATABASE {database_name};')
except psycopg2.errors.DuplicateDatabase:
    print(f'{database_name} database already exists')
finally:
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()
# NOTE(review): this connection relies on the client's default host/port, while the next
# cell connects to localhost:5432 explicitly — confirm both reach the same server.

In [8]:
# Set connection to new created database using psycopg2
host = 'localhost'
port = '5432'
try:
    conn = psycopg2.connect(
        database=database_name,
        user=user_credentials.username,
        password=user_credentials.password,
        host=host,
        port=port,
    )
except psycopg2.OperationalError:
    # psycopg2.OperationalError is the documented top-level DB-API exception for connection failures.
    print("Database connection not successful")
    # Re-raise: every later cell needs a live connection, and continuing would leave
    # `conn` pointing at the connection that was already closed in the previous cell.
    raise

In [10]:
# Create the routes table (no-op if it already exists) over the psycopg2 connection.
# Column names and order mirror routes_df so the dataframe can be appended directly.
routes_table = 'routes'
table_creation = f'''
   CREATE TABLE IF NOT EXISTS {routes_table} (
       route_id INT PRIMARY KEY,
       country VARCHAR(3),
       crag TEXT NOT NULL,
       sector TEXT NOT NULL,
       name TEXT NOT NULL,
       grade_mean INT,
       cluster INT,
       rating_tot FLOAT,
       style TEXT NOT NULL
   );
'''
# This cursor is kept open on purpose: the verification query further down reuses it
cursor = conn.cursor()
cursor.execute(table_creation)
conn.commit()

In [11]:
# Insert dataframe into database table
try:
    # Percent-encode the credentials: characters such as '@', ':', '/' or '%' would
    # otherwise be parsed as URL delimiters and corrupt the connection URL.
    db_user = quote(str(user_credentials.username), safe='')
    db_password = quote(str(user_credentials.password), safe='')
    engine = create_engine(f'postgresql://{db_user}:{db_password}@{host}:{port}/{database_name}')
    routes_df.to_sql(routes_table, engine, if_exists='append', index=False)
except exc.IntegrityError:
    # Raised when the rows were already loaded by an earlier run (primary-key clash)
    print('Attempted to insert a duplicate key. Check whether your data is already present in the database.')

In [12]:
# Query from database to confirm data has been loaded
cursor.execute(f'SELECT * from {routes_table};')
route_data = cursor.fetchall()
conn.commit()  # ends the transaction that the SELECT implicitly opened

# Slice rather than index so a table holding fewer than five rows prints what it has
# instead of raising IndexError.
for row in route_data[:5]:
    print(row)
print(f'...\n\nTotal rows = {len(route_data)}')

(0, 'AND', 'Montserrat', 'Prohibitivo', 'Diagonal De La X', 49, 3, -0.045211449, 'Short')
(1, 'AND', 'Montserrat', 'Prohibitivo', 'Mehir', 49, 2, 0.116464061, 'Short')
(2, 'AND', 'Montserrat', 'Prohibitivo', 'Pas De La Discordia', 49, 2, 0.17872175, 'Neutral')
(3, 'AND', 'Tartareu', 'Bombo Suis', 'Tenedor Libre', 44, 3, 0.158449225, 'Neutral')
(4, 'ARG', 'Bandurrias', 'Rincon', 'Tendinitis', 48, 0, 0.075797184, 'Tall')
...

Total rows = 55858


In [22]:
# Load the climber data into a dataframe
climbers_csv = Path('data', 'climber_df.csv')
climber_df = pd.read_csv(climbers_csv)
climber_df.head()

Unnamed: 0,user_id,country,sex,height,weight,age,years_cl,date_first,date_last,grades_count,grades_first,grades_last,grades_max,grades_mean,year_first,year_last
0,1,SWE,0,177,73,41.0,21,1999-02-06 23:00:00,2001-07-31 22:00:00,84,36,55,62,46.75,1999,2001
1,3,SWE,0,180,78,44.0,22,1999-03-31 22:00:00,2000-07-19 22:00:00,12,53,51,59,52.833333,1999,2000
2,4,SWE,1,165,58,33.0,16,2004-06-30 22:00:00,2009-05-26 22:00:00,119,53,49,64,53.890756,2004,2009
3,10,SWE,0,167,63,52.0,25,2000-01-14 23:00:00,2017-06-01 22:00:00,298,53,49,63,49.40604,2000,2017
4,16,NOR,0,177,68,44.0,21,1998-02-27 23:00:00,2010-05-13 22:00:00,5,53,49,53,51.4,1998,2010


In [23]:
# Inspect the dtype of each column. date_first and date_last load as object (plain strings)
# and are converted to datetime in a later cell.
climber_df.dtypes

user_id           int64
country          object
sex               int64
height            int64
weight            int64
age             float64
years_cl          int64
date_first       object
date_last        object
grades_count      int64
grades_first      int64
grades_last       int64
grades_max        int64
grades_mean     float64
year_first        int64
year_last         int64
dtype: object

In [24]:
# Check whether any cell in the climber data is null (the run shown here returned False)
null_mask = climber_df.isnull()
null_mask.values.any()

False

In [25]:
# Replace sex column values where 0 = Male and 1 = Female.
# Assign the result back instead of calling replace(inplace=True) on the column selection:
# under pandas Copy-on-Write the selection is a separate object, so the in-place form
# leaves climber_df unchanged. Re-running the cell is harmless ('M'/'F' are not remapped).
climber_df['sex'] = climber_df['sex'].replace({0: 'M', 1: 'F'})
climber_df.head(10)

Unnamed: 0,user_id,country,sex,height,weight,age,years_cl,date_first,date_last,grades_count,grades_first,grades_last,grades_max,grades_mean,year_first,year_last
0,1,SWE,M,177,73,41.0,21,1999-02-06 23:00:00,2001-07-31 22:00:00,84,36,55,62,46.75,1999,2001
1,3,SWE,M,180,78,44.0,22,1999-03-31 22:00:00,2000-07-19 22:00:00,12,53,51,59,52.833333,1999,2000
2,4,SWE,F,165,58,33.0,16,2004-06-30 22:00:00,2009-05-26 22:00:00,119,53,49,64,53.890756,2004,2009
3,10,SWE,M,167,63,52.0,25,2000-01-14 23:00:00,2017-06-01 22:00:00,298,53,49,63,49.40604,2000,2017
4,16,NOR,M,177,68,44.0,21,1998-02-27 23:00:00,2010-05-13 22:00:00,5,53,49,53,51.4,1998,2010
5,17,SWE,M,193,78,42.0,17,2001-06-19 22:00:00,2002-04-30 22:00:00,4,36,36,36,34.5,2001,2002
6,19,BEL,M,180,68,36.0,21,2000-08-07 22:00:00,2002-03-11 23:00:00,32,49,46,49,37.25,2000,2002
7,28,CAN,M,180,68,45.0,29,2000-09-01 22:00:00,2017-08-25 22:00:00,86,53,46,64,50.395349,2000,2017
8,34,other,M,180,68,21.0,24,1999-12-31 23:00:00,2000-12-31 23:00:00,11,51,46,53,48.363636,2000,2001
9,38,GBR,M,178,73,35.0,24,2000-11-03 23:00:00,2017-07-21 22:00:00,323,55,55,71,57.736842,2000,2017


In [26]:
# Convert ages to age ranges to anonymize age data
max_age = int(climber_df['age'].max())
# Bin edges every 5 years; the last edge is at or above max_age
age_bins = list(range(0, max_age + 5, 5))
# pd.cut's default intervals exclude the left edge, so (0, 5] is labelled '1-5', (5, 10] '6-10', ...
age_labels = [f'{lower + 1}-{upper}' for lower, upper in zip(age_bins, age_bins[1:])]
climber_df['age'] = pd.cut(climber_df['age'], bins=age_bins, labels=age_labels)
climber_df.head()

Unnamed: 0,user_id,country,sex,height,weight,age,years_cl,date_first,date_last,grades_count,grades_first,grades_last,grades_max,grades_mean,year_first,year_last
0,1,SWE,M,177,73,41-45,21,1999-02-06 23:00:00,2001-07-31 22:00:00,84,36,55,62,46.75,1999,2001
1,3,SWE,M,180,78,41-45,22,1999-03-31 22:00:00,2000-07-19 22:00:00,12,53,51,59,52.833333,1999,2000
2,4,SWE,F,165,58,31-35,16,2004-06-30 22:00:00,2009-05-26 22:00:00,119,53,49,64,53.890756,2004,2009
3,10,SWE,M,167,63,51-55,25,2000-01-14 23:00:00,2017-06-01 22:00:00,298,53,49,63,49.40604,2000,2017
4,16,NOR,M,177,68,41-45,21,1998-02-27 23:00:00,2010-05-13 22:00:00,5,53,49,53,51.4,1998,2010


In [27]:
# Rename columns so the units (cm / kg) and the binned age are explicit
climber_df = climber_df.rename(
    columns={'height': 'height_cm', 'weight': 'weight_kg', 'age': 'age_range'}
)

# Parse the first/last ascent date strings into datetime values
for date_column in ('date_first', 'date_last'):
    climber_df[date_column] = pd.to_datetime(climber_df[date_column])
climber_df.dtypes

user_id                  int64
country                 object
sex                     object
height_cm                int64
weight_kg                int64
age_range             category
years_cl                 int64
date_first      datetime64[ns]
date_last       datetime64[ns]
grades_count             int64
grades_first             int64
grades_last              int64
grades_max               int64
grades_mean            float64
year_first               int64
year_last                int64
dtype: object

In [28]:
# Create the climbers table (no-op if it already exists) over the psycopg2 connection.
# NOTE(review): date_first/date_last are declared VARCHAR(20) although climber_df holds
# them as datetime64 — a TIMESTAMP column may be intended; confirm with downstream users.
# 'Year_first' is unquoted, so PostgreSQL folds it to lower case (year_first).
climber_table = 'climbers'
table_creation = f'''
   CREATE TABLE IF NOT EXISTS {climber_table}(
    user_id INT PRIMARY KEY,
    country VARCHAR (5) NOT NULL,
    sex CHAR(1) NOT NULL,
    height_cm INT,
    weight_kg INT,
    age_range VARCHAR(5) NOT NULL,
    years_cl INT,
	date_first VARCHAR (20),
	date_last VARCHAR(20),
	grades_count INT,
	grades_first INT,
	grades_last INT,
	grades_max INT,
	grades_mean FLOAT,
	Year_first INT,
	year_last INT)
'''
# This cursor is kept open on purpose: the verification query further down reuses it
cursor = conn.cursor()
cursor.execute(table_creation)
conn.commit()

In [29]:
# Insert dataframe into database table
try:
    # Percent-encode the credentials: characters such as '@', ':', '/' or '%' would
    # otherwise be parsed as URL delimiters and corrupt the connection URL.
    db_user = quote(str(user_credentials.username), safe='')
    db_password = quote(str(user_credentials.password), safe='')
    engine = create_engine(f'postgresql://{db_user}:{db_password}@{host}:{port}/{database_name}')
    climber_df.to_sql(climber_table, engine, if_exists='append', index=False)
except exc.IntegrityError:
    # Raised when the rows were already loaded by an earlier run (primary-key clash)
    print('Attempted to insert a duplicate key. Check whether your data is already present in the database.')

In [30]:
# Query from database to confirm data has been loaded
cursor.execute(f'SELECT * from {climber_table};')
climber_data = cursor.fetchall()
conn.commit()  # ends the transaction that the SELECT implicitly opened

# Slice rather than index so a table holding fewer than five rows prints what it has
# instead of raising IndexError.
for row in climber_data[:5]:
    print(row)
print(f'...\n\nTotal rows = {len(climber_data)}')

(1, 'SWE', 'M', 177, 73, '41-45', 21, '1999-02-06 23:00:00', '2001-07-31 22:00:00', 84, 36, 55, 62, 46.75, 1999, 2001)
(3, 'SWE', 'M', 180, 78, '41-45', 22, '1999-03-31 22:00:00', '2000-07-19 22:00:00', 12, 53, 51, 59, 52.833333333333336, 1999, 2000)
(4, 'SWE', 'F', 165, 58, '31-35', 16, '2004-06-30 22:00:00', '2009-05-26 22:00:00', 119, 53, 49, 64, 53.890756302521005, 2004, 2009)
(10, 'SWE', 'M', 167, 63, '51-55', 25, '2000-01-14 23:00:00', '2017-06-01 22:00:00', 298, 53, 49, 63, 49.40604026845637, 2000, 2017)
(16, 'NOR', 'M', 177, 68, '41-45', 21, '1998-02-27 23:00:00', '2010-05-13 22:00:00', 5, 53, 49, 53, 51.4, 1998, 2010)
...

Total rows = 10927


In [31]:
# Read country_codes.csv into a dataframe
countries_csv = Path('data', 'country_codes.csv')
countries_df = pd.read_csv(countries_csv)

In [32]:
# Create the countries lookup table (no-op if it already exists) over the psycopg2 connection
countries_table = 'countries'
table_creation = f'''
   CREATE TABLE IF NOT EXISTS {countries_table} (
       country_id VARCHAR(5) PRIMARY KEY,
       country VARCHAR(100)
   );
'''
cursor = conn.cursor()
cursor.execute(table_creation)
conn.commit()

In [33]:
# Insert dataframe into database table
try:
    # Percent-encode the credentials: characters such as '@', ':', '/' or '%' would
    # otherwise be parsed as URL delimiters and corrupt the connection URL.
    db_user = quote(str(user_credentials.username), safe='')
    db_password = quote(str(user_credentials.password), safe='')
    engine = create_engine(f'postgresql://{db_user}:{db_password}@{host}:{port}/{database_name}')
    countries_df.to_sql(countries_table, engine, if_exists='append', index=False)
except exc.IntegrityError:
    # Raised when the rows were already loaded by an earlier run (primary-key clash)
    print('Attempted to insert a duplicate key. Check whether your data is already present in the database.')

In [34]:
# Read clusters.csv into a dataframe
clusters_csv = Path('data', 'clusters.csv')
clusters_df = pd.read_csv(clusters_csv)

In [35]:
# Create the clusters lookup table (no-op if it already exists) over the psycopg2 connection
clusters_table = 'clusters'
table_creation = f'''
   CREATE TABLE IF NOT EXISTS {clusters_table} (
       cluster_id INT PRIMARY KEY,
       description VARCHAR
   );
'''
cursor = conn.cursor()
cursor.execute(table_creation)
conn.commit()

In [36]:
# Insert dataframe into database table
try:
    # Percent-encode the credentials: characters such as '@', ':', '/' or '%' would
    # otherwise be parsed as URL delimiters and corrupt the connection URL.
    db_user = quote(str(user_credentials.username), safe='')
    db_password = quote(str(user_credentials.password), safe='')
    engine = create_engine(f'postgresql://{db_user}:{db_password}@{host}:{port}/{database_name}')
    clusters_df.to_sql(clusters_table, engine, if_exists='append', index=False)
except exc.IntegrityError:
    # Raised when the rows were already loaded by an earlier run (primary-key clash)
    print('Attempted to insert a duplicate key. Check whether your data is already present in the database.')

In [37]:
# Read grades_conversion_table.csv into a dataframe
grades_csv = Path('data', 'grades_conversion_table.csv')
grades_df = pd.read_csv(grades_csv)

In [40]:
# Create the grade-conversion table (no-op if it already exists) over the psycopg2 connection
grades_table = 'grades'
table_creation = f'''
   CREATE TABLE IF NOT EXISTS {grades_table} (
       grade_id INT PRIMARY KEY,
       grade_fra VARCHAR(15),
       grade_yds VARCHAR(15),
       grade_v VARCHAR(15)
   );
'''
cursor = conn.cursor()
cursor.execute(table_creation)
conn.commit()

In [42]:
# Insert dataframe into database table
try:
    # Percent-encode the credentials: characters such as '@', ':', '/' or '%' would
    # otherwise be parsed as URL delimiters and corrupt the connection URL.
    db_user = quote(str(user_credentials.username), safe='')
    db_password = quote(str(user_credentials.password), safe='')
    engine = create_engine(f'postgresql://{db_user}:{db_password}@{host}:{port}/{database_name}')
    grades_df.to_sql(grades_table, engine, if_exists='append', index=False)
except exc.IntegrityError:
    # Raised when the rows were already loaded by an earlier run (primary-key clash)
    print('Attempted to insert a duplicate key. Check whether your data is already present in the database.')

Attempted to insert a duplicate key. Check whether your data is already present in the database.


In [43]:
# Release the psycopg2 cursor and connection used for the DDL and verification queries
cursor.close()
conn.close()
# The SQLAlchemy engine holds its own pooled connection, separate from `conn`;
# dispose of it so no session to the database is left open.
engine.dispose()