This notebook is used to clean the data and get it into a relational table format.

In [3]:
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize

# Unpack the nested map structure to the needed user data in a table

In [47]:
# Load the raw per-user tick records; lines=True parses the file as one JSON object per line.
users_packed = pd.read_json('../data/user_ticks.json', lines=True)

In [48]:
# Summarise the raw frame: column names, non-null counts, dtypes and memory usage.
users_packed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7350 entries, 0 to 7349
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   _id      7350 non-null   object
 1   user_id  7350 non-null   int64 
 2   ticks    7350 non-null   object
dtypes: int64(1), object(2)
memory usage: 172.4+ KB


In [65]:
# Flatten the nested 'ticks' value of every user row into one table with a
# row per tick, tagging each row with the 'user_id' it came from.
#
# The per-user frames are collected in a list and concatenated once at the end.
# Growing a DataFrame with DataFrame.append inside the loop copies the whole
# table on every iteration (quadratic cost), and DataFrame.append no longer
# exists in pandas 2.0+.
tick_frames = []
for user_id, ticks in zip(users_packed['user_id'], users_packed['ticks']):
    user_ticks = pd.DataFrame(ticks)
    user_ticks['user_id'] = user_id
    tick_frames.append(user_ticks)

# ignore_index=True gives the combined table a fresh 0..n-1 index.
users = pd.concat(tick_frames, ignore_index=True)

In [75]:
# Rename the camelCase tick fields to snake_case, then store both id columns as int32.
snake_case_names = {
    'routeId': 'route_id',
    'leadStyle': 'lead_style',
    'tickId': 'tick_id',
    'userStars': 'user_stars',
    'userRating': 'user_rating',
}
id_dtypes = {'route_id': 'int32', 'tick_id': 'int32'}
users = users.rename(columns=snake_case_names).astype(id_dtypes)

In [165]:
# Check the flattened tick table: column names, non-null counts and dtypes.
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 576208 entries, 0 to 576207
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   route_id     576208 non-null  int32  
 1   date         576208 non-null  object 
 2   pitches      576208 non-null  float64
 3   notes        576208 non-null  object 
 4   style        576208 non-null  object 
 5   lead_style   576208 non-null  object 
 6   tick_id      576208 non-null  int32  
 7   user_stars   576208 non-null  float64
 8   user_rating  576208 non-null  object 
 9   user_id      576208 non-null  int64  
dtypes: float64(2), int32(2), int64(1), object(5)
memory usage: 44.0+ MB


In [166]:
# Show the flattened tick table as the cell output.
users

Unnamed: 0,route_id,date,pitches,notes,style,lead_style,tick_id,user_stars,user_rating,user_id
0,106002151,2019-11-02,1.0,,TR,,118003759,-1.0,,200527767
1,106241707,2019-11-02,1.0,,TR,,118003758,-1.0,,200527767
2,106589615,2019-11-02,1.0,,Lead,,118003757,-1.0,,200527767
3,106002147,2019-11-02,1.0,,Lead,,117978517,-1.0,,200527767
4,106810828,2019-11-02,1.0,,TR,,117978240,-1.0,,200527767
...,...,...,...,...,...,...,...,...,...,...
576203,107723683,2017-12-30,1.0,onsight,,,113971775,-1.0,,106930862
576204,106224432,2017-12-30,2.0,onsight,,,113971766,3.0,,106930862
576205,106224397,2017-12-30,1.0,flash,,,113971764,2.0,,106930862
576206,105895574,2017-12-22,1.0,onsight,,,113948901,-1.0,,106930862


In [167]:
# Write the tick table to CSV with a header row and without the index column.
users.to_csv('../data/user_ticks.csv', header=True, index=False)

# Convert routes to a table and add numerical rating

In [4]:
# Load the raw route records; lines=True parses the file as one JSON object per line.
routes = pd.read_json('../data/routes.json', lines=True)

In [5]:
# Summarise the raw routes frame: column names, non-null counts and dtypes.
routes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80495 entries, 0 to 80494
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   _id          80495 non-null  object 
 1   id           80495 non-null  int64  
 2   name         80495 non-null  object 
 3   type         80495 non-null  object 
 4   rating       80495 non-null  object 
 5   stars        80495 non-null  float64
 6   starVotes    80495 non-null  int64  
 7   pitches      80495 non-null  object 
 8   location     80495 non-null  object 
 9   url          80495 non-null  object 
 10  imgSqSmall   80495 non-null  object 
 11  imgSmall     80495 non-null  object 
 12  imgSmallMed  80495 non-null  object 
 13  imgMedium    80495 non-null  object 
 14  longitude    80495 non-null  float64
 15  latitude     80495 non-null  float64
dtypes: float64(3), int64(2), object(11)
memory usage: 9.8+ MB


In [6]:
# Drop the Mongo id, image and url columns, keep one row per route id, and
# switch the remaining camelCase names to snake_case.
unused_columns = ['_id','imgSqSmall','imgSmall','imgSmallMed','imgMedium', 'url']
routes = (
    routes
    .drop(columns=unused_columns)
    .drop_duplicates(subset='id', ignore_index=True)
    .rename(columns={'id': 'route_id', 'starVotes': 'star_votes'})
)


def _pitch_count(raw_pitches):
    """Return the pitch count as an int, or 1.0 when the raw value is falsy."""
    if not raw_pitches:
        return 1.0
    return int(raw_pitches)


routes['pitches'] = routes['pitches'].apply(_pitch_count)

In [7]:
def _first_token(rating):
    """Return the first whitespace-separated token of ``rating``, or '' if there is none."""
    tokens = rating.split()
    return tokens[0] if tokens else ''


# The base rating is the leading token of the full rating string
# (e.g. '5.11a A0' -> '5.11a').
routes['base_rating'] = routes['rating'].apply(_first_token)

A complex rating conversion scheme was developed so that routes can be compared statistically using a numerical rating. Only routes with a "5.x" or "Vx" rating are converted; every other rating is left without a numerical value. An extra .01 is added to the "Vx" ratings so that they can be distinguished from the "5.x" ratings.

In [40]:
# Build the lookup table from a base rating string to a numerical rating.
# Only non-empty base ratings that start with '5' or 'V' are considered.
rating_list = [rating for rating in sorted(routes.loc[:,'base_rating'].unique()) if rating and rating[0] in '5V']
rating_conv = {}
# Offset added for the letter / sign suffix that follows the number.
alpha_conv = {'a': .2, 'b': .4, 'c': .6, 'd': .8, 'a/b': .3, 'b/c': .5, 'c/d': .7, '+': .3, '-': -.3}
for rating in rating_list:
    if rating[0] == 'V':
        # 'V' ratings carry an extra .01 so they stay distinguishable from '5.x' values.
        if len(rating) == 2 and rating[1].isdigit():
            # Single digit, no suffix: 'V0' .. 'V9'.
            rating_conv[rating] = int(rating[1]) + .01
        elif len(rating) > 2 and rating[2].isdigit():
            # Two-digit number in rating[1:3], optionally followed by a suffix.
            if len(rating) == 4:
                # One-character suffix, e.g. 'V10+' or 'V10-'.
                rating_conv[rating] = np.round(int(rating[1:3]) + .01 + alpha_conv[rating[3]],2)
            elif len(rating) > 4:
                # Longer suffix (e.g. a range such as 'V10-11'): placed half a grade up.
                rating_conv[rating] = np.round(int(rating[1:3]) + .51, 2)
            else:
                # Exactly three characters, e.g. 'V10'.
                rating_conv[rating] = np.round(int(rating[1:3]) + .01, 2)
        # NOTE(review): any other 'V' rating (e.g. a single digit followed by a
        # suffix such as 'V5+') matches neither branch and gets no entry, so it
        # ends up without a numerical rating -- confirm this is intended.
    elif len(rating) == 3:
        # Three characters, e.g. '5.7': the digit at index 2 is the rating.
        rating_conv[rating] = int(rating[2])
    elif len(rating) > 3 and rating[2] == '1':
        # Two-digit number in rating[2:4], e.g. '5.10' or '5.11b/c'.
        if len(rating) > 4:
            rating_conv[rating] = int(rating[2:4]) + alpha_conv[rating[4:]]
        else:
            rating_conv[rating] = int(rating[2:4])
    else:
        # Single digit followed by a suffix, e.g. '5.9+'.
        rating_conv[rating] = int(rating[2]) + alpha_conv[rating[3:]]

# Reverse lookup (number -> rating string). Different ratings can share a number
# (the 'a/b' and '+' suffixes are both .3); the entry inserted last wins.
rev_rating_conv = {v:k for k,v in rating_conv.items()}

# Persist both tables. The context managers make sure each file is flushed and
# closed as soon as it has been written.
with open("rating_conv.json", 'w') as conv_file:
    json.dump(rating_conv, conv_file)
with open("rev_rating_conv.json", 'w') as rev_conv_file:
    json.dump(rev_rating_conv, rev_conv_file)

In [41]:
# Look up every base rating in the conversion table; ratings without an entry become NaN.
routes['num_rating'] = routes['base_rating'].apply(lambda base: rating_conv.get(base, np.nan))

In [42]:
# Check the cleaned routes table; the non-null count of num_rating shows how many ratings were converted.
routes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76202 entries, 0 to 76201
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   route_id     76202 non-null  int64  
 1   name         76202 non-null  object 
 2   type         76202 non-null  object 
 3   rating       76202 non-null  object 
 4   stars        76202 non-null  float64
 5   star_votes   76202 non-null  int64  
 6   pitches      76202 non-null  float64
 7   location     76202 non-null  object 
 8   longitude    76202 non-null  float64
 9   latitude     76202 non-null  float64
 10  base_rating  76202 non-null  object 
 11  num_rating   71151 non-null  float64
dtypes: float64(5), int64(2), object(5)
memory usage: 7.0+ MB


In [43]:
# Show the cleaned routes table as the cell output.
routes

Unnamed: 0,route_id,name,type,rating,stars,star_votes,pitches,location,longitude,latitude,base_rating,num_rating
0,105806397,The Grand Wall,"Trad, Aid",5.11a A0,4.9,324,9.0,"[International, North America, Canada, British...",-123.1480,49.6822,5.11a,11.20
1,105806955,Exasperator,Trad,5.10c,5.0,450,2.0,"[International, North America, Canada, British...",-123.1481,49.6823,5.10c,10.60
2,105947052,Klahanie Crack,Trad,5.7,4.7,465,1.0,"[International, North America, Canada, British...",-123.1578,49.6696,5.7,7.00
3,105842838,Crime of the Century,Trad,5.11b/c,4.8,229,1.0,"[International, North America, Canada, British...",-123.1378,49.7050,5.11b/c,11.50
4,107198282,Skywalker,Trad,5.8,4.5,473,5.0,"[International, North America, Canada, British...",-123.1583,49.6693,5.8,8.00
...,...,...,...,...,...,...,...,...,...,...,...,...
76197,106430462,Animal Acts,Boulder,V5,4.0,32,1.0,"[Texas, Hueco Tanks, West Mountain, Scream]",-106.0456,31.9129,V5,5.01
76198,106561511,Sunnyside Bench Regular Route,Trad,5.5,3.5,268,3.0,"[California, Yosemite National Park, Yosemite ...",-119.5949,37.7508,5.5,5.00
76199,107610087,Super Final,Sport,5.12b,4.5,25,1.0,"[Tennessee, Obed & Clear Creek, South Clear Cr...",-84.7083,36.0947,5.12b,12.40
76200,107741178,Minnowmaker,Sport,5.12a/b,4.7,11,1.0,"[Utah, Wasatch Range, Logan, China Wall]",-111.6422,41.7854,5.12a/b,12.30


In [44]:
# Write the routes table to CSV with a header row and without the index column.
routes.to_csv('../data/routes.csv', header=True, index=False)