# RotoWire dataset

## 1. Basic Stats
![stats](datastat.png)
Summaries are written by professionals. The writing is colloquial but relatively **well structured**. 

## 2. EDA

In [1]:
import random
from collections import defaultdict

import pandas as pd
from IPython.core.display import display, HTML

pd.set_option('display.max_columns', 30)
display(HTML("<style>.container { width:100% !important; }</style>")) # expand cell width for visibility

In [2]:
train_path = 'data/rotowire/train.json'
valid_path = 'data/rotowire/valid.json'
test_path = 'data/rotowire/test.json'

train = pd.read_json(train_path)
valid = pd.read_json(valid_path)
test = pd.read_json(test_path)

### 2.1 raw data
- box_score: 선수들의 기록. 그런데 선수별로 정리되어있지 않음
- home_line: 홈팀 기록
- vis_line: 원정팀 기록
- summary: '정답' text

In [3]:
train.head()

Unnamed: 0,box_score,day,home_city,home_line,home_name,summary,vis_city,vis_line,vis_name
0,"{'FIRST_NAME': {'24': 'Ron', '25': 'Sasha', '2...",122516,New York,"{'TEAM-PTS_QTR2': '20', 'TEAM-FT_PCT': '89', '...",Knicks,"[The, Celtics, saw, great, team, play, in, the...",Boston,"{'TEAM-PTS_QTR2': '34', 'TEAM-FT_PCT': '75', '...",Celtics
1,"{'TO': {'24': 'N/A', '25': 'N/A', '20': '2', '...",40415,Phoenix,"{'TEAM-PTS_QTR2': '34', 'TEAM-FT_PCT': '76', '...",Suns,"[The, Phoenix, Suns, (, 39, -, 38, ), escaped,...",Utah,"{'TEAM-PTS_QTR2': '19', 'TEAM-FT_PCT': '83', '...",Jazz
2,"{'FIRST_NAME': {'24': 'Shabazz', '20': 'Al-Far...",32817,Portland,"{'TEAM-PTS_QTR2': '34', 'TEAM-FT_PCT': '73', '...",Trail Blazers,"[The, host, Portland, Trail, Blazers, defeated...",Denver,"{'TEAM-PTS_QTR2': '29', 'TEAM-FT_PCT': '80', '...",Nuggets
3,"{'TO': {'24': 'N/A', '25': 'N/A', '20': '0', '...",110314,Los Angeles,"{'TEAM-PTS_QTR2': '30', 'TEAM-FT_PCT': '88', '...",Clippers,"[The, Los, Angeles, Clippers, (, 3, -, 1, ), d...",Utah,"{'TEAM-PTS_QTR2': '23', 'TEAM-FT_PCT': '79', '...",Jazz
4,"{'TO': {'24': '1', '25': '0', '20': '0', '21':...",121916,Chicago,"{'TEAM-PTS_QTR2': '34', 'TEAM-FT_PCT': '100', ...",Bulls,"[Wire-to-wire, wins, are, uncommon, in, the, N...",Detroit,"{'TEAM-PTS_QTR2': '15', 'TEAM-FT_PCT': '85', '...",Pistons


### 2.2 column별로 보기
#### 2.2.1 boxscore 선수별로 보기

In [4]:
def get_ith_record(data, i):
    assert i is None or i <= len(data)
    if i is None:
        i = random.randint(0, len(data))
    return data.iloc[i, :]

In [5]:
def view_player_stat(data, i=None):
    """i번째 경기 기록 중 box score을 선수별로 정리하기"""
    record = get_ith_record(data, i)
    boxscore = record['box_score']
    idx_to_name = boxscore['PLAYER_NAME']
    boxscore_by_player = defaultdict(dict)
    for r_type, r_entity_value in boxscore.items():
        # if 'NAME' in r_type: continue
        for entity_idx, value in r_entity_value.items():
            boxscore_by_player[idx_to_name[entity_idx]][r_type] = value
    return pd.DataFrame.from_dict(boxscore_by_player, orient='index')

In [6]:
i = 3
view_player_stat(train)

Unnamed: 0,TO,FIRST_NAME,MIN,REB,FG3A,PLAYER_NAME,AST,FG3M,OREB,FGM,START_POSITION,PF,PTS,FGA,STL,FTA,BLK,DREB,FTM,FT_PCT,FG_PCT,FG3_PCT,SECOND_NAME,TEAM_CITY
Aaron Gordon,2,Aaron,31,16,0,Aaron Gordon,0,0,6,6,F,1,12,12,0,0,1,10,0,0,50,0,Gordon,Orlando
Andrew Nicholson,0,Andrew,3,0,0,Andrew Nicholson,0,0,0,0,,0,0,0,0,0,0,0,0,0,0,0,Nicholson,Orlando
Boban Marjanovic,0,Boban,3,0,0,Boban Marjanovic,0,0,0,0,,0,2,0,0,2,1,0,2,100,0,0,Marjanovic,San Antonio
Boris Diaw,0,Boris,17,6,0,Boris Diaw,2,0,2,1,C,0,2,1,1,0,0,4,0,0,100,0,Diaw,San Antonio
Channing Frye,1,Channing,17,6,4,Channing Frye,0,1,1,3,,3,7,7,0,0,0,5,0,0,43,25,Frye,Orlando
Danny Green,2,Danny,25,6,7,Danny Green,2,2,0,3,G,5,8,9,1,0,0,6,0,0,33,29,Green,San Antonio
David West,2,David,17,3,0,David West,1,0,0,3,,4,6,5,0,0,0,3,0,0,60,0,West,San Antonio
Dewayne Dedmon,0,Dewayne,3,0,0,Dewayne Dedmon,0,0,0,1,,1,2,2,0,0,0,0,0,0,50,0,Dedmon,Orlando
Elfrid Payton,1,Elfrid,32,5,1,Elfrid Payton,8,0,3,3,G,2,6,12,1,1,1,2,0,0,25,0,Payton,Orlando
Evan Fournier,3,Evan,18,1,2,Evan Fournier,0,1,1,2,,1,5,5,0,0,0,0,0,0,40,50,Fournier,Orlando


#### 2.2.2 team stat 보기

In [7]:
def view_team_stat(data, i, team):
    """i번째 기록에서 home 혹은 away 팀에 대한 기록 보기"""
    assert team in {'home', 'away'}
    def rename(team):
        rename_dict = {'home': 'home_line', 'away': 'vis_line'}
        return rename_dict[team]
    record = get_ith_record(data, i)
    team_stat = record[rename(team)]
    return pd.DataFrame(team_stat, index=[0])

In [8]:
view_team_stat(train, i, 'home')

Unnamed: 0,TEAM-PTS_QTR2,TEAM-FT_PCT,TEAM-PTS_QTR1,TEAM-PTS_QTR4,TEAM-PTS_QTR3,TEAM-CITY,TEAM-PTS,TEAM-AST,TEAM-LOSSES,TEAM-NAME,TEAM-WINS,TEAM-REB,TEAM-TOV,TEAM-FG3_PCT,TEAM-FG_PCT
0,30,88,26,34,17,Los Angeles,107,28,1,Clippers,3,35,10,44,51


In [9]:
view_team_stat(train, i, 'away')

Unnamed: 0,TEAM-PTS_QTR2,TEAM-FT_PCT,TEAM-PTS_QTR1,TEAM-PTS_QTR4,TEAM-PTS_QTR3,TEAM-CITY,TEAM-PTS,TEAM-AST,TEAM-LOSSES,TEAM-NAME,TEAM-WINS,TEAM-REB,TEAM-TOV,TEAM-FG3_PCT,TEAM-FG_PCT
0,23,79,20,31,27,Utah,101,24,3,Jazz,1,43,17,37,47


In [10]:
df = view_player_stat(train, 3)
df2 = view_team_stat(train, 3, 'away')
df3 = view_team_stat(train, 3, 'home')
types = set(list(df.columns) + list(df2.columns) + list(df3.columns))
print(sorted(types), len(types))

['AST', 'BLK', 'DREB', 'FG3A', 'FG3M', 'FG3_PCT', 'FGA', 'FGM', 'FG_PCT', 'FIRST_NAME', 'FTA', 'FTM', 'FT_PCT', 'MIN', 'OREB', 'PF', 'PLAYER_NAME', 'PTS', 'REB', 'SECOND_NAME', 'START_POSITION', 'STL', 'TEAM-AST', 'TEAM-CITY', 'TEAM-FG3_PCT', 'TEAM-FG_PCT', 'TEAM-FT_PCT', 'TEAM-LOSSES', 'TEAM-NAME', 'TEAM-PTS', 'TEAM-PTS_QTR1', 'TEAM-PTS_QTR2', 'TEAM-PTS_QTR3', 'TEAM-PTS_QTR4', 'TEAM-REB', 'TEAM-TOV', 'TEAM-WINS', 'TEAM_CITY', 'TO'] 39


#### 2.2.3 summary 보기

In [11]:
def view_summary(data, i=None):
    """i번째 기록에 대한 summary text보기"""
    record = get_ith_record(data, i)
    return ' '.join(record['summary'])

In [12]:
view_summary(train, i)

"The Los Angeles Clippers ( 3 - 1 ) defeated the Utah Jazz ( 1 - 3 ) 107 - 101 on Monday . Chris Paul recorded the NBA 's first triple - double of the 2014 - 15 season , putting up 13 points , 10 rebounds , and 12 assists in 36 minutes . He also threw in two steals and a blocked shot for good measure . Teammate Blake Griffin helped Paul convert some of those assists , shooting 14 - of - 21 from the field and finishing with 31 points in 38 minutes . J.J. Redick started at shooting guard but played a season - low 16 minutes as coach Doc Rivers opted to go with Jamal Crawford most of the night . Crawford missed the Clippers ' last game with a rib injury but returned to put up 19 points off the bench in 34 minutes . Gordon Hayward put up a LeBron James-esque line of 27 points , seven rebounds , and five assists in 36 minutes . He was aided in keeping the game close by starting power forward Enes Kanter who put up 17 points and nine rebounds in 27 minutes . The Jazz were relatively efficien

### 2.3 한 번에 보기

In [13]:
data = train
i = 189

In [14]:
display(view_team_stat(train, i, 'home'))
display(view_team_stat(train, i, 'away'))
display(view_summary(train, i))
display(view_player_stat(train, i))

Unnamed: 0,TEAM-PTS_QTR2,TEAM-FT_PCT,TEAM-PTS_QTR1,TEAM-PTS_QTR4,TEAM-PTS_QTR3,TEAM-CITY,TEAM-PTS,TEAM-AST,TEAM-LOSSES,TEAM-NAME,TEAM-WINS,TEAM-REB,TEAM-TOV,TEAM-FG3_PCT,TEAM-FG_PCT
0,28,78,34,15,26,Minnesota,103,21,37,Timberwolves,27,46,11,29,44


Unnamed: 0,TEAM-PTS_QTR2,TEAM-FT_PCT,TEAM-PTS_QTR1,TEAM-PTS_QTR4,TEAM-PTS_QTR3,TEAM-CITY,TEAM-PTS,TEAM-AST,TEAM-LOSSES,TEAM-NAME,TEAM-WINS,TEAM-REB,TEAM-TOV,TEAM-FG3_PCT,TEAM-FG_PCT
0,29,77,24,26,23,Golden State,102,25,13,Warriors,52,43,12,29,47


"The Minnesota Timberwolves defeated the visiting Golden State Warriors 103 - 102 , at Target Center , on Friday evening . It was a dramatic contest , as the Timberwolves jumped out to a 10 - point lead , with a 34 - 24 first quarter . The Warriors responded in the second , where they out - scored the T-Wolves by one . The Timberwolves came back strong in the third , ensuring themselves a 12 - point lead heading into the final quarter . Minnesota then extended their lead to 14 , however the Warriors went on a 130 run to bring the game within one point with 7:28 to go . It was then back and forth , with the T-Wolves staying on top , until the Warriors tied it up with 2:37 left . Minnesota battled ahead once again but almost let it slip away as they were up by one , when Andrew Wiggins missed two free throws , allowing the Warriors the opportunity to go up by one on the next possession . Wiggins made up for his error though , as he sank two free throws with 12 seconds left to seal the on

Unnamed: 0,FIRST_NAME,MIN,FGM,REB,FG3A,PLAYER_NAME,AST,FG3M,OREB,TO,START_POSITION,PF,PTS,FGA,STL,FTA,BLK,DREB,FTM,FT_PCT,FG_PCT,FG3_PCT,SECOND_NAME,TEAM_CITY
Adreian Payne,Adreian,,,,,Adreian Payne,,,,,,,,,,,,,,,,,Payne,Minnesota
Andre Iguodala,Andre,32.0,4.0,2.0,1.0,Andre Iguodala,1.0,0.0,2.0,0.0,,0.0,8.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,67.0,0.0,Iguodala,Golden State
Andrew Wiggins,Andrew,37.0,9.0,4.0,1.0,Andrew Wiggins,0.0,1.0,2.0,4.0,F,3.0,24.0,21.0,2.0,7.0,0.0,2.0,5.0,71.0,43.0,100.0,Wiggins,Minnesota
Brandon Rush,Brandon,20.0,1.0,3.0,1.0,Brandon Rush,3.0,0.0,2.0,0.0,G,1.0,2.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,33.0,0.0,Rush,Minnesota
Cole Aldrich,Cole,,,,,Cole Aldrich,,,,,,,,,,,,,,,,,Aldrich,Minnesota
David West,David,13.0,0.0,4.0,1.0,David West,1.0,0.0,0.0,0.0,,1.0,0.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,West,Golden State
Draymond Green,Draymond,38.0,1.0,7.0,3.0,Draymond Green,7.0,0.0,3.0,0.0,F,4.0,5.0,8.0,2.0,4.0,1.0,4.0,3.0,75.0,13.0,0.0,Green,Golden State
Gorgui Dieng,Gorgui,24.0,5.0,4.0,0.0,Gorgui Dieng,0.0,0.0,0.0,0.0,F,2.0,12.0,10.0,1.0,2.0,1.0,4.0,2.0,100.0,50.0,0.0,Dieng,Minnesota
Ian Clark,Ian,13.0,4.0,4.0,3.0,Ian Clark,2.0,2.0,1.0,0.0,,0.0,10.0,7.0,1.0,0.0,0.0,3.0,0.0,0.0,57.0,67.0,Clark,Golden State
JaVale McGee,JaVale,9.0,1.0,3.0,0.0,JaVale McGee,0.0,0.0,1.0,1.0,,2.0,3.0,2.0,0.0,2.0,3.0,2.0,1.0,50.0,50.0,0.0,McGee,Golden State


In [19]:
df1 = view_player_stat(train, 3)
df2 = view_team_stat(train, 3, 'home')

In [27]:
len(df2.columns)

15