In [None]:
import subprocess
import json
import pandas as pd
import os

# Allow up to 1000 columns and 1000 rows to be rendered when a DataFrame is displayed.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [None]:
import importlib
import data
importlib.reload(data)
from data import DataLoader

# Identifier passed to DataLoader below; the name suggests the Premier League 2024/25
# tournament calendar — TODO confirm.
prem2425 = '9n12waklv005j8r32sfjj2eqc'

# NOTE(review): `prem_loader` is commented out here, but later cells call
# prem_loader.execute_bq_query(...) and iterate prem_loader.matches. On a fresh kernel
# those cells raise NameError unless this line is restored.
# prem_loader = DataLoader(prem2425)

# StatsPerform API + BigQuery backloading

In [None]:
def _access_statsperform_api(feed_name: str,
                             tourney_cal_id: str = None,
                             match_id: str = None
                             ):

    proxy_url = "http://127.0.0.1:3128"
    statsperform_base_url = 'https://api.performfeeds.com/soccerdata'
    auth_key = str(os.environ['STATSPERFORM_API_KEY'])

    master_dict = {
        '_rt':'b',
        '_fmt':'json'
    }

    if feed_name == 'match':
        assert tourney_cal_id is not None, "To access match feed data, a tournament calendar ID must be passed in."
        master_dict['tmcl'] = tourney_cal_id
        master_dict['_pgSz'] = 1000
    if feed_name == 'matchstats':
        assert match_id is not None, "To access match stats feed, a match ID must be passed in."
        master_dict['fx'] = match_id
        master_dict['detailed'] = 'yes'
        master_dict['people'] = 'yes'

    query_string = '&'.join([f'{k}={v}' for k, v in master_dict.items()])

    q_command = f""" curl -x "{proxy_url}" '{statsperform_base_url}/{feed_name}/{auth_key}/authorized?&{query_string}' """ if feed_name == 'tournamentcalendar' else \
        f""" curl -x "{proxy_url}" '{statsperform_base_url}/{feed_name}/{auth_key}?&{query_string}' """

    # print(q_command)

    process = subprocess.run(
                q_command,
                shell=True,
                capture_output=True,
                check=False # Set to True if you want a CalledProcessError for non-zero exit codes
            )

    stdout = process.stdout
    json_output = json.loads(stdout)
    return json_output


## tournaments

In [None]:

def get_statsperform_tourneys(to_bq=False):
    """Return one row per (competition, tournament calendar) from the 'tournamentcalendar' feed.

    Args:
        to_bq: When True, also try to create the BigQuery table
            ``soccer_simulations.tourneycal_data`` in project ``prizepicksanalytics``
            (``if_exists='fail'``). The upload is best-effort: any failure is reported
            and the frame is still returned.

    Returns:
        DataFrame with the competition columns followed by the renamed tournament
        calendar columns.
    """
    feed = 'tournamentcalendar'
    comps = _access_statsperform_api(feed)
    competitions = pd.DataFrame(comps['competition'])
    # One row per tournament calendar entry of each competition.
    competitions = competitions.explode('tournamentCalendar')

    extended_tourneys = pd.json_normalize(competitions['tournamentCalendar'])
    # NOTE(review): columns are renamed by position, which assumes the feed always yields
    # exactly these nine fields in this order — confirm against the feed.
    extended_tourneys.columns = ['tourneyCalId', 'includesVenues', 'tcOcId', 'tcName', 'startDate', 'endDate', 'active', 'lastUpdated', 'includesStandings']

    # Both sides get a fresh 0..n-1 index so the column-wise concat aligns row by row.
    final_df = pd.concat([competitions.drop(columns=['tournamentCalendar']).reset_index(drop=True),
                      extended_tourneys.reset_index(drop=True)], axis=1)

    if to_bq:
        try:
            final_df.to_gbq('soccer_simulations.tourneycal_data',
                    'prizepicksanalytics',
                    if_exists='fail')
        except Exception as e:
            # Still best-effort, but report the real cause: an existing table is only one
            # of several ways the upload can fail (credentials, network, schema, ...).
            print(f"Tournament calendar upload to BQ skipped ({type(e).__name__}): {e}")

    return final_df

## matches

In [None]:
# Get all matches with MA1 feed
def _get_all_matches_in_tourneycal(tourney_cal_id: str):
    """Return a DataFrame with one row per match of a tournament calendar.

    Each row is the 'matchInfo' object of an entry in the 'match' feed response.
    """
    feed_payload = _access_statsperform_api(feed_name='match',
                                            tourney_cal_id=tourney_cal_id)

    match_infos = []
    for entry in feed_payload['match']:
        match_infos.append(entry['matchInfo'])

    return pd.DataFrame(match_infos)

## process data

In [None]:
# NOTE(review): `tourney_cal_id` is not assigned in any visible cell, so this relies on a
# value already present in the kernel (possibly `prem2425` — TODO confirm).
matches = _get_all_matches_in_tourneycal(tourney_cal_id)

In [None]:
from tqdm.notebook import tqdm_notebook
import config
import warnings
warnings.filterwarnings('ignore')

In [None]:
%%time
# NOTE(review): this cell runs before the cell that defines `backload_season_data`, and
# `tourney_cal_id` is not assigned in any visible cell; on a fresh top-to-bottom run it
# raises NameError.
teams, players = backload_season_data(tourney_cal_id)

In [None]:

# getting access to player data using MA2 feed
def backload_season_data(tourney_cal_id,
                         game_limit=None,
                         project_name: str = 'prizepicksanalytics',
                         table_name: str = 'soccer_simulations.schema_match_data'):
    """Fetch match stats for every not-yet-loaded match of a tournament calendar and append them to BigQuery.

    Args:
        tourney_cal_id: Tournament calendar ID whose matches are backloaded.
        game_limit: Optional cap on how many unprocessed matches are handled in this call.
            A falsy value (None or 0) means no cap.
        project_name: BigQuery project that receives the data.
        table_name: Table name template; the substring 'schema' is replaced with 'team'
            and 'player' to obtain the two destination tables.

    Returns:
        ``(team_frame, player_frame)`` holding the rows of ALL matches processed in this
        call, or ``(None, None)`` when there was nothing to process.
    """
    matches = _get_all_matches_in_tourneycal(tourney_cal_id)

    # A set makes the "already processed?" check O(1) per match.
    processed_match_ids = set(get_processed_match_ids().match_id)
    pending_match_ids = [m for m in matches.id.unique() if m not in processed_match_ids]

    if game_limit:
        pending_match_ids = pending_match_ids[:game_limit]

    all_teams = []
    all_players = []

    for i, match_id in enumerate(pending_match_ids):
        print(f"Now processing match #{i}: {match_id}")
        team, player = process_game_data(match_id)

        all_players.append(player)
        all_teams.append(team)

    if not (all_teams and all_players):
        return None, None

    # Build both frames before uploading so a concat failure cannot leave only the team
    # table updated.
    team_data_to_bq = pd.concat(all_teams, axis=0, ignore_index=True)
    player_data_to_bq = pd.concat(all_players, axis=0, ignore_index=True)

    team_data_to_bq.to_gbq(table_name.replace('schema', 'team'),
            project_name,
            if_exists='append')

    player_data_to_bq.to_gbq(table_name.replace('schema', 'player'),
            project_name,
            if_exists='append')

    print(f"Team and Player Data Uploaded for the following Game Ids: {team_data_to_bq.match_id.unique()}")

    # Return everything uploaded, not just the frames of the last match in the loop.
    return team_data_to_bq, player_data_to_bq

In [None]:
# players.to_csv('./test_players.csv', index=False)
# teams.to_csv('./test_teams.csv', index=False)
# players = pd.read_csv('./test_players.csv', index_col=0)
# teams = pd.read_csv('./test_teams.csv', index_col=0)

In [None]:
from google.cloud import bigquery as bqc


# DataFrame should be of only one match
def check_bq_duplicates(dataframe: pd.DataFrame,
                        schema: str,
                        project_name: str = 'prizepicksanalytics',
                        table_name: str = f'soccer_simulations.schema_match_data'):
    """Drop rows of ``dataframe`` that are already stored in the matching BigQuery table.

    Args:
        dataframe: Non-empty team or player rows with a ``match_id`` column.
        schema: 'team' or 'player'; selects the destination table and the key column
            ('team_id' or 'playerId').
        project_name: BigQuery project holding the table.
        table_name: Table name template; 'schema' is replaced by ``schema``.

    Returns:
        The rows not yet present on BigQuery, or None (after printing a notice) when
        BigQuery already holds as many rows for these matches as ``dataframe`` has.

    Raises:
        AssertionError: ``dataframe`` is empty or ``schema`` is not 'team'/'player'.
    """
    assert dataframe.shape[0] != 0, "Empty dataframes not accepted"
    assert schema in ['team', 'player'], "Only team and player data accepted"

    resolved_table = table_name.replace('schema', schema)
    match_ids = list(dataframe.match_id.unique())

    if schema == 'team':
        column_key = 'team_id'
    else:
        column_key = 'playerId'

    quoted_ids = ", ".join(["'" + match_id + "'" for match_id in match_ids])
    query = (
        "\n"
        f"        select match_id, {column_key}\n"
        f"        from {project_name}.{resolved_table}\n"
        f"        where match_id in ({quoted_ids}) \n"
        "    "
    )

    rows_on_bq = execute_bq_query(query)

    # Same row count on both sides is treated as "everything is already uploaded".
    if rows_on_bq.shape[0] == dataframe.shape[0]:
        print(f"Data already loaded onto BQ for Match Id: {match_ids} Schema: {schema}")
        return

    # Anti-join: keep only the rows that have no (match_id, key) partner on BigQuery.
    merged = dataframe.merge(rows_on_bq,
                             on=['match_id', column_key],
                             how='left',
                             indicator=True)
    not_on_bq = merged['_merge'] == 'left_only'

    return merged[not_on_bq].drop(columns=['_merge'])

def get_processed_match_ids():
    """Return the distinct ``match_id`` values stored in the team match table on BigQuery."""
    return execute_bq_query(
        """
        select distinct match_id
        from prizepicksanalytics.soccer_simulations.team_match_data
    """
    )


def execute_bq_query(query: str) -> pd.DataFrame:
    """Run ``query`` against the 'prizepicksanalytics' BigQuery project and return the result.

    A new client is created for each call and is always closed, including when the
    query or the DataFrame conversion raises.

    Args:
        query: SQL text to execute.

    Returns:
        The query result as a DataFrame.
    """
    bigquery_client = bqc.Client(project='prizepicksanalytics')
    try:
        return bigquery_client.query(query).to_dataframe()
    finally:
        # Release the client even if the query fails.
        bigquery_client.close()



In [None]:
def process_game_data(match_id):
    """Download the 'matchstats' feed for one match and aggregate it.

    Returns:
        ``(team_data, player_data)`` as produced by ``_aggregate_team_data`` and
        ``_aggregate_player_data`` from the same feed response.
    """
    stats_payload = _access_statsperform_api(feed_name='matchstats',
                                             match_id=match_id)

    return _aggregate_team_data(stats_payload), _aggregate_player_data(stats_payload)


In [None]:
# method to aggregate team data

def _aggregate_team_data(match_stats):
    """Build one row of identifying columns plus team statistics per line-up entry.

    Args:
        match_stats: Decoded 'matchstats' feed response; reads 'matchInfo' and
            'liveData' -> 'lineUp'.

    Returns:
        DataFrame with one row per line-up entry: match/competition/team identifiers,
        every statistic listed in ``config.all_team_match_stats`` (statistics absent from
        the feed are filled with 0) and ``formationUsed``.
    """
    match_info = match_stats['matchInfo']

    # Tournament calendar, competition and contestant information shared by both rows.
    tourney_cal_id = match_info['tournamentCalendar']['id']
    tourney_cal_season = match_info['tournamentCalendar']['name']
    competition_id = match_info['competition']['id']
    competition_name = match_info['competition']['name']
    contestants = match_info['contestant']

    lineups = pd.DataFrame(match_stats['liveData']['lineUp'])

    team_rows = []

    for _, lineup in lineups.iterrows():
        # Statistic values are converted to numbers.
        stat_frame = pd.DataFrame(lineup.stat)
        stat_frame['value'] = pd.to_numeric(stat_frame.value)

        # Pick the contestant this line-up belongs to (falls back to the second entry).
        if contestants[0]['id'] == lineup.contestantId:
            contestant = contestants[0]
        else:
            contestant = contestants[1]

        # One wide row: statistic type -> value.
        wide_stats = stat_frame[['type', 'value']].set_index('type').T

        absent_stats = set(config.all_team_match_stats).difference(wide_stats.columns)
        if absent_stats:
            filler = pd.DataFrame(0, index=wide_stats.index, columns=list(absent_stats), dtype=float)
            wide_stats = pd.concat([wide_stats, filler], axis=1)

        identity = pd.DataFrame({
            'competition_id': [competition_id],
            'competition_name': [competition_name],
            'tourney_cal_id': [tourney_cal_id],
            'tourney_cal_name': [tourney_cal_season],
            'match_id' : [match_info['id']],
            'match_date': match_info['date'],
            'team_id' : [str(contestant['id'])],
            'team_name': [str(contestant['shortName'])],
            'home' : contestant['position'] == 'home',
        })

        ordered_stats = wide_stats[config.all_team_match_stats]
        team_row = pd.concat([identity.reset_index(drop=True), ordered_stats.reset_index(drop=True)], axis=1)
        team_row['formationUsed'] = str(lineup.formationUsed)

        team_rows.append(team_row)

    # Rows of all line-up entries stacked; remaining gaps become 0.
    return pd.concat(team_rows).fillna(0)

In [None]:
# method to aggregate player data

def _aggregate_player_data(match_request):
    """Build one row per player who has a 'minsPlayed' statistic, across all line-up entries.

    Args:
        match_request: Decoded 'matchstats' feed response; reads 'liveData' -> 'lineUp',
            'liveData' -> 'substitute' and 'matchInfo' -> 'id'.

    Returns:
        DataFrame with match_id, team_id, playerId, matchName, position, positionSide,
        isSub and every statistic in ``config.all_player_match_stats`` (statistics absent
        from the feed are filled with 0).
    """
    match_data = pd.DataFrame(match_request['liveData']['lineUp'])

    # Lookup: id of the player coming on -> the rest of that substitution record.
    # NOTE(review): this line fails if the response has no 'substitute' entry — confirm
    # whether the feed always includes it.
    subs = pd.DataFrame(match_request['liveData']['substitute']).set_index('playerOnId').to_dict('index')

    list_of_dfs = []

    for _, team in match_data.iterrows():
        players = pd.DataFrame(team['player'])

        # flag players whose listed position is 'Substitute'
        players['isSub'] = players.position.apply(lambda x: True if x == 'Substitute' else False)
        cols_to_keep = ['playerId', 'matchName', 'position', 'positionSide', 'isSub']

        # turn each player's statistics list into a {type: float(value)} dict, then into columns
        stats = players.stat.apply(lambda lisa: {x['type']:float(x['value']) for x in lisa})
        players = pd.concat([players[cols_to_keep].reset_index(drop=True), pd.json_normalize(stats).reset_index(drop=True)], axis=1)

        # keep only players that have a 'minsPlayed' value
        players = players[~players['minsPlayed'].isna()]

        # snapshot of the substitutes; `players` itself is modified inside the loop below
        subbed_players = players[players['isSub'] == True]

        # give each substitute the position info of the player they replaced
        # NOTE(review): raises KeyError if the substitute has no entry in `subs` and
        # IndexError if the replaced player is not in `players` — confirm these cannot occur.
        for i, row in subbed_players.iterrows():
            sub_info = subs[row.playerId]
            subbed_out_player = players[players['playerId'] == sub_info['playerOffId']].iloc[0]

            players.at[i, 'position'] =  subbed_out_player.position
            players.at[i, 'positionSide'] =  subbed_out_player.positionSide
            players.at[i, 'formationPlace'] =  subbed_out_player.formationPlace

        # add identifying columns at the front
        players.insert(0, 'match_id', match_request['matchInfo']['id'])
        players.insert(1, 'team_id', team.contestantId)
        config_cols = ['match_id', 'team_id'] + cols_to_keep

        # add every statistic listed in config.all_player_match_stats that the feed did not
        # provide, so the output always has the full column set
        stat_cols = players.drop(config_cols, axis=1).columns
        missing_cols = set(config.all_player_match_stats).difference(stat_cols)

        if missing_cols:
            missing_df = pd.DataFrame(0, index=players.index, columns=list(missing_cols), dtype=float)
            players = pd.concat([players, missing_df], axis=1)

        list_of_dfs.append(players[config_cols + config.all_player_match_stats])

    # stack the rows of all line-up entries; remaining gaps become 0
    return pd.concat(list_of_dfs, axis=0, ignore_index=True).fillna(0)
    

# feature engineering

## data loading

In [None]:
def get_all_team_data():
    """Return every row of the team match table via ``prem_loader.execute_bq_query``."""
    team_table_query = """
                select *
                from prizepicksanalytics.soccer_simulations.team_match_data
                """

    all_team_rows = prem_loader.execute_bq_query(team_table_query)
    return all_team_rows

def get_all_player_data():
    """Return every row of the player match table via ``prem_loader.execute_bq_query``."""
    player_table_query = """
                select *
                from prizepicksanalytics.soccer_simulations.player_match_data
                """

    all_player_rows = prem_loader.execute_bq_query(player_table_query)
    return all_player_rows

def get_historic_team_data(match):
    """Query the team match rows of both teams dated before ``match.localDate``.

    Args:
        match: Object exposing ``home_id``, ``away_id`` and ``localDate``.

    Returns:
        ``(home_rows, away_rows)`` as returned by ``prem_loader.execute_bq_query``.
    """
    def _history_query(team_id):
        # Same statement for either side; only the team id differs.
        return f"""
                select *
                from prizepicksanalytics.soccer_simulations.team_match_data
                where team_id = '{team_id}'
                and date(REPLACE(match_date, 'Z', '')) < date('{match.localDate}')
                """

    # Both statements are built before either one is executed.
    home_query = _history_query(match.home_id)
    away_query = _history_query(match.away_id)

    home_rows = prem_loader.execute_bq_query(home_query)
    away_rows = prem_loader.execute_bq_query(away_query)
    return home_rows, away_rows


# def get_historic_player_data(match,
#                              list_of_player_ids):
        
#         player_query = f"""
#                 select *
#                 from prizepicksanalytics.soccer_simulations.player_match_data
#                 where playerId in ({", ".join(["'" + x + "'" for x in list_of_player_ids])})
#                 and date(REPLACE(match_date, 'Z', '')) < date('{match.localDate}')
#                 """


#         return prem_loader.execute_bq_query(player_query)

        # return player_query

def get_squads(match):
    """Query the squad table rows for the home and the away team of ``match``.

    Args:
        match: Object exposing ``home_id`` and ``away_id``.

    Returns:
        ``(home_squad, away_squad)`` as returned by ``prem_loader.execute_bq_query``.
    """
    def _squad_query(contestant_id):
        # Same statement for either side; only the contestant id differs.
        return f"""
                select *
                from prizepicksanalytics.soccer_simulations.squad_data
                where contestantId = '{contestant_id}'
                """

    # Both statements are built before either one is executed.
    home_query = _squad_query(match.home_id)
    away_query = _squad_query(match.away_id)

    home_squad = prem_loader.execute_bq_query(home_query)
    away_squad = prem_loader.execute_bq_query(away_query)
    return home_squad, away_squad

In [None]:
# away_query = f"""
#                 select *
#                 from prizepicksanalytics.soccer_simulations.squad_data
#                 """


# squad_data = prem_loader.execute_bq_query(away_query)


# tourneys = get_statsperform_tourneys()
# team_data = get_all_team_data()
# team_data.to_csv('team_data.csv', index=False)
# player_data = get_all_player_data()
# player_data.to_csv('player_data.csv', index=False)
# ars_lfc_match = prem_loader.matches.iloc[290]

## agg methods

In [None]:
def _get_team_stats_vector(team_data,
                           opposite_team_data,
                           position):
    
    home = True if position == 'home' else False
    position_performances = team_data[team_data['home'] == home]

    if team_data.shape[0] == 0:
        avg_possession = np.nan
        avg_finalThirdEntries = np.nan
        avg_accurateBackZonePass = np.nan
        avg_accurateFwdZonePass = np.nan

        avg_possWonAtt3rd = np.nan
        avg_possWonMid3rd = np.nan
        avg_interception = np.nan
    else:
        avg_possession = team_data.possessionPercentage.mean()
        avg_finalThirdEntries = team_data.finalThirdEntries.mean()
        avg_accurateBackZonePass = team_data.accurateBackZonePass.mean()
        avg_accurateFwdZonePass = team_data.accurateFwdZonePass.mean()

        avg_possWonAtt3rd = team_data.possWonAtt3rd.mean()
        avg_possWonMid3rd = team_data.possWonMid3rd.mean()
        avg_interception = team_data.interception.mean()

    if position_performances.shape[0] == 0:
        ps_possession = np.nan
        ps_finalThirdEntries = np.nan
        ps_accurateBackZonePass = np.nan
        ps_accurateFwdZonePass = np.nan

        ps_possWonAtt3rd = np.nan
        ps_possWonMid3rd = np.nan
        ps_interception = np.nan
    else:
        ps_possession = position_performances.possessionPercentage.mean()
        ps_finalThirdEntries = position_performances.finalThirdEntries.mean()
        ps_accurateBackZonePass = position_performances.accurateBackZonePass.mean()
        ps_accurateFwdZonePass = position_performances.accurateFwdZonePass.mean()

        ps_possWonAtt3rd = position_performances.possWonAtt3rd.mean()
        ps_possWonMid3rd = position_performances.possWonMid3rd.mean()
        ps_interception = position_performances.interception.mean()
    
    
    return {
        'avg_possession': avg_possession,
        'avg_finalThirdEntries': avg_finalThirdEntries,
        'avg_accurateBackZonePass': avg_accurateBackZonePass,
        'avg_accurateFwdZonePass': avg_accurateFwdZonePass,
        'avg_possWonAtt3rd': avg_possWonAtt3rd,
        'avg_possWonMid3rd': avg_possWonMid3rd,
        'avg_interception': avg_interception,

        'ps_possession': ps_possession,
        'ps_finalThirdEntries': ps_finalThirdEntries,
        'ps_accurateBackZonePass': ps_accurateBackZonePass,
        'ps_accurateFwdZonePass': ps_accurateFwdZonePass,
        'ps_possWonAtt3rd': ps_possWonAtt3rd,
        'ps_possWonMid3rd': ps_possWonMid3rd,
        'ps_interception': ps_interception,
    }

def _get_player_stats(player_performances,
                      position):
    home = True if position == 'home' else False
    position_performances = player_performances[player_performances['home'] == home]

    if team_data.shape[0] == 0:
        avg_passAtt = np.nan
        avg_minsPlayed = np.nan
        # avg_finalThirdEntries = np.nan
        # avg_accurateBackZonePass = np.nan
        # avg_accurateFwdZonePass = np.nan

        # avg_possWonAtt3rd = np.nan
        # avg_possWonMid3rd = np.nan
        # avg_interception = np.nan
    else:
        avg_passAtt = player_performances.totalPass.mean()
        avg_minsPlayed = player_performances.minsPlayed.mean()
        # avg_finalThirdEntries = team_data.finalThirdEntries.mean()
        # avg_accurateBackZonePass = team_data.accurateBackZonePass.mean()
        # avg_accurateFwdZonePass = team_data.accurateFwdZonePass.mean()

        # avg_possWonAtt3rd = team_data.possWonAtt3rd.mean()
        # avg_possWonMid3rd = team_data.possWonMid3rd.mean()
        # avg_interception = team_data.interception.mean()

    if position_performances.shape[0] == 0:
        ps_passAtt = np.nan
        ps_minsPlayed = np.nan
        # ps_finalThirdEntries = np.nan
        # ps_accurateBackZonePass = np.nan
        # ps_accurateFwdZonePass = np.nan

        # ps_possWonAtt3rd = np.nan
        # ps_possWonMid3rd = np.nan
        # ps_interception = np.nan
    else:
        ps_passAtt = position_performances.totalPass.mean()
        ps_minsPlayed = position_performances.minsPlayed.mean()
        # ps_finalThirdEntries = position_performances.finalThirdEntries.mean()
        # ps_accurateBackZonePass = position_performances.accurateBackZonePass.mean()
        # ps_accurateFwdZonePass = position_performances.accurateFwdZonePass.mean()

        # ps_possWonAtt3rd = position_performances.possWonAtt3rd.mean()
        # ps_possWonMid3rd = position_performances.possWonMid3rd.mean()
        # ps_interception = position_performances.interception.mean()
    
    
    return {
        # 'player_id':player_performances.matchName.iloc[0],
        'avg_passes': avg_passAtt,
        'avg_minsPlayed': avg_minsPlayed,
        # 'avg_accurateBackZonePass': avg_accurateBackZonePass,
        # 'avg_accurateFwdZonePass': avg_accurateFwdZonePass,
        # 'avg_possWonAtt3rd': avg_possWonAtt3rd,
        # 'avg_possWonMid3rd': avg_possWonMid3rd,
        # 'avg_interception': avg_interception,

        'ps_passes': ps_passAtt,
        'ps_minsPlayed': ps_minsPlayed,
        # 'ps_accurateBackZonePass': ps_accurateBackZonePass,
        # 'ps_accurateFwdZonePass': ps_accurateFwdZonePass,
        # 'ps_possWonAtt3rd': ps_possWonAtt3rd,
        # 'ps_possWonMid3rd': ps_possWonMid3rd,
        # 'ps_interception': ps_interception,
    }

## get match vectors

In [None]:
import numpy as np

    
def get_match_vectors(match,
                      all_performances = None,
                      all_squads=None,
                      all_player_data=None,
                      training=True):
    """Build one feature dict per player who has a row for ``match`` in ``all_player_data``.

    Args:
        match: Object exposing ``id``, ``home_id``, ``away_id``, ``localDate`` and ``week``.
        all_performances: Team match rows ('team_id', 'match_date', 'home' and the
            statistic columns). When None, history is fetched with ``get_historic_team_data``.
        all_squads: Mapping team id -> squad frame with an 'id' column. When None, squads
            are fetched with ``get_squads``.
        all_player_data: Player match rows ('playerId', 'team_id', 'match_id',
            'match_date', 'matchName', 'position', 'home', 'totalPass', 'minsPlayed').
            Required: the players of the match are read from this frame.
        training: When truthy, each dict also gets 'target' = the player's ``totalPass``
            in this match; players without such a row are skipped.

    Returns:
        List of dicts combining player identifiers, the player's averages and the team's
        averages.

    Raises:
        ValueError: ``all_player_data`` is None.
    """
    if all_player_data is None:
        # This path previously failed with NameError (get_historic_player_data is commented
        # out) and could not have worked anyway, because the match line-up is read from
        # all_player_data further down.
        raise ValueError("all_player_data is required to build match vectors")

    def _dates_without_z(frame):
        # Remove the 'Z' from match_date strings before comparing with match.localDate.
        return frame['match_date'].apply(lambda x: x.replace('Z', ''))

    team_ids = {'home':match.home_id,
                'away':match.away_id}

    if all_performances is None:
        home_data, away_data = get_historic_team_data(match)
    else:
        # Computed once and shared by the home and away filters.
        team_played_before = _dates_without_z(all_performances) < match.localDate
        home_data = all_performances[(all_performances['team_id'] == team_ids['home']) & team_played_before]
        away_data = all_performances[(all_performances['team_id'] == team_ids['away']) & team_played_before]

    team_data = {
        'home':home_data,
        'away':away_data
    }

    if all_squads is None:
        home_squad, away_squad = get_squads(match)
    else:
        home_squad, away_squad = all_squads[team_ids['home']], all_squads[team_ids['away']]

    squad_data = {
        'home':home_squad,
        'away':away_squad
    }

    # Row masks over all_player_data that do not depend on the team; computed once.
    player_played_before = _dates_without_z(all_player_data) < match.localDate
    in_this_match = all_player_data['match_id'] == match.id

    final_feat_list = []

    for team in ['home', 'away']:
        opposite_position = 'away' if team == 'home' else 'home'

        team_stats_vector = _get_team_stats_vector(team_data[team],
                                                   team_data[opposite_position],
                                                   team)

        # Earlier performances of everyone in this team's squad.
        squad_player_ids = squad_data[team].id.unique()
        historical_player_data = all_player_data[all_player_data['playerId'].isin(squad_player_ids) &
                                                 player_played_before]

        # Players with a row for this team in this match.
        match_lineup = all_player_data[(all_player_data['team_id'] == team_ids[team]) & in_this_match]

        for _, player in match_lineup.iterrows():

            final_dict = {
                'player_id':player.playerId,
                'player_name':player.matchName,
                'player_position':player.position,
                'match_id':match.id,
                'match_date':match.localDate,
                'match_week':match.week
            }

            player_performances = historical_player_data[historical_player_data['playerId'] == player.playerId]
            final_dict.update(_get_player_stats(player_performances, team))
            final_dict.update(team_stats_vector)

            if training:
                target_rows = all_player_data[in_this_match & (all_player_data['playerId'] == player.playerId)]
                if target_rows.empty:
                    # No label available for this player in this match.
                    continue
                final_dict['target'] = target_rows.iloc[0].totalPass

            final_feat_list.append(final_dict)

    return final_feat_list

## feature generation

In [None]:
import pickle
# Load the cached team rows and order them by match date.
team_data = pd.read_csv('team_data.csv')
team_data = team_data.sort_values(['match_date'], ascending=True).reset_index(drop=True)

# Load the cached player rows in a deterministic order.
player_data = pd.read_csv('player_data.csv')
player_data = player_data.sort_values(['match_date', 'match_id', 'team_id', 'playerId'], ascending=True).reset_index(drop=True)

# NOTE(review): pickle.load executes arbitrary code from the file; only load a
# squads.pickle that was produced locally.
with open('squads.pickle', 'rb') as fila:
    all_squads = pickle.load(fila)



list_of_all_vectors = []

# NOTE(review): `prem_loader` is only created in a commented-out line near the top of the
# notebook; on a fresh kernel this loop raises NameError.
for i, match in prem_loader.matches.iterrows():

    vectors = get_match_vectors(match,
                    all_performances=team_data,
                    all_squads=all_squads,
                    all_player_data=player_data,
                    training=True)
                    

    list_of_all_vectors.extend(vectors)

# One row per feature dict; written with the index as the first CSV column.
feature_list = pd.DataFrame(list_of_all_vectors)
feature_list.to_csv('./v1features.csv')

# modeling

In [None]:
# Load the engineered features, drop rows with any missing value and order by match date.
features = pd.read_csv('v1features.csv', index_col=0)
features = features.dropna()
features = features.sort_values('match_date', ascending=True)

# Split by match week: the first 80% of the distinct weeks (in order of first appearance
# in the date-sorted frame) are used for training, the rest for testing.
weeks = features.match_week.unique()
test_split = int(len(weeks) * 0.8)

train_weeks = weeks[:test_split]
test_weeks = weeks[test_split:]

# .copy() gives train/test their own data: a later cell assigns new columns to `test`,
# which on a plain slice of `features` triggers SettingWithCopyWarning.
train = features[features['match_week'].isin(train_weeks)].copy()
test = features[features['match_week'].isin(test_weeks)].copy()


# Identifier columns are excluded from the model inputs.
config_cols = ['player_name', 'match_id', 'match_date', 'match_week', 'player_id']
feats = train.drop(columns=config_cols + ['target']).columns
target = ['target']

## light gbm

In [None]:
# from models.lightgbm.lgb import LGBMOrdinal

# model = LGBMOrdinal(num_leaves=24, max_depth=2, n_estimators=1500, learning_rate=0.001)

# model.fit(train[feats], train[target])
# probs = model.predict_proba(test[feats])

## alexboost

In [None]:
from alexboost import Normal

In [None]:
# NOTE(review): `Normal` comes from the project-local/third-party `alexboost` package;
# the meaning of num_trees / num_samples_per_bucket is not visible here — see that package.
norm = Normal(
        num_trees=50, 
        num_samples_per_bucket=30,
    )


# train[target] is a one-column frame; `.target` selects that column as a Series.
norm.fit(train[feats], train[target].target)

In [None]:
# NOTE(review): assumes norm.predict returns two columns per row, in the order
# (mean, std) — TODO confirm against alexboost.
preds = norm.predict(test[feats])
test[['exp_mean', 'exp_std']] = preds

In [None]:
import seaborn as sns # Often imported as sns

# test.exp_std.plot()
# Predicted mean against predicted standard deviation for the test rows.
# The original call plotted exp_std against itself and passed hue='', which is not a
# column of `test`.
sns.scatterplot(
    data=test,
    x='exp_mean',
    y='exp_std',
)

In [None]:
# The bare expression on the next line is not the last one in the cell, so it is not displayed.
norm.regressor.feature_importances_
# Feature importances, largest first, rounded to 3 decimals.
pd.DataFrame({"feature": train[feats].columns, "importance": norm.regressor.feature_importances_}).sort_values(by="importance", ascending=False).round(3)

In [None]:
import matplotlib.pyplot as plt
# Histogram of the training target values.
train[target].target.hist()

In [None]:
# Display the training rows ordered by target, smallest first (result is not stored).
train.sort_values('target', ascending=True)