In [None]:
import pandas as pd
import numpy as np
from scipy import sparse
from scipy.sparse.linalg import eigs

# PageRank parameters
DAMPING_FACTOR = 0.95  # Probability of following a loss edge rather than teleporting
                       # (the commonly cited PageRank default is 0.85; this model uses 0.95)
HOME_ADVANTAGE = 70    # Home-court advantage; half of it is subtracted from the home score
                       # NOTE(review): 70 looks like a rating-scale value, yet it is applied
                       # to raw game scores (35 points off the home side) -- confirm the unit.
MARGIN_WEIGHT = 0.7    # Exponent applied to the margin of victory (diminishing returns)
BASE_RATING = 1500     # Rating assigned to the lowest PageRank score (similar to Elo)


def _build_loss_graph(games, team_indices):
    """Build the weighted loser -> winner graph and count games per team.

    Args:
        games: DataFrame with one row per team per game and the columns
            'game_id', 'team', 'team_score' and 'home_away_NS'.
        team_indices: Mapping from team name to its row/column index.

    Returns:
        Tuple ``(adjacency, games_played)`` where ``adjacency[loser, winner]``
        is a CSR matrix holding the summed edge weights and ``games_played``
        is a float array with the number of counted games per team.
    """
    n_teams = len(team_indices)
    games_played = np.zeros(n_teams)
    losers, winners, weights = [], [], []

    for _, game_group in games.groupby('game_id'):
        if len(game_group) != 2:
            continue  # A usable game has exactly one row per side

        row_a, row_b = game_group.iloc[0], game_group.iloc[1]
        score_a, score_b = row_a['team_score'], row_b['team_score']
        if pd.isna(score_a) or pd.isna(score_b):
            continue  # A missing score would turn the whole matrix into NaN

        idx_a = team_indices[row_a['team']]
        idx_b = team_indices[row_b['team']]
        games_played[idx_a] += 1
        games_played[idx_b] += 1

        # Discount the score of whichever side is flagged as playing at home
        if row_a['home_away_NS'] == 1:
            score_a = score_a - HOME_ADVANTAGE / 2
        if row_b['home_away_NS'] == 1:
            score_b = score_b - HOME_ADVANTAGE / 2

        margin = abs(score_a - score_b)
        if margin == 0:
            continue  # An adjusted tie carries no information (its weight would be 0)

        if score_a > score_b:
            winner, loser = idx_a, idx_b
        else:
            winner, loser = idx_b, idx_a

        losers.append(loser)
        winners.append(winner)
        # Diminishing returns for blowouts
        weights.append(margin ** MARGIN_WEIGHT)

    # Duplicate (loser, winner) pairs are summed when COO is converted to CSR
    adjacency = sparse.coo_matrix(
        (
            np.asarray(weights, dtype=np.float64),
            (np.asarray(losers, dtype=np.intp), np.asarray(winners, dtype=np.intp)),
        ),
        shape=(n_teams, n_teams),
    ).tocsr()
    return adjacency, games_played


def _pagerank_scores(adjacency, damping, tol=1e-12, max_iter=1000):
    """Return the PageRank vector of a weighted loser -> winner graph.

    Each row of ``adjacency`` is normalised so that a team distributes its
    score among the teams that beat it.  Teams without any outgoing weight
    (no recorded losses, or no games at all) spread their score uniformly,
    which keeps the random walk stochastic.  The stationary distribution is
    found by power iteration, so the dense Google matrix is never built.

    Args:
        adjacency: Square sparse matrix with ``adjacency[loser, winner]`` weights.
        damping: Probability of following an edge instead of teleporting.
        tol: L1 change between iterations below which iteration stops.
        max_iter: Upper bound on the number of iterations.

    Returns:
        1-D array of non-negative scores that sums to 1.
    """
    n_teams = adjacency.shape[0]
    out_weight = np.asarray(adjacency.sum(axis=1)).ravel()
    no_losses = out_weight == 0

    inverse_out = np.zeros(n_teams)
    inverse_out[~no_losses] = 1.0 / out_weight[~no_losses]
    # Transposed once so every iteration is a single sparse mat-vec product
    transition_t = (sparse.diags(inverse_out) @ adjacency).T.tocsr()

    rank = np.full(n_teams, 1.0 / n_teams)
    for _ in range(max_iter):
        dangling_mass = rank[no_losses].sum()
        updated = (damping * (transition_t @ rank + dangling_mass / n_teams)
                   + (1.0 - damping) / n_teams)
        converged = np.abs(updated - rank).sum() < tol
        rank = updated
        if converged:
            break

    return rank / rank.sum()


def pagerank_basketball_rankings(games_file='games_2022.csv', 
                                teams_regions_file='Team Region Groups.csv',
                                east_games_file='East Regional Games to predict.csv'):
    """Rank basketball teams with PageRank on a loser -> winner graph.

    Every valid game adds a directed edge from the loser to the winner,
    weighted by ``margin ** MARGIN_WEIGHT`` after the home side's score has
    been reduced by ``HOME_ADVANTAGE / 2``.  Scores of teams with fewer than
    five counted games are pulled linearly towards the mean score.

    Args:
        games_file: CSV with one row per team per game; must provide the
            columns 'game_id', 'team', 'team_score' and 'home_away_NS'.
        teams_regions_file: CSV with at least a 'team' column (and the
            'region' column used for reporting).
        east_games_file: CSV with 'team_home' and 'team_away' columns; teams
            found there that are missing from ``teams_regions_file`` are
            added with region 'East'.

    Returns:
        DataFrame sorted by descending 'Rating' with the columns 'Team',
        'PageRank', 'Rating' (``BASE_RATING`` for the lowest score up to
        ``BASE_RATING + 500`` for the highest) and every column of the
        regions table except 'team'.
    """
    # Load data
    games = pd.read_csv(games_file)
    teams_regions = pd.read_csv(teams_regions_file)

    # Teams that only appear in the East schedule are labelled 'East';
    # an existing region entry wins because the first duplicate is kept.
    east_games = pd.read_csv(east_games_file)
    east_names = pd.concat([east_games['team_home'], east_games['team_away']]).unique()
    east_teams = pd.DataFrame({'team': east_names, 'region': 'East'})
    teams_regions = pd.concat([teams_regions, east_teams]).drop_duplicates('team')

    # Sorted so that indices (and therefore results) are reproducible across runs
    all_teams = set(games['team'].unique()).union(teams_regions['team'])
    teams_list = sorted(all_teams, key=str)
    team_indices = {team: i for i, team in enumerate(teams_list)}

    adjacency, games_played = _build_loss_graph(games, team_indices)
    pagerank = _pagerank_scores(adjacency, DAMPING_FACTOR)

    # Regression to the mean for teams with few games: a team with no games
    # gets exactly the mean, a team with `min_games` or more is left untouched.
    min_games = 5
    shrink = np.clip((min_games - games_played) / min_games, 0.0, 1.0)
    pagerank = pagerank * (1 - shrink) + np.mean(pagerank) * shrink

    # Rescale to [BASE_RATING, BASE_RATING + 500]; if every score is equal
    # there is no spread to scale, so everybody gets the base rating.
    lowest = np.min(pagerank)
    spread = np.max(pagerank) - lowest
    if spread > 0:
        rating = BASE_RATING + 500 * (pagerank - lowest) / spread
    else:
        rating = np.full(len(teams_list), float(BASE_RATING))

    rankings = pd.DataFrame({
        'Team': teams_list,
        'PageRank': pagerank,
        'Rating': rating,
    })

    # Add region information
    rankings = pd.merge(rankings, teams_regions, left_on='Team', right_on='team', how='left')
    rankings = rankings.drop(columns=['team'])

    # Stable sort keeps the alphabetical order among equally rated teams
    rankings = rankings.sort_values('Rating', ascending=False, kind='stable')
    return rankings.reset_index(drop=True)

def predict_east_games(rankings, east_games_file='East Regional Games to predict.csv'):
    """Predict the home team's win probability for each East region game.

    The rating difference (home minus away) is adjusted for rest days, the
    away team's travel distance and the home-court flag, then converted to a
    probability with the Elo logistic curve ``1 / (1 + 10 ** (-diff / 400))``.

    Args:
        rankings: DataFrame with 'Team' and 'Rating' columns.  Teams that are
            not listed fall back to ``BASE_RATING``.
        east_games_file: CSV with the columns 'game_id', 'team_home',
            'team_away', 'rest_days_Home', 'rest_days_Away',
            'travel_dist_Away' and 'home_away_NS'.

    Returns:
        DataFrame with the columns 'game_id', 'team_home', 'team_away' and
        'WINNING %' (home win probability, rounded to 4 decimals and clipped
        to the range [0.01, 0.99]).
    """
    east_games = pd.read_csv(east_games_file)

    # Rating-scale adjustments
    rest_points_per_day = 7.2        # per extra rest day of the home team
    travel_points_per_300 = 2.8      # per 300 distance units travelled by the away team
    home_court_points = 70           # per unit of the 'home_away_NS' flag (0 = neutral site)

    # Build the lookup once instead of scanning `rankings` for every game;
    # the first row wins if a team is listed more than once.
    listed = rankings.drop_duplicates('Team')
    rating_by_team = dict(zip(listed['Team'], listed['Rating']))

    def rating_of(team):
        """Return the team's rating, or the base rating for unknown teams."""
        try:
            return rating_by_team[team]
        except KeyError:
            return BASE_RATING

    home_rating = east_games['team_home'].map(rating_of).astype(float)
    away_rating = east_games['team_away'].map(rating_of).astype(float)

    rest_advantage = (east_games['rest_days_Home'] - east_games['rest_days_Away']) * rest_points_per_day
    # Travel tires the away team, so a longer trip must widen the gap in the
    # home team's favour (a positive term in the home-minus-away difference).
    travel_advantage = (east_games['travel_dist_Away'] / 300) * travel_points_per_300
    # Multiplying by the flag already yields 0 for neutral-site games
    home_court_adj = home_court_points * east_games['home_away_NS']

    rating_diff = (home_rating - away_rating) + rest_advantage + travel_advantage + home_court_adj

    # Convert to probability using the logistic function (same as Elo)
    win_prob = 1 / (1 + np.power(10.0, -rating_diff / 400))

    east_games['WINNING %'] = win_prob.round(4).clip(0.01, 0.99)  # Valid range

    return east_games[['game_id', 'team_home', 'team_away', 'WINNING %']]

# Script entry point: build the rankings, report them per region, then
# predict the East games; both results are also written to CSV.
if __name__ == "__main__":
    # Phase 1a: team rankings
    print("Computing PageRank basketball rankings...")
    rankings = pagerank_basketball_rankings()

    # `rankings` is already sorted by rating, so the position inside each
    # regional slice is the team's regional rank.
    for region in ('North', 'South', 'West', 'East'):
        in_region = rankings['region'] == region
        region_ranks = rankings.loc[in_region].copy()
        region_ranks['Rank'] = range(1, len(region_ranks) + 1)
        # Whole-number ratings are easier to read in the printed table
        region_ranks['Rating'] = region_ranks['Rating'].round(0).astype(int)

        print(f"\n{region} Region Rankings (top 16):")
        top_sixteen = region_ranks[['Team', 'Rating', 'Rank']].head(16)
        print(top_sixteen)

    # Persist the full (unrounded) rankings
    rankings.to_csv('pagerank_rankings.csv', index=False)

    # Phase 1b: East region predictions
    print("\nGenerating East region game predictions...")
    predictions = predict_east_games(rankings)

    print("\nEast Region Predictions:")
    print(predictions)

    # Persist the predictions
    predictions.to_csv('pagerank_predictions.csv', index=False)


Computing PageRank basketball rankings...


  self._set_arrayXarray(i, j, x)



North Region Rankings (top 16):
                            Team  Rating  Rank
40              miami_hurricanes    1940     1
52       florida_state_seminoles    1933     2
63   georgia_tech_yellow_jackets    1919     3
87      south_carolina_gamecocks    1892     4
89         georgia_lady_bulldogs    1890     5
96     tennessee_lady_volunteers    1886     6
100                  ucf_knights    1884     7
105               florida_gators    1882     8
120          south_florida_bulls    1875     9
132              ole_miss_rebels    1868    10
136            tulane_green_wave    1862    11
141            davidson_wildcats    1857    12
144              houston_cougars    1854    13
150         alabama_crimson_tide    1852    14
155          arkansas_razorbacks    1848    15
163                   lsu_tigers    1832    16

South Region Rankings (top 16):
                          Team  Rating  Rank
18            indiana_hoosiers    1954     1
28          depaul_blue_demons    1949     2
