# Scout AI

Where talent knows no borders, and scouting knows no limits

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [196]:
#These files are hosted on our project github repo, they are datasets that we scraped from basketball-realgm.com
#they contain 23 average player statistics across the 2023-2024 NBA season split by position (Center, Power forward, Point Guard, Small forward, Shooting Guard)
#basketball-realgm.com has been collecting and providing basketball data for more than 20 years now.
centerURL = 'https://raw.githubusercontent.com/ryanzambrano/scout-ai/main/model/c.csv'
pfURL = 'https://raw.githubusercontent.com/ryanzambrano/scout-ai/main/model/pf.csv'
pgURL = 'https://raw.githubusercontent.com/ryanzambrano/scout-ai/main/model/pg.csv'
sfURL = 'https://raw.githubusercontent.com/ryanzambrano/scout-ai/main/model/sf.csv'
sgURL = 'https://raw.githubusercontent.com/ryanzambrano/scout-ai/main/model/sg.csv'
# remove nbaStatsURL = 'https://raw.githubusercontent.com/ryanzambrano/scout-ai/main/model/nbaStats.csv'

#These datasets where scraped from the NBA 2k position ratings page, we found that 2k rankings can have a direct impact on a players career and are constantly
#revised and scrutinized because there is so much at stake for a player who is unfairly ranked and as such would provide a very sound ordering of our target variable.
twokC_URL = 'https://raw.githubusercontent.com/ryanzambrano/scout-ai/main/model/2kC.csv'
twokPF_URL = 'https://raw.githubusercontent.com/ryanzambrano/scout-ai/main/model/2kPF.csv'
twokSG_URL = 'https://raw.githubusercontent.com/ryanzambrano/scout-ai/main/model/2kSG.csv'
twokSF_URL = 'https://raw.githubusercontent.com/ryanzambrano/scout-ai/main/model/2kSF.csv'
twokPG_URL = 'https://raw.githubusercontent.com/ryanzambrano/scout-ai/main/model/2kPG.csv'

#this dataset was also from basketball-realgm.com bus is a compilation of international players across the 29 largest basketball leagues across South America, Europe, Asia
internationalURL = 'https://raw.githubusercontent.com/ryanzambrano/scout-ai/main/model/international_players_copy.csv'

In [None]:
#Dataframes make working with datasets much easier, we will be using pandas to load are csv files into the colab.
CenterDF = pd.read_csv(centerURL)
PowerForwardDF = pd.read_csv(pfURL)
ShootingGuardDF = pd.read_csv(sgURL)
PointGuardDF = pd.read_csv(pgURL)
SmallForwardDF = pd.read_csv(sfURL)
#Printing one of the dataframes shows that this dataframe is encoding our csv correctly and we can proceed to digging into the data
print(CenterDF)

     #                Player Team  GP   MPG   PPG   FGM   FGA    FG%  3PM  \
0    1           Joel Embiid  PHI  34  34.0  35.3  11.8  22.2  0.533  1.2   
1    2          Nikola Jokic  DEN  50  33.9  26.3  10.3  17.6  0.584  1.1   
2    3         Anthony Davis  LAL  49  36.0  24.8   9.5  17.2  0.553  0.3   
3    4        Alperen Sengun  HOU  51  32.4  21.3   8.5  15.7  0.541  0.5   
4    5     Victor Wembanyama  SAS  46  28.4  20.3   7.5  16.1  0.464  1.5   
5    6    Kristaps Porzingis  BOS  38  29.8  20.2   6.8  12.9  0.528  1.8   
6    7           Bam Adebayo  MIA  42  34.5  20.2   7.6  15.0  0.509  0.0   
7    8      Domantas Sabonis  SAC  50  35.6  19.9   8.0  12.9  0.620  0.5   
8    9        Nikola Vucevic  CHI  47  34.2  17.3   7.4  15.6  0.475  1.1   
9   10          Myles Turner  IND  50  27.3  17.0   6.2  12.0  0.515  1.4   
10  11         Chet Holmgren  OKC  51  30.2  16.9   6.4  12.0  0.535  1.7   
11  12         Jarrett Allen  CLE  45  30.5  15.4   6.3   9.8  0.641  0.0   

In [197]:
#same as above except for the 2k dataset
twokC_df = pd.read_csv(twokC_URL)
twokPF_df = pd.read_csv(twokPF_URL)
twokSG_df = pd.read_csv(twokSG_URL)
twokSF_df = pd.read_csv(twokSF_URL)
twokPG_df = pd.read_csv(twokPG_URL)

In [198]:
twokPG_df = twokPG_df.drop(columns = ['Unnamed: 5', 'Unnamed: 6'])

In [199]:
#Parse the player column of the 2K dataset to get rid of any unnecesary information about the player and only keep their name
twokC_df['Player'] = twokC_df['Player'].apply(lambda x: ' '.join(x.split(' ', 2)[:2]))
twokPF_df['Player'] = twokPF_df['Player'].apply(lambda x: ' '.join(x.split(' ', 2)[:2]))
twokSG_df['Player'] = twokSG_df['Player'].apply(lambda x: ' '.join(x.split(' ', 2)[:2]))
twokSF_df['Player'] = twokSF_df['Player'].apply(lambda x: ' '.join(x.split(' ', 2)[:2]))
twokPG_df['Player'] = twokPG_df['Player'].apply(lambda x: ' '.join(x.split(' ', 2)[:2]))

In [200]:
#starting with the center position, lets merge our ranking dataset with our statistics dataset, drop columns that arent encoding anything useful
#as well as any statistics from our 2k dataset and see if we can find any correlation.
merged_Cdf = pd.merge(twokC_df, CenterDF, on='Player', how='inner')
merged_Cdf = merged_Cdf.drop(columns=['Player', 'OVR', '3PT', 'DNK', '#_y', 'Team'])

#define our features by dropping the target variable and saving to another dataframe
features = merged_Cdf.drop(columns=['#_x'])
#similiarly our target is just our first column
target = merged_Cdf['#_x']

# Normalize the features so that units of features are not skewing any correlation we find
#the min max scaler will use the minimum and maximum values in a column to scale a value respective to all the other values in the dataset
#this provides a more even comparison. MinMax scalers can be sensitive to outliers but in a game like basketball where there are people actively
#trying to stop a player from scoring, a player with exceptional stats deserves to be celebrated.
scaler = MinMaxScaler()
features_normalized = scaler.fit_transform(features)
features_df = pd.DataFrame(features_normalized, columns=features.columns)

#correlate all of our features with rank for this position using pandas.corrwith()
rank_correlations = features_df.corrwith(target, method='pearson')

top_N = 5 # Set the number of top features

#this might not be immediately clear but we are trying to find an inverse relationship since we want to minimize rank since a better rank is a smaller number
#therefore a strong negative correlation is what we want and can be found using nsmallest()
top_N_correlations_C = rank_correlations.nsmallest(top_N)

print(top_N_correlations_C)



MPG   -0.913178
DRB   -0.897894
RPG   -0.873599
FGM   -0.862442
PPG   -0.830803
dtype: float64


In [201]:
#Since the correlation was very strong, lets test for significance
from scipy.stats import pearsonr

# Calculate Pearson correlation coefficient and p-value for each feature
for feature in top_N_correlations_C.index:
    corr_coef, p_value = pearsonr(features_df[feature], target)
    print(f"Feature: {feature}")
    print(f"Pearson correlation coefficient: {corr_coef}")
    print(f"P-value: {p_value}")
    print("")

    # Determine significance based on p-value
    if p_value < 0.05:
        print("Correlation coefficient is statistically significant.")
    else:
        print("Correlation coefficient is not statistically significant.")


Feature: MPG
Pearson correlation coefficient: -0.913177983167163
P-value: 8.67949954005584e-17

Correlation coefficient is statistically significant.
Feature: DRB
Pearson correlation coefficient: -0.8978943418579517
P-value: 1.7809963835041507e-15

Correlation coefficient is statistically significant.
Feature: RPG
Pearson correlation coefficient: -0.8735987617909234
P-value: 9.129065716640291e-14

Correlation coefficient is statistically significant.
Feature: FGM
Pearson correlation coefficient: -0.8624419657900717
P-value: 4.279621899787748e-13

Correlation coefficient is statistically significant.
Feature: PPG
Pearson correlation coefficient: -0.830802528717401
P-value: 1.7975438269515243e-11

Correlation coefficient is statistically significant.


In [202]:
#Just for exploratory purposes lets see if we can fit a model onto this dataset to predict rank and try to find coefficients from it.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_df, target, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "ElasticNet Regression": ElasticNet(),
    "Support Vector Regression": SVR(kernel='linear'),
}


# Train and evaluate models
coefficients = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    coefficients[name] = model.coef_
    print(f"{name} MSE: {mse}")

# Print coefficients
for name, coef in coefficients.items():
    print(f"\n{name} Coefficients:")
    for feature, weight in zip(features_df.columns, coef):
        print(f"{feature}: {weight}")


Linear Regression MSE: 229.78471886875818
Ridge Regression MSE: 143.07622108783332
Lasso Regression MSE: 186.3819516285063
ElasticNet Regression MSE: 388.681735328133
Support Vector Regression MSE: 283.2395219493556

Linear Regression Coefficients:
GP: 0.7782479733744242
MPG: 30.970970941455096
PPG: -1313.7014986359743
FGM: 818.9702145451337
FGA: 16.432410669581557
FG%: -31.099773332342394
3PM: 91.44483207592538
3PA: -56.53737209102804
3P%: 9.687738568305493
FTM: 549.0565684363343
FTA: -125.39383019988377
FT%: -19.664747326530517
ORB: -37.459588711943965
DRB: -96.83996164182967
RPG: 58.59959187823102
APG: 5.831091152047785
SPG: -11.199313500596585
BPG: -5.532199681429039
TOV: 26.250688970527886
PF: -16.02000160259216

Ridge Regression Coefficients:
GP: -7.3826363133482555
MPG: -8.412274194469108
PPG: -5.532179044437101
FGM: -7.513967226712218
FGA: -6.657219324412943
FG%: -4.342576319577295
3PM: -4.493708326675359
3PA: -4.872592603497939
3P%: 5.575556406596916
FTM: -0.7013921611186151
F

I noticed that the mean squared error is very high for all the models indicating that the models are a bad fit for the data, since the linear correlation worked well with finding important features I will repeat finding correlations for all positions.

In [203]:
#Find correlation coefficients for point forward position
merged_PFdf = pd.merge(twokPF_df, PowerForwardDF, on='Player', how='inner')
merged_PFdf = merged_PFdf.drop(columns=['Player', 'OVR', '3PT', 'DNK', '#_y', 'Team'])

features = merged_PFdf.drop(columns=['#_x'])
target = merged_PFdf['#_x']
# Normalize the features
scaler = MinMaxScaler()
features_normalized = scaler.fit_transform(features)
features_df = pd.DataFrame(features_normalized, columns=features.columns)

rank_correlations = features.corrwith(target, method='pearson')
top_N = 23  # Set the number of top features
top_N_correlations_PF = rank_correlations.nsmallest(top_N)
print(top_N_correlations_PF)


FGM   -0.880608
MPG   -0.874855
PPG   -0.872140
FGA   -0.856958
TOV   -0.804991
FTM   -0.800538
FTA   -0.776557
APG   -0.748916
FG%   -0.709752
RPG   -0.705970
DRB   -0.683017
PF    -0.593583
SPG   -0.564934
ORB   -0.496339
GP    -0.420148
BPG   -0.405728
3PA   -0.256632
3PM   -0.231898
FT%   -0.178418
3P%   -0.049081
dtype: float64


In [204]:
#Since the correlation was very strong, lets test for significance
from scipy.stats import pearsonr

# Calculate Pearson correlation coefficient and p-value for each feature
for feature in top_N_correlations_PF.index:
    corr_coef, p_value = pearsonr(features_df[feature], target)
    print(f"Feature: {feature}")
    print(f"Pearson correlation coefficient: {corr_coef}")
    print(f"P-value: {p_value}")
    print("")

    # Determine significance based on p-value
    if p_value < 0.05:
        print("Correlation coefficient is statistically significant.")
    else:
        print("Correlation coefficient is not statistically significant.")


Feature: FGM
Pearson correlation coefficient: -0.880608448369269
P-value: 2.9721729758165445e-09

Correlation coefficient is statistically significant.
Feature: MPG
Pearson correlation coefficient: -0.8748548829599283
P-value: 5.0695056976106595e-09

Correlation coefficient is statistically significant.
Feature: PPG
Pearson correlation coefficient: -0.87213958604261
P-value: 6.463974539734508e-09

Correlation coefficient is statistically significant.
Feature: FGA
Pearson correlation coefficient: -0.8569582535596498
P-value: 2.2891356629380755e-08

Correlation coefficient is statistically significant.
Feature: TOV
Pearson correlation coefficient: -0.8049905344376364
P-value: 7.099283251206659e-07

Correlation coefficient is statistically significant.
Feature: FTM
Pearson correlation coefficient: -0.8005377393372007
P-value: 9.081314698469948e-07

Correlation coefficient is statistically significant.
Feature: FTA
Pearson correlation coefficient: -0.7765572508375693
P-value: 3.10216995658

In [205]:
#Find correlation for Shooting Guard
merged_SGdf = pd.merge(twokSG_df, ShootingGuardDF, on='Player', how='inner')
merged_SGdf = merged_SGdf.drop(columns=['Player', 'OVR', '3PT', 'DNK', '#_y', 'Team'])

features = merged_SGdf.drop(columns=['#_x'])
target = merged_SGdf['#_x']
# Normalize the features
scaler = MinMaxScaler()
features_normalized = scaler.fit_transform(features)
features_df = pd.DataFrame(features_normalized, columns=features.columns)

rank_correlations = features.corrwith(target, method='pearson')
top_N = 23  # Set the number of top features
top_N_correlations_SG = rank_correlations.nsmallest(top_N)
print(top_N_correlations_SG)

PPG   -0.889287
FGM   -0.875528
FGA   -0.864955
APG   -0.787698
FTM   -0.784775
FTA   -0.779293
MPG   -0.775710
TOV   -0.764127
3PA   -0.758201
DRB   -0.712366
RPG   -0.712079
3PM   -0.685848
SPG   -0.559554
ORB   -0.425797
FG%   -0.409400
PF    -0.367934
BPG   -0.325789
FT%   -0.300605
GP    -0.112561
3P%   -0.078712
dtype: float64


In [206]:
#Since the correlation was very strong, lets test for significance
from scipy.stats import pearsonr

# Calculate Pearson correlation coefficient and p-value for each feature
for feature in top_N_correlations_SG.index:
    corr_coef, p_value = pearsonr(features_df[feature], target)
    print(f"Feature: {feature}")
    print(f"Pearson correlation coefficient: {corr_coef}")
    print(f"P-value: {p_value}")
    print("")

    # Determine significance based on p-value
    if p_value < 0.05:
        print("Correlation coefficient is statistically significant.")
    else:
        print("Correlation coefficient is not statistically significant.")


Feature: PPG
Pearson correlation coefficient: -0.8892871788422254
P-value: 5.112473639859426e-11

Correlation coefficient is statistically significant.
Feature: FGM
Pearson correlation coefficient: -0.8755282655761659
P-value: 2.413378883134695e-10

Correlation coefficient is statistically significant.
Feature: FGA
Pearson correlation coefficient: -0.8649546634503728
P-value: 7.060061826619139e-10

Correlation coefficient is statistically significant.
Feature: APG
Pearson correlation coefficient: -0.787698069003309
P-value: 2.3940202742129913e-07

Correlation coefficient is statistically significant.
Feature: FTM
Pearson correlation coefficient: -0.7847750961196669
P-value: 2.842978757586085e-07

Correlation coefficient is statistically significant.
Feature: FTA
Pearson correlation coefficient: -0.7792926473360531
P-value: 3.8971701046987197e-07

Correlation coefficient is statistically significant.
Feature: MPG
Pearson correlation coefficient: -0.7757099529072429
P-value: 4.7664966926

In [207]:
#Find correlation coefficients for Small Forward
merged_SFdf = pd.merge(twokSF_df, SmallForwardDF, on='Player', how='inner')
merged_SFdf = merged_SFdf.drop(columns=['Player', 'OVR', '3PT', 'DNK', '#_y', 'Team'])

features = merged_SFdf.drop(columns=['#_x'])
target = merged_SFdf['#_x']
# Normalize the features
scaler = MinMaxScaler()
features_normalized = scaler.fit_transform(features)
features_df = pd.DataFrame(features_normalized, columns=features.columns)

rank_correlations = features.corrwith(target, method='pearson')

top_N = 23 # Set the number of top features
top_N_correlations_SF = rank_correlations.nsmallest(top_N)
print(top_N_correlations_SF)

FGM   -0.884041
FGA   -0.879003
PPG   -0.878281
MPG   -0.870286
TOV   -0.856269
RPG   -0.741996
APG   -0.729655
FTM   -0.723060
FTA   -0.719796
DRB   -0.710230
PF    -0.548747
SPG   -0.518699
ORB   -0.469617
FG%   -0.372310
3PA   -0.341241
FT%   -0.329629
3PM   -0.291282
BPG   -0.278433
GP     0.006407
3P%    0.034640
dtype: float64


In [208]:
from scipy.stats import pearsonr

# Calculate Pearson correlation coefficient and p-value for each feature
for feature in top_N_correlations_SF.index:
    corr_coef, p_value = pearsonr(features_df[feature], target)
    print(f"Feature: {feature}")
    print(f"Pearson correlation coefficient: {corr_coef}")
    print(f"P-value: {p_value}")
    print("")

    # Determine significance based on p-value
    if p_value < 0.05:
        print("Correlation coefficient is statistically significant.")
    else:
        print("Correlation coefficient is not statistically significant.")


Feature: FGM
Pearson correlation coefficient: -0.8840410150030639
P-value: 9.453127637875903e-11

Correlation coefficient is statistically significant.
Feature: FGA
Pearson correlation coefficient: -0.8790025171098284
P-value: 1.66027063365975e-10

Correlation coefficient is statistically significant.
Feature: PPG
Pearson correlation coefficient: -0.8782813062072766
P-value: 1.7959945077123483e-10

Correlation coefficient is statistically significant.
Feature: MPG
Pearson correlation coefficient: -0.870285527102918
P-value: 4.157611618099029e-10

Correlation coefficient is statistically significant.
Feature: TOV
Pearson correlation coefficient: -0.8562692294966627
P-value: 1.5974111082502354e-09

Correlation coefficient is statistically significant.
Feature: RPG
Pearson correlation coefficient: -0.7419962434702375
P-value: 2.693758773896336e-06

Correlation coefficient is statistically significant.
Feature: APG
Pearson correlation coefficient: -0.7296551171757057
P-value: 4.76110105267

In [209]:
#Find correlation coefficients for Point Guard
merged_PGdf = pd.merge(twokPG_df, PointGuardDF, on='Player', how='inner')
merged_PGdf = merged_PGdf.drop(columns=['Player', 'OVR', '3PT', 'DNK', '#_y', 'Team'])

features = merged_PGdf.drop(columns=['#_x'])
target = merged_PGdf['#_x']
# Normalize the features
scaler = MinMaxScaler()
features_normalized = scaler.fit_transform(features)
features_df = pd.DataFrame(features_normalized, columns=features.columns)

rank_correlations = features.corrwith(target, method='pearson')
#threshold = 0.3  # Set your desired threshold
#positive_rank_correlations = rank_correlations[rank_correlations > threshold]
top_N = 23  # Set the number of top features
top_N_correlations_PG = rank_correlations.nsmallest(top_N)
print(top_N_correlations_PG)

MPG   -0.908226
FGM   -0.887656
PPG   -0.884572
FGA   -0.884141
APG   -0.821296
SPG   -0.793938
3PA   -0.793662
3PM   -0.782228
FTA   -0.781092
FTM   -0.778830
TOV   -0.769541
DRB   -0.746767
RPG   -0.732244
BPG   -0.541242
PF    -0.458776
FG%   -0.454682
ORB   -0.320361
FT%   -0.298991
3P%   -0.184766
GP    -0.047399
dtype: float64


In [210]:
#Since the correlation was very strong, lets test for significance
from scipy.stats import pearsonr

# Calculate Pearson correlation coefficient and p-value for each feature
for feature in top_N_correlations_SG.index:
    corr_coef, p_value = pearsonr(features_df[feature], target)
    print(f"Feature: {feature}")
    print(f"Pearson correlation coefficient: {corr_coef}")
    print(f"P-value: {p_value}")
    print("")

    # Determine significance based on p-value
    if p_value < 0.05:
        print("Correlation coefficient is statistically significant.")
    else:
        print("Correlation coefficient is not statistically significant.")


Feature: PPG
Pearson correlation coefficient: -0.8845718533073301
P-value: 3.7363870983181317e-14

Correlation coefficient is statistically significant.
Feature: FGM
Pearson correlation coefficient: -0.8876562366082391
P-value: 2.296333998801916e-14

Correlation coefficient is statistically significant.
Feature: FGA
Pearson correlation coefficient: -0.8841414069433654
P-value: 3.994654838036738e-14

Correlation coefficient is statistically significant.
Feature: APG
Pearson correlation coefficient: -0.8212956189725366
P-value: 8.46041596106413e-11

Correlation coefficient is statistically significant.
Feature: FTM
Pearson correlation coefficient: -0.7788298464080448
P-value: 3.260737724515608e-09

Correlation coefficient is statistically significant.
Feature: FTA
Pearson correlation coefficient: -0.7810915506118575
P-value: 2.740450039186685e-09

Correlation coefficient is statistically significant.
Feature: MPG
Pearson correlation coefficient: -0.908225834375778
P-value: 5.920722965018

In [211]:
#Now that we have correlation coefficients that we beleive are important for each position lets load the international athletes dataset and give them a rating based on the most important features
prospects_df = pd.read_csv(internationalURL)
copyProspects = prospects_df.drop(columns = ['Player', 'Team', 'League', 'Position', 'MPG'])

In [212]:
scaler = MinMaxScaler()
columns_to_normalize = copyProspects.columns[1:]  # Exclude non-feature columns
copyProspects[columns_to_normalize] = scaler.fit_transform(copyProspects[columns_to_normalize])

# Calculate 'CenterRating' based on a linear combination of four features
# Adjust coefficients and features as per your requirement, if you beleive you have insights that we missed.
copyProspects['CenterRating'] = (
    copyProspects['DRB'] * 0.25 +
    copyProspects['RPG'] * 0.25 +
    copyProspects['FGM'] * 0.25 +
    copyProspects['FGA'] * 0.25
) * 100

copyProspects['PowerForwardRating'] = (
    copyProspects['FGM'] * 0.25 +
    copyProspects['RPG'] * 0.25 +
    copyProspects['TOV'] * 0.25 +
    copyProspects['FGA'] * 0.25
) * 100

copyProspects['ShootingGuardRating'] = (
    copyProspects['FGM'] * 0.25 +
    copyProspects['FGA'] * 0.25 +
    copyProspects['APG'] * 0.25 +
    copyProspects['TOV'] * 0.25
) * 100

copyProspects['SmallForwardRating'] = (
    copyProspects['FGM'] * 0.25 +
    copyProspects['TOV'] * 0.25 +
    copyProspects['APG'] * 0.25 +
    copyProspects['FGA'] * 0.25
) * 100

copyProspects['PointGuardRating'] = (
    copyProspects['FGM'] * 0.25 +
    copyProspects['3PA'] * 0.25 +
    copyProspects['APG'] * 0.25 +
    copyProspects['FGA'] * 0.25
) * 100

copyProspects['OverallRating'] = (
    copyProspects['CenterRating'] * 0.2 +
    copyProspects['PowerForwardRating'] * 0.2 +
    copyProspects['ShootingGuardRating'] * 0.2 +
    copyProspects['SmallForwardRating'] * 0.2 +
    copyProspects['PointGuardRating'] * 0.2
)


# Sort the DataFrame based on the 'OverallRating' column to see some of the best basketball players around the world
copyProspects = copyProspects.sort_values(by='OverallRating', ascending=False)



ratingsDF = copyProspects.iloc[:, 20:].copy()

ratedInternationals = ratingsDF.join(prospects_df)





In [216]:
#define scores for defend, inside (2pt, layup), outside (3pt), rebound, playmaking(assist), to generate a radar chart
ratedInternationals['inside'] = (prospects_df['FGM'] - prospects_df['3PM'])
ratedInternationals['outside'] = (prospects_df['3PM'])
ratedInternationals['playmaking'] = (prospects_df['APG'])
ratedInternationals['rebound'] = (prospects_df['ORB'] + prospects_df['DRB'])
ratedInternationals['defense'] = (prospects_df['SPG'] + prospects_df['BPG'])

features_to_scale = ['inside', 'outside', 'playmaking', 'rebound', 'defense']

# Initialize the MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 100))  # Scale to range from 0 to 100

# Fit and transform the data
ratedInternationals[features_to_scale] = scaler.fit_transform(ratedInternationals[features_to_scale])

# Print the normalized scores
print(ratedInternationals)



      CenterRating  PowerForwardRating  ShootingGuardRating  \
1783     70.735887           72.130814            81.409613   
1723     71.243158           70.717796            72.291158   
1784     76.350059           73.795711            71.974087   
2507     83.947130           74.997855            67.941003   
2540     89.616087           73.728406            64.955442   
...            ...                 ...                  ...   
2717      2.092029            1.276812             0.783391   
2177      1.809735            1.556112             1.062691   
1680      0.980651            0.853839             0.760741   
2137      0.229358            1.062691             1.062691   
2413      0.717547            0.445808             0.681657   

      SmallForwardRating  PointGuardRating  OverallRating    id  \
1783           81.409613         85.738881      78.284962  1783   
1723           72.291158         83.957824      74.100219  1723   
1784           71.974087         76.313518

In [217]:
#download your file to use in a web app
from google.colab import files
ratedInternationals.to_csv('ratedInternationals.csv', encoding = 'utf-8')
files.download('ratedInternationals.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>