In [None]:
# Goal: Determine whether varsity (college) swimmers are, on average, taller than their volleyball counterparts
# Data: Volleyball and swimming team rosters from Baruch, Brooklyn, York, John Jay, and Queens (see the URL lists below)
import requests
from bs4 import BeautifulSoup
import pandas as pd



In [None]:
# Roster page URLs, grouped by team category (one list per category, one URL per school).
MenVolleyball_url = [
    "https://www.brooklyncollegeathletics.com/sports/mens-volleyball/roster/2019",
    "https://athletics.baruch.cuny.edu/sports/mens-volleyball/roster",
    "https://yorkathletics.com/sports/mens-volleyball/roster",
]
MenSwimming_url = [
    "https://www.brooklyncollegeathletics.com/sports/mens-swimming-and-diving/roster",
    "https://athletics.baruch.cuny.edu/sports/mens-swimming-and-diving/roster",
    "https://yorkathletics.com/sports/mens-swimming-and-diving/roster",
]
WomenVolleyball_url = [
    "https://www.brooklyncollegeathletics.com/sports/womens-volleyball/roster/2019",
    "https://athletics.baruch.cuny.edu/sports/womens-volleyball/roster",
    "https://johnjayathletics.com/sports/womens-volleyball/roster",
]
WomenSwimming_url = [
    "https://www.brooklyncollegeathletics.com/sports/womens-swimming-and-diving/roster",
    "https://athletics.baruch.cuny.edu/sports/womens-swimming-and-diving/roster",
    "https://queensknights.com/sports/womens-swimming-and-diving/roster",
]

In [None]:
#Height function
def _height_text_to_cm(text):
  """Convert one feet/inches string such as 5'10" to centimetres.

  Returns None when the text has no feet marker (') or its parts are not
  integers, so the caller can keep the entry's position instead of crashing.
  A missing inches part (e.g. 6') is treated as 0 inches.
  """
  feet_part, marker, inch_part = text.partition("'")
  if not marker:
    return None
  # Drop the inch marker, whether written as " or as a second/third '.
  inch_part = inch_part.replace('"', '').replace("'", "").strip()
  try:
    feet = int(feet_part)
    inches = int(inch_part) if inch_part else 0
  except ValueError:
    return None
  # 1 ft = 30.48 cm, 1 in = 2.54 cm
  return feet*30.48 + inches*2.54


def height (url):
  """Scrape a roster page and return the players' heights in centimetres.

  Parameters
  ----------
  url : str
      Roster page whose heights live in
      <span class="sidearm-roster-player-height"> elements.

  Returns
  -------
  list
      One entry per player, in page order: a float (cm), or None where the
      height text could not be parsed. An empty list is returned when the
      page has no height spans.
  """
  #Store content retrieved from the url into page, and use beautifulsoup package to parse HTML
  #A timeout keeps one unresponsive site from hanging the whole notebook
  page = requests.get(url, verify=True, timeout=30)
  soup = BeautifulSoup(page.content, 'html.parser')
  #Find all span tags with class of sidearm roster player height
  height_spans = soup.find_all('span', class_= "sidearm-roster-player-height")

  #Parse each height exactly once; unparseable entries become None so the
  #remaining heights stay in the same positions as the scraped names
  total_height = [_height_text_to_cm(span.get_text()) for span in height_spans]

  #Keep only the first half: the original author observed that find_all
  #returned every height twice on these pages.
  #NOTE(review): this assumes each page repeats the roster exactly once - confirm per site.
  total_height = total_height[:len(total_height)//2]
  return total_height


In [None]:
#Name function
def name (url):
  """Scrape a roster page and return each person's full name.

  Parameters
  ----------
  url : str
      Roster page whose names live in
      <span class="sidearm-roster-player-first-name"> and
      <span class="sidearm-roster-player-last-name"> elements.

  Returns
  -------
  list of str
      "First Last" strings in page order. First and last names are paired
      positionally; if the two lists differ in length the extras are dropped.
  """
  #Store content retrieved from the url into page, and use beautifulsoup package to parse HTML
  #A timeout keeps one unresponsive site from hanging the whole notebook
  page = requests.get(url, verify=True, timeout=30)
  soup = BeautifulSoup(page.content, 'html.parser')
  #Find all span tags with class of sidearm roster player first name, then last name
  first_spans = soup.find_all('span', class_="sidearm-roster-player-first-name")
  last_spans = soup.find_all('span', class_="sidearm-roster-player-last-name")

  full_name = []
  for first_span, last_span in zip(first_spans, last_spans):
    first = first_span.get_text()
    last = last_span.get_text()
    #Some first names already end with a space and others don't, so add the
    #separator only when it is missing. An empty first name is left empty
    #(indexing its last character used to raise IndexError).
    if first and not first.endswith(" "):
      first += " "
    full_name.append(first + last)
  return full_name

  

In [None]:
#Pandas dataframe function
def data_frame(full_name, total_height):
  """Build a height table indexed by player name.

  Parameters
  ----------
  full_name : list of str
      Names, in roster order.
  total_height : list of float or None
      Heights in roster order; None entries become NaN.

  Returns
  -------
  pandas.DataFrame
      A single float column "Heights" indexed by "Names". The two inputs are
      placed side by side and matched by position, so when one list is
      shorter the missing cells are NaN. Empty inputs give an empty frame
      with the same column and index name (previously a KeyError).
  """
  #Name the columns up front so they exist even when the lists are empty
  name_col = pd.Series(list(full_name), dtype="object")
  height_col = pd.Series(list(total_height), dtype="float64")

  #Place the two columns side by side, aligned on their positional index
  combined_df = pd.DataFrame({"Names": name_col, "Heights": height_col})
  return combined_df.set_index('Names')




In [None]:
#Dataframe to merge the 3 schools dataframe into one
def concat_df(urls):
  """Scrape every roster URL and stack the results into one height table.

  For each URL the names and heights are scraped and turned into a frame via
  data_frame; the per-school frames are concatenated and rows containing NaN
  are dropped. The resulting table and its column mean are printed, and the
  table is returned.
  """
  per_school_frames = []
  for roster_url in urls:
    #Names first, then heights, for the same page
    roster_names = name(roster_url)
    roster_heights = height(roster_url)
    per_school_frames.append(data_frame(roster_names, roster_heights))

  #Stack the per-school frames into a single frame
  stacked_df = pd.concat(per_school_frames)
  #Rows with a NaN height are discarded; the original author attributes these
  #to coaches, who have no listed height. Only players are wanted.
  total_df = stacked_df.dropna()
  print(total_df)
  #Print out average
  print(total_df.mean())
  return total_df

In [None]:
# Scrape and combine the men's volleyball rosters; concat_df also prints the table and its mean height.
MenVolley_df= concat_df(MenVolleyball_url)

                      Heights
Names                        
Snigdho Hasan          165.10
Michael Valentin       175.26
Andres Vargas          182.88
Jasper Diangco         177.80
Sayuj Zachariah        187.96
Omar Rezika            187.96
Gabriel Pjatak         190.50
Ryan Chabel            180.34
Utku Tanritanir        182.88
QiQin Zeng             175.26
Andrew Tsororos        193.04
Steven Lopez           182.88
Sonam Dorjee           177.80
Edward Grinberg        187.96
David Sirchenko        190.50
Stallone Shankar       182.88
Rabsang Andrugtsang    177.80
Hanbin Lee             175.26
Artem Zinkin           195.58
Michael Higgins        193.04
Carlos Rodriguez       193.04
Leon Petrovitsky       195.58
Defeng Han             185.42
Evan Takos             190.50
Meni Musheyev          190.50
Justin Iloulian        187.96
John Vitor             167.64
Juan Rodriguez         167.64
Shawn Nemoto           170.18
Erick Ortega           172.72
Akil Vaughn            195.58
Justin Saj

In [None]:
# Scrape and combine the men's swimming rosters; concat_df also prints the table and its mean height.
MenSwimming_df= concat_df(MenSwimming_url)

                      Heights
Names                        
Ronaldo Barrios        187.96
Devin Boodha           175.26
Nikita Cary            172.72
Alec Chi               177.80
Peter Ebert            187.96
Andwele Estwick        180.34
Omar Hasan- Hafez      185.42
Christian Hoyek        182.88
Konstantin Lomeyko     172.72
Mark MacEachen         193.04
James Rupa             177.80
Akmalijon Salimov      175.26
Gregory Becker         195.58
Nicholas Blanco        177.80
Kevin Chen             175.26
Gary Danilyan          187.96
Kasper Gacek           195.58
David Gazaryan         177.80
Matthew Greenfield     172.72
Julio Hernandez        172.72
Jonathan Jaramillo     177.80
Erik Kantar            175.26
Fernando Marroquin     170.18
David Matskovsky       187.96
Eddy Min               182.88
Eli Mogorichev         182.88
Jose Munoz Aycart      180.34
Evan Nikolic           190.50
Dayan Ordabayev        177.80
Luca Rivitti           177.80
Ben Sionov             165.10
Julian Tin

In [None]:
# Scrape and combine the women's volleyball rosters; concat_df also prints the table and its mean height.
WomenVolley_df= concat_df(WomenVolleyball_url)

                          Heights
Names                            
Inesa Shimcani             165.10
Ashley Fung                170.18
Rain Zhang                 167.64
Yana Shaposhnikova         160.02
Ezri Shor                  165.10
Madison Cronk              180.34
Laurie Lee                 157.48
Nohelia Cedeno             172.72
Alejandra Chen             170.18
Savanna Koutsakis-Keener   154.94
Alanis Guinada             162.56
Megan Ortiz-Mengedoht      177.80
Kate Meltsin               162.56
Yijia Cai                  177.80
Jasmin Vidal Ventura       157.48
Stella Chung               167.64
Claudia Daporta            182.88
Anita Sengara              180.34
Young Gi Go                152.40
Wai Chang                  162.56
Anesa Feratovic            167.64
Claudia Moi                165.10
Sanaa Hadjeb               170.18
Cindy Cui                  162.56
Cady Xia                   154.94
Ada Cao                    167.64
Katherine Chen             170.18
Stephany Chhay

In [None]:
# Scrape and combine the women's swimming rosters; concat_df also prints the table and its mean height.
WomenSwimming_df= concat_df(WomenSwimming_url)

                      Heights
Names                        
Semona Bardman         167.64
Angella Christopher    170.18
Anna Curran            160.02
Rachel Davidovich      162.56
Shenaika Eliassaint    180.34
Julie Huang            154.94
Natalia Mrzyglod       165.10
Stella Ryklis          167.64
Alyssa Taylor          175.26
Emily Yang             157.48
Stephanie Antonova     154.94
Nicole Astudillo       157.48
Andrea Gasic           175.26
Onika George           177.80
Karina Gotz            170.18
Asimina Hamakiotes     175.26
Mirtho-Myra Lamonier   154.94
Amanda Lee             152.40
Iris Leung             160.02
Debbie Li              160.02
Megan Liew             162.56
Ashley Louke           162.56
Nicole Ousmanova       167.64
Leslibeth Romero       170.18
Alicia Soares          154.94
Nicole Tarasiuk        170.18
Gloria Wu              160.02
Gabriela Almeida       160.02
Marin Azhar            154.94
Alyssa Budzynski       172.72
Brigitti Cruz          167.64
Danielle D

In [None]:
#The four team dataframes, in the same order as the header pairs below
df_list = [MenVolley_df, MenSwimming_df, WomenVolley_df, WomenSwimming_df]
#Two headers per dataframe: the "shortest" title first, then the "tallest" title
headers = [
    "Top 5 Shortest Men Volleyball Players",
    "Top 5 Tallest Men Volleyball Players",
    "Top 5 Shortest Men Swimmers",
    "Top 5 Tallest Men Swimmers",
    "Top 5 Shortest Women Volleyball Players",
    "Top 5 Tallest Women Volleyball Players",
    "Top 5 Shortest Women Swimmers",
    "Top 5 Tallest Women Swimmers",
]
#count is the position in headers of the current dataframe's first title
count = 0
for df in df_list:
  #Sort by height, ascending, in place (the dataframes above stay sorted afterwards)
  df.sort_values(by="Heights", inplace=True)
  shortest_header, tallest_header = headers[count:count + 2]
  #First five rows after sorting are the shortest players
  print(shortest_header, '\n')
  print(df.head(), '\n')
  #Last five rows after sorting are the tallest players
  print(tallest_header, '\n')
  print(df.tail(), '\n')
  #Move on to the next pair of headers
  count += 2
  
 

Top 5 Shortest Men Volleyball Players 

                   Heights
Names                     
Jayden Deokinanan   162.56
Snigdho Hasan       165.10
Juan Rodriguez      167.64
John Vitor          167.64
Shawn Nemoto        170.18 

Top 5 Tallest Men Volleyball Players 

                     Heights
Names                       
Guillermo Hernandez   193.04
Michael Higgins       193.04
Leon Petrovitsky      195.58
Akil Vaughn           195.58
Artem Zinkin          195.58 

Top 5 Shortest Men Swimmers 

                    Heights
Names                      
Ben Sionov           165.10
Fernando Marroquin   170.18
Joseph Vasile        170.18
Christopher Mendez   170.18
Anthony Jordan       170.18 

Top 5 Tallest Men Swimmers 

                Heights
Names                  
Morgan Welling   190.50
Evan Nikolic     190.50
Mark MacEachen   193.04
Kasper Gacek     195.58
Gregory Becker   195.58 

Top 5 Shortest Women Volleyball Players 

                          Heights
Names                 