# This file scrapes player, team, and match data from the Pro Kabaddi website

### First, import the required modules
* `BeautifulSoup` for web scraping
* `pandas` for creating DataFrames to represent data in a structured format
* `requests` for sending HTTP requests to the website
* `json` to parse the scraped JSON data into Python dictionaries
* `time` to pause between requests so that we do not send many requests at once and to reduce the load on the server
* `functools` for the `reduce()` function, which lets us apply an operation repeatedly across a list

**You will also need to install *`openpyxl`* to write DataFrames to Excel files**

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import json
import time
from functools import reduce

### The url structure of [Pro Kabaddi Stats](https://www.prokabaddi.com/stats)

The URL structure of the website is very simple. It is as follows: https://www.prokabaddi.com/stats/{season_id}-{stats_id}-a-statistics  

Every season has an **id** associated with it.  
&emsp; > For example, **Season 9 has id 25**  

Every statistic type also has an **id** associated with it.  
&emsp;> For example, **Total Points Scored by a Player has id 102** and **Total Points Scored by a Team has id 96**  

So if we want to access the data of **Total Points Scored By a Team in Season 9**, then the url corresponding to that will be :  
(https://www.prokabaddi.com/stats/25-96-a-statistics)  

*```We worked out this URL structure manually after spending some hours studying the structure of the Pro Kabaddi website```*  
<br>
The four dictionaries ```season_to_id, id_to_season, player_stats_to_id, teams_stats_to_id ``` encode exactly the information given above.



In [2]:


# Season label -> numeric season id used in the stats URL.
# Insertion order (newest season first) is also the scraping order.
season_to_id = {
    'Season 9': 25,
    'Season 8': 20,
    'Season 7': 11,
    'Season 6': 10,
    'Season 5': 8,
    'Season 4': 4,
    'Season 3': 3,
    'Season 2': 2,
    'Season 1': 1,
    'All Seasons': 0,
}

# Reverse lookup (season id -> season label), derived from the table above.
id_to_season = {season_id: label for label, season_id in season_to_id.items()}

# Per-player statistic name -> statistic id used in the stats URL.
player_stats_to_id = dict(
    total_points=102,
    successful_raids=21,
    raid_points=22,
    successful_tackles=23,
    tackle_points=103,
    do_or_die_raid_points=132,
    super_raids=104,
    super_tackles=28,
    super_10s=100,
    high_5s=101,
)

# Per-team statistic name -> statistic id used in the stats URL.
teams_stats_to_id = dict(
    total_points=96,
    successful_raids=13,
    raid_points=97,
    successful_tackles=15,
    tackle_points=95,
    do_or_die_raid_points=135,
    super_raids=134,
    super_tackles=20,
    total_points_conceded=133,
    all_outs_inflicted=136,
    all_out_conceded=137,
)

# URL template: {0} is the season id, {1} is the statistic id.
url = 'https://www.prokabaddi.com/stats/{0}-{1}-a-statistics'

In [None]:
# Scrape every per-player statistic page for every season. The per-statistic
# tables of a season are outer-merged into one table, and the season tables
# are then stacked into `final_data_frame`.
all_seasons_df = []  # will contain one merged DataFrame per season
for season_name, season_id in season_to_id.items():
    if season_name == 'All Seasons':
        continue  # only the individual seasons are scraped
    data_frames = []  # will contain one DataFrame per stat_type for the current season_name
    for stat_type, stat_id in player_stats_to_id.items():
        # The timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status() reports HTTP errors here instead of letting an
        # error page fail later with an obscure IndexError/JSONDecodeError.
        response = requests.get(url.format(season_id, stat_id), timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")  # parse the html
        # The data is stored in the 5th <script> tag of the page.
        script_text = soup.find_all('script')[4].string
        # Keep everything from the first '{' so that only the JSON object is
        # parsed and not the variable name in front of it.
        payload = json.loads(script_text[script_text.find('{'):])
        records = payload["stats"]["data"]  # only the relevant information
        for record in records:
            # Clean the record: tag it with its season, store the value under
            # the statistic's own name and drop the redundant columns.
            record['season'] = season_name
            record[stat_type] = record.pop('value')
            for redundant_key in ('team', 'rank', 'team_name', 'position_id'):
                del record[redundant_key]
        # DataFrame for the current stat_type and season_name.
        data_frames.append(pd.DataFrame.from_dict(records))
        time.sleep(3)  # stay idle for 3 seconds so that the web server load is reduced
    # Outer-merge all stat_types (on their shared columns) into a single
    # DataFrame for the current season; statistics missing for a row become 0.
    combined_data = reduce(
        lambda left, right: pd.merge(left, right, how='outer'),
        data_frames,
    ).fillna(0)
    all_seasons_df.append(combined_data)  # add to the list of season-wise DataFrames
# Combine all season DataFrames into a single DataFrame.
final_data_frame = pd.concat(all_seasons_df, ignore_index=True)

In [14]:
# Scrape every per-team statistic page for every season. The per-statistic
# tables of a season are outer-merged into one table, and the season tables
# are then stacked into `teams_data`.
all_seasons_df = []  # one merged DataFrame per season
for season_name, season_id in season_to_id.items():
    if season_name == 'All Seasons':
        continue  # only the individual seasons are scraped
    data_frames = []  # one DataFrame per stat_type for the current season_name
    for stat_type, stat_id in teams_stats_to_id.items():
        # The timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status() reports HTTP errors here instead of letting an
        # error page fail later with an obscure IndexError/JSONDecodeError.
        response = requests.get(url.format(season_id, stat_id), timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # The data is read from the 5th <script> tag of the page.
        script_text = soup.find_all('script')[4].string
        # Keep everything from the first '{' so that only the JSON object is parsed.
        payload = json.loads(script_text[script_text.find('{'):])
        records = payload["stats"]["data"]
        for record in records:
            # Tag the record with its season, store the value under the
            # statistic's own name and drop the rank column.
            record['season'] = season_name
            record[stat_type] = record.pop('value')
            del record['rank']
        data_frames.append(pd.DataFrame.from_dict(records))
        time.sleep(1)  # pause between requests to reduce the load on the server
    # Outer-merge all stat_types (on their shared columns) into a single
    # DataFrame for the current season; statistics missing for a row become 0.
    combined_data = reduce(
        lambda left, right: pd.merge(left, right, how='outer'),
        data_frames,
    ).fillna(0)
    all_seasons_df.append(combined_data)
# Combine all season DataFrames into a single DataFrame.
teams_data = pd.concat(all_seasons_df, ignore_index=True)

### Storing our data in Excel
```
The following cells write the DataFrames to Excel files.
(openpyxl is used here)
```

In [None]:
# Write the combined player table to an Excel workbook (uses openpyxl);
# index=False keeps the DataFrame index out of the sheet.
final_data_frame.to_excel(excel_writer='players_data.xlsx', index=False)

In [16]:
# Write the combined team table to an Excel workbook (uses openpyxl);
# index=False keeps the DataFrame index out of the sheet.
teams_data.to_excel(excel_writer='teams_data.xlsx', index=False)

In [None]:
# Build one row per match for every season and write the result to
# match_data.xlsx. The season "Fixture" feed supplies the base record of each
# match; the per-match "MatchCentre" feed adds toss, venue, home-team and
# per-team statistics.
url2="https://feeds.prokabaddi.com/SI/{0}/Fixture.json"
url3="https://feeds.prokabaddi.com/SI/MatchCentre/{0}.json"

# Fixture keys that are dropped from the final table (built once, not per match).
delete_column = ["venue_gmt_offset", "event_livecoverage", "event_duration_left", "result_sub_code", "event_is_daynight",
                 "sport", "league_code", "event_state", "event_group", "event_islinkable", "tour_name", "event_status",
                 "event_status_id", "event_stage", "series_name", "participants", "end_date", "tour_id"]

# (output column suffix, key path inside a team's "stats" object), listed in
# the order in which the columns are added to each record.
team_stat_columns = [
    ("all_out_points", ("points", "all_out")),
    ("extra_points", ("points", "extras")),
    ("raid_points", ("points", "raid_points", "total")),
    ("tackle_points", ("points", "tackle_points", "total")),
    ("raids_done", ("raids", "total")),
    ("successful_raids", ("raids", "successful")),
    ("unsuccessful_raids", ("raids", "unsuccessful")),
    ("empty_raids", ("raids", "Empty")),
    ("tackles_done", ("tackles", "total")),
    ("successful_tackles", ("tackles", "successful")),
    ("unsuccessful_tackles", ("tackles", "unsuccessful")),
    ("all_outs", ("all_outs",)),
]


def _fetch_json(feed_url):
    """Download ``feed_url`` and return its decoded JSON document.

    Raises ``requests.HTTPError`` for an HTTP error status, so a failed
    request is reported at the request instead of as a JSON parsing error.
    The timeout keeps a stalled connection from hanging the notebook.
    """
    response = requests.get(feed_url, timeout=30)
    response.raise_for_status()
    return json.loads(response.content)


def _team_columns(team, prefix):
    """Flatten one MatchCentre team entry into ``<prefix>_*`` columns.

    Returns a dict holding the team's name, id and score followed by one
    entry per row of ``team_stat_columns``. Raises ``KeyError`` if the team
    entry lacks one of the expected keys.
    """
    columns = {
        prefix + "_name": team["name"],
        prefix + "_id": team["id"],
        prefix + "_score": team["score"],
    }
    for suffix, path in team_stat_columns:
        value = team["stats"]
        for key in path:  # walk down the nested stats object
            value = value[key]
        columns[prefix + "_" + suffix] = value
    return columns


data_frames2 = []  # one DataFrame of matches per season
for season_name, season_id in season_to_id.items():
    if season_name == 'All Seasons':
        continue  # only the individual seasons are scraped
    fixtures = _fetch_json(url2.format(season_id))["matches"]
    for record in fixtures:
        new_match = _fetch_json(url3.format(record['game_id']))
        match_detail = new_match["match_detail"]
        teams = new_match["teams"]

        record['toss_winner_id'] = match_detail["toss"]["winner"]
        # Any toss selection other than 'raid' is recorded as 'court'.
        record['toss_choice'] = 'raid' if match_detail["toss"]["selection"] == 'raid' else 'court'

        record['venue_id'] = match_detail["venue"]["id"]
        record['venue_name'] = match_detail["venue"]["name"]

        # A missing, empty, "0" or null home team id means "no home team".
        home_team_id = teams.get("home_team_id")
        if home_team_id not in ("", "0", None):
            record["home_team_id"] = home_team_id
            record['home_team_name'] = teams["home_team_name"]
        else:
            record["home_team_id"] = 0
            record['home_team_name'] = "None"

        record['season'] = season_name

        # The same set of columns is extracted for both teams.
        for number in (1, 2):
            record.update(_team_columns(teams["team"][number - 1], "team" + str(number)))

        # Dropped when present; the key may be absent from a fixture.
        record.pop('win_by_coin_toss', None)

        if record["result_code"] == "":
            record["result_code"] = "W"
        elif record["result_code"] in ["Tied", "T"]:
            record["result_code"] = "T"
            record["event_sub_status"] = "Match Tied"
            record["winning_margin"] = 0

        # pop() with a default tolerates fixtures that lack one of these keys
        # (a plain `del` would abort the whole scrape with a KeyError).
        for column in delete_column:
            record.pop(column, None)

    data_frames2.append(pd.DataFrame.from_dict(fixtures))
final_data_frame2 = pd.concat(data_frames2, ignore_index=True)

# Decide the winner from the scores. The scores are compared as numbers:
# to_numeric guards against the feed delivering them as strings (not confirmed
# here), which would otherwise be compared lexicographically.
team1_points = pd.to_numeric(final_data_frame2['team1_score'], errors='coerce')
team2_points = pd.to_numeric(final_data_frame2['team2_score'], errors='coerce')
team1_wins = team1_points > team2_points
team2_wins = team2_points > team1_points

# Default every match to a tie, then overwrite the rows that have a winner.
final_data_frame2["winning_team_name"] = "Match Tied"
final_data_frame2["winning_team_id"] = 0
for winner_mask, prefix in ((team1_wins, 'team1'), (team2_wins, 'team2')):
    final_data_frame2.loc[winner_mask, 'winning_team_name'] = final_data_frame2.loc[winner_mask, prefix + '_name']
    final_data_frame2.loc[winner_mask, 'winning_team_id'] = final_data_frame2.loc[winner_mask, prefix + '_id']
final_data_frame2.to_excel('match_data.xlsx', index=False)


Note: the venue name, home team name, team 1 name, team 2 name and winning team name need to be converted from Hindi to English.






