## Importing Libraries

In [81]:
import requests
import io
import zipfile
import re
import pandas as pd
import numpy as np
import json

In [82]:
# Settings configurations

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [83]:
url = "https://cricsheet.org/downloads/recently_added_2_json.zip"
filetype = ".json"

In [87]:
response = requests.get(url)

if response.status_code == 200:
    content = response.content
    
    zip_file = zipfile.ZipFile(io.BytesIO(content))
    #zip_file.extract('README.txt')
    
    with zip_file.open('README.txt') as f:
        lines = [line.decode('utf-8') for line in f.readlines()]
        pattern = re.compile(r'(\d{4}-\d{2}-\d{2}) - ([^-]+) - ([^-]+) - (\w+) - (\d+) - (.+)')
        ids = [match.group(5) for line in lines if (match := pattern.match(line))]
    f.close()

In [99]:
for file in (ids[2:3]):
    #zip_file.extract(file+filetype)
    with zip_file.open(file+filetype) as jsonfile:
        data = json.load(jsonfile)
        # DataFrame to store - Metadata
        df_meta = pd.DataFrame([data["meta"]]).assign(filename=file, filetype=filetype)

        # DataFrame to store - match details
        df_info = pd.DataFrame([data["info"]])
        df_match = pd.concat([
            pd.json_normalize(df_info['event'], sep='_').rename(columns={'name': 'event'}),
            #pd.DataFrame.from_dict(df_info[['season', 'gender', 'city', 'venue', 'match_type', 'match_type_number', 'overs', 'team_type']].to_dict(orient='list')),
            pd.DataFrame(df_info[list(set(['season', 'gender', 'city', 'venue', 'match_type', 'match_type_number', 'overs', 'team_type']) & set(df_info.columns))]),
            df_info['dates'].apply(lambda x: [x[0], x[-1]]).apply(pd.Series).rename(columns={0: 'start_date', 1: 'end_date'}),
            df_info['teams'].apply(lambda x: [x[0], x[1]]).apply(pd.Series).rename(columns={0: 'team_host', 1: 'team_visitor'}),
            pd.json_normalize(df_info['toss'], sep='_').add_prefix('toss.'),
            pd.json_normalize(df_info['outcome'], sep='_').add_prefix('outcome.')
        ], axis=1).assign(id = file)
        df_match['player_of_match'] = df_info['player_of_match'].apply(lambda x: ','.join(x))

        # DataFrame to store - registry details
        df_registry = pd.DataFrame(list(data["info"]["registry"]["people"].items()), columns=['people', 'identifier']).assign(match_id = file)

        # DataFrame to store - match player details
        df_player = pd.json_normalize(df_info['players']).melt(var_name='team', value_name='player').explode('player').assign(match_id = file)

        # DataFrame to store - innings ball-by-ball details
        df_innings = pd.DataFrame([data["innings"]])
        df = pd.json_normalize(data['innings'], record_path=['overs', 'deliveries'], meta=['team',['overs', 'over']], sep='_').assign(match_id = file)

    #zip_file.close()

In [78]:
df = pd.json_normalize(data['innings']).explode('overs')['overs']
df2 = pd.json_normalize(df).explode('deliveries')
df3 = pd.json_normalize(df2['deliveries'],)
df3.head(50)

Unnamed: 0,batter,bowler,non_striker,runs.batter,runs.extras,runs.total,extras.wides,wickets,extras.legbyes,review.by,review.umpire,review.batter,review.decision,review.type,review.umpires_call,extras.noballs,extras.byes
0,EA Perry,M Strano,M Brown,4,0,4,,,,,,,,,,,
1,EA Perry,M Strano,M Brown,0,0,0,,,,,,,,,,,
2,EA Perry,M Strano,M Brown,1,0,1,,,,,,,,,,,
3,M Brown,M Strano,EA Perry,1,0,1,,,,,,,,,,,
4,EA Perry,M Strano,M Brown,2,0,2,,,,,,,,,,,
5,EA Perry,M Strano,M Brown,1,0,1,,,,,,,,,,,
6,EA Perry,S Ismail,M Brown,0,0,0,,,,,,,,,,,
7,EA Perry,S Ismail,M Brown,0,0,0,,,,,,,,,,,
8,EA Perry,S Ismail,M Brown,0,0,0,,,,,,,,,,,
9,EA Perry,S Ismail,M Brown,0,1,1,1.0,,,,,,,,,,


In [129]:
df = pd.json_normalize(data['innings'])['powerplays'].explode('powerplays')
#df = df.explode(df.columns.values.tolist())
df.head()

0    {'from': 0.1, 'to': 3.7, 'type': 'mandatory'}
1    {'from': 12.1, 'to': 13.6, 'type': 'batting'}
2    {'from': 0.1, 'to': 3.7, 'type': 'mandatory'}
3    {'from': 12.1, 'to': 13.6, 'type': 'batting'}
Name: powerplays, dtype: object

In [124]:
df.columns.values.tolist()

['team', 'powerplays', 'absent_hurt', 'target.overs', 'target.runs']

In [13]:
#url = "https://cricsheet.org/downloads/ipl_csv2.zip"
url = "https://cricsheet.org/downloads/recently_added_2_csv.zip"

# Send a GET request to the URL and retrieve the response
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Read the content of the response as bytes
    content = response.content

    # Create a file-like object from the bytes data
    zip_file = zipfile.ZipFile(io.BytesIO(content))

    # Extract and process CSV files matching the pattern
    csv_files = [file for file in zip_file.namelist() if file.endswith('.csv') and '_info' not in file]
    print("Here are the new csv files found :- \n")
    print(csv_files)
    
    for file in csv_files:
        try:
            # Extract the file from the zip archive
            zip_file.extract(file)

            # Read the extracted CSV file as a DataFrame
            df = pd.read_csv(file)

            # Perform your desired tasks with the data
            # ...

        except KeyError:
            print(f"File {file} not found in the zip archive.")

    # Close the zip file
    zip_file.close()

else:
    print("Failed to retrieve data from the URL.")

# Continue with your other tasks or operations
# ...


Here are the new csv files found :- 

['1384430.csv', '1384431.csv', '1387200.csv', '1387199.csv', '1391779.csv', '1391780.csv', '1405125.csv', '1384432.csv', '1405126.csv', '1391781.csv', '1384433.csv', '1387202.csv', '1387203.csv', '1387204.csv']


ParserError: Error tokenizing data. C error: Expected 3 fields in line 22, saw 4


In [12]:
df.head()

NameError: name 'df' is not defined

In [92]:
import pandas as pd

# Your original data
data = {
    'match_referees': [["JJ Crowe"]],
    'reserve_umpires': [["CB Gaffaney"]],
    'tv_umpires': [["HDPK Dharmasena"]],
    'umpires': [["CM Brown", "Nitin Menon"]]
}

# Create a DataFrame
df_umpire = pd.DataFrame(data)

Unnamed: 0,match_referees,reserve_umpires,tv_umpires,umpires
0,[JJ Crowe],[CB Gaffaney],[HDPK Dharmasena],"[CM Brown, Nitin Menon]"


In [94]:
import pandas as pd

# Your original data
data = {
    'match_referees': [["JJ Crowe"]],
    'reserve_umpires': [["CB Gaffaney"]],
    'tv_umpires': [["HDPK Dharmasena"]],
    'umpires': [["CM Brown", "Nitin Menon"]]
}

# Create a DataFrame
df_umpire = pd.DataFrame(data)


umpire_set = set()
for column in df_umpire.columns:
    umpire_set.update(df_umpire[column].explode().dropna())

df_umpire2 = pd.DataFrame(index=umpire_set, columns=df_umpire.columns[1:]).fillna(False)

for column in df_umpire.columns[1:]:
    df_umpire2[column] = df_umpire2.index.isin(df_umpire[column].explode().dropna()).reset_index().rename(columns={'index': 'name'})

df_umpire2.head()

AttributeError: 'numpy.ndarray' object has no attribute 'reset_index'

In [99]:
import pandas as pd

# Your original data
data = {
    'match_referees': [["JJ Crowe"]],
    'reserve_umpires': [["CB Gaffaney"]],
    'tv_umpires': [["HDPK Dharmasena"]],
    'umpires': [["CM Brown", "Nitin Menon"]]
}

# # Create a DataFrame
# df_umpire = pd.DataFrame(data)

# # Create a set of all unique names
# umpire_set = set()
# for column in df_umpire.columns:
#     umpire_set.update(df_umpire[column].explode().dropna())

# # Create a new DataFrame with the desired format
# df_umpire2 = pd.DataFrame(index=umpire_set, columns=df_umpire.columns[1:]).fillna(False)

# # Fill in the values based on the original DataFrame
# for column in df_umpire.columns[1:]:
#     df_umpire2[column] = df_umpire2.index.isin(df_umpire[column].explode().dropna())

# # Reset the index and add a new 'name' column
# df_umpire2 = df_umpire2.reset_index().rename(columns={'index': 'name'})

# # Display the result
# df_umpire2 = df_umpire2.reset_index(drop=True, inplace = True)
# df_umpire2.head()


In [102]:
del df_umpire
df_umpire = pd.DataFrame(data).reset_index(drop = True)
df_umpire.head()

Unnamed: 0,match_referees,reserve_umpires,tv_umpires,umpires
0,[JJ Crowe],[CB Gaffaney],[HDPK Dharmasena],"[CM Brown, Nitin Menon]"


In [110]:
import pandas as pd

# Your original data
data = {
    'match_referees': [["JJ Crowe"]],
    'reserve_umpires': [["CB Gaffaney"]],
    'tv_umpires': [["HDPK Dharmasena"]],
    'umpires': [["CM Brown", "Nitin Menon"]]
}

# Create a DataFrame
df_umpire = pd.DataFrame(data)

In [117]:
import pandas as pd

# Your original data
data = {
    'match_referees': [["JJ Crowe"]],
    'reserve_umpires': [["CB Gaffaney"]],
    'tv_umpires': [["HDPK Dharmasena"]],
    'umpires': [["CM Brown", "Nitin Menon"]]
}

# Create a DataFrame
df_umpire = pd.DataFrame(data)
df_umpire.head()

Unnamed: 0,match_referees,reserve_umpires,tv_umpires,umpires
0,[JJ Crowe],[CB Gaffaney],[HDPK Dharmasena],"[CM Brown, Nitin Menon]"


In [121]:
umpire_set = set()
for column in df_umpire.columns:
    umpire_set.update(df_umpire[column].explode().dropna())

df_umpire2 = pd.DataFrame(index=umpire_set, columns=df_umpire.columns[1:]).fillna(False)
for column in df_umpire.columns:
    df_umpire2[column] = df_umpire2.index.isin(df_umpire[column].explode().dropna())
df_umpire2 = df_umpire2.reset_index().rename(columns={'index': 'name'})
df_umpire2.head()

Unnamed: 0,name,reserve_umpires,tv_umpires,umpires,match_referees
0,HDPK Dharmasena,False,True,False,False
1,Nitin Menon,False,False,True,False
2,CM Brown,False,False,True,False
3,JJ Crowe,False,False,False,True
4,CB Gaffaney,True,False,False,False


In [119]:
df_umpire.columns

Index(['match_referees', 'reserve_umpires', 'tv_umpires', 'umpires'], dtype='object')

In [120]:
df_umpire.columns[1:]

Index(['reserve_umpires', 'tv_umpires', 'umpires'], dtype='object')

In [14]:
import pandas as pd
from pandas import json_normalize

# Your sample JSON data
data = {
    "innings": [
        {
            "team": "Nottinghamshire",
            "overs": [
                {
                    "over": 0,
                    "deliveries": [
                        {
                            "batter": "BT Slater",
                            "bowler": "BO Coad",
                            "non_striker": "BM Duckett",
                            "runs": {
                                "batter": 0,
                                "extras": 0,
                                "total": 0
                            }
                        },
                        {
                            "batter": "BT Slater",
                            "bowler": "BO Coad",
                            "non_striker": "BM Duckett",
                            "runs": {
                                "batter": 0,
                                "extras": 0,
                                "total": 0
                            }
                        }
                    ]
                }
            ]
        }
    ]
}

In [15]:
df = json_normalize(data['innings'], record_path=['overs', 'deliveries'],
                    meta=['team'],
                    sep='_')

# Add the 'overs.over' column separately
#df['overs_over'] = data['innings'][0]['overs'][0]['over']

df.head()

Unnamed: 0,batter,bowler,non_striker,runs_batter,runs_extras,runs_total,team
0,BT Slater,BO Coad,BM Duckett,0,0,0,Nottinghamshire
1,BT Slater,BO Coad,BM Duckett,0,0,0,Nottinghamshire


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   innings  1 non-null      object
dtypes: object(1)
memory usage: 136.0+ bytes
