In [1]:
import os
import pandas as pd
import numpy as np
from scipy.stats import zscore
import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_data(file_path , filename):
    csv_path = os.path.join(file_path, filename)
    return pd.read_csv(csv_path)

In [3]:
def write_csv_data(file_path, filename, df):
    isExist = os.path.exists(file_path)
    if not isExist:
        os.makedirs(file_path)
        print("The new directory is created!")
    csv_path = os.path.join(file_path, filename)
    df.to_csv(csv_path)
    
    if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
        print(filename + " was written to successfully!")

In [4]:
def remove_unnamed_cols(df_data):
    unnamed_cols = df_data.filter(regex='Unnamed').columns
    df_data.drop(columns=unnamed_cols, inplace=True)
    return df_data

In [5]:
print(os.getcwd())
player_file_path = "Mega Player Dataset"

player_data = load_data(player_file_path, "SuperCleaned.csv")

C:\Users\sheru\Documents\GitHub\t20-blocks


In [7]:
player_data = remove_unnamed_cols(player_data)
player_data

Unnamed: 0,ID,NAME,COUNTRY,Full name,Birthdate,Birthplace,Died,Date_of_death,Age,Major teams,...,BOWLING_T20s_Runs,BOWLING_T20s_Wkts,BOWLING_T20s_BBI,BOWLING_T20s_BBM,BOWLING_T20s_Ave,BOWLING_T20s_Econ,BOWLING_T20s_SR,BOWLING_T20s_4w,BOWLING_T20s_5w,BOWLING_T20s_10
0,8772,Henry Arkell,England,Henry John Denham Arkell,1898-06-26,"Edmonton, Middlesex",Dead,12/03/82,84.0,Northamptonshire,...,,,,,,,,,,
1,532565,Richard Nyren,England,Richard Nyren,1734-04-25,"Eartham, Sussex",Dead,1797-04-25,63.0,Hampshire XI,...,,,,,,,,,,
2,16856,Sydney Maartensz,England,Sydney Gratien Adair Maartensz,1882-04-14,"Colombo, Ceylon",Dead,10/09/67,85.0,Hampshire,...,,,,,,,,,,
3,16715,Brian Lander,England,Brian Richard Lander,09/01/42,"Bishop Auckland, Co Durham",Alive,,77.0,"['Durham,', 'Minor Counties']",...,,,,,,,,,,
4,15989,Derek Kenderdine,England,Derek Charles Kenderdine,1897-10-28,"Chislehurst, Kent",Dead,28/08/47,50.0,Royal Navy,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90303,19777,Keith Robinson,England,Keith Robinson,17/12/33,"Thirsk, Yorkshire",Alive,,86.0,Combined Services,...,,,,,,,,,,
90304,14843,Trevitt Hine-Haycock,England,Trevitt Reginald Hine-Haycock,1861-12-03,"Little Heath, Old Charlton, Kent",Dead,02/11/53,92.0,"['Kent,', 'Oxford University']",...,,,,,,,,,,
90305,15025,John Hughes,England,John Hughes,1825-07-02,Hertford,Dead,29/01/07,82.0,South of England,...,,,,,,,,,,
90306,11167,John Clayton,England,John Morton Clayton,1857-11-17,"Chesterfield, Derbyshire",Dead,01/04/38,81.0,Derbyshire,...,,,,,,,,,,


In [12]:
batting_file_path = "Outputs\Combined"

top_batting_df = load_data(batting_file_path, "combined_and_formatted_batting.csv")

In [15]:
top_batting_df = remove_unnamed_cols(top_batting_df)
top_batting_df

Unnamed: 0,Player,Mat,Inns,NO,Runs,Avg,BF,SR,100,50,4s,6s,Seasons
0,AB de Villiers,80,77,17,2592,42.34,1629,156.15,1,25,195,148,6
1,Aaron Finch,53,51,4,1180,24.10,881,136.74,0,9,114,49,5
2,Abdul Samad,25,20,4,226,12.18,159,118.49,0,0,12,14,3
3,Abhijeet Tomar,1,1,0,4,4.00,8,50.00,0,0,1,0,1
4,Abhinav Manohar,8,7,1,108,18.00,75,144.00,0,0,14,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,Yashasvi Jaiswal,23,23,0,547,21.34,406,124.03,0,3,62,22,3
334,Yusuf Pathan,55,45,20,804,33.07,608,122.20,0,4,65,31,4
335,Yuvraj Singh,34,31,3,651,22.39,504,123.48,0,3,65,29,4
336,Yuzvendra Chahal,88,16,9,35,2.22,84,36.73,0,0,0,0,6


In [17]:
matched_rows = player_data[player_data['NAME'].isin(top_batting_df['Player'])]
matched_rows

Unnamed: 0,ID,NAME,COUNTRY,Full name,Birthdate,Birthplace,Died,Date_of_death,Age,Major teams,...,BOWLING_T20s_Runs,BOWLING_T20s_Wkts,BOWLING_T20s_BBI,BOWLING_T20s_BBM,BOWLING_T20s_Ave,BOWLING_T20s_Econ,BOWLING_T20s_SR,BOWLING_T20s_4w,BOWLING_T20s_5w,BOWLING_T20s_10
614,43789,Zaheer Khan,Pakistan,Zaheer Khan,,,Unknown,,,Bahawalpur,...,,,,,,,,,,
615,43708,Zaheer Khan,Pakistan,Zaheer Ahmed Khan,03/12/79,"Lahore, Punjab",Alive,,40.0,"['Lahore,', 'Sargodha,', 'Water and Power Deve...",...,,,,,,,,,,
1035,24598,Eoin Morgan,England,Eoin Joseph Gerard Morgan,10/09/86,Dublin,Alive,,33.0,"['England,', 'Ireland,', 'Bangalore Royal Chal...",...,,,,,,,,,,
1227,17970,Daryl Mitchell,England,Daryl Keith Henry Mitchell,25/11/83,"Badsey, nr Evesham",Alive,,36.0,"['Mountaineers,', 'Worcestershire,', 'Worceste...",...,2443.0,85.0,May-28,May-28,28.74,7.80,22.1,1.0,1.0,0.0
1333,459257,Tymal Mills,England,Tymal Solomon Mills,12/08/92,"Dewsbury, Yorkshire",Alive,,27.0,"['England,', 'Auckland,', 'Brisbane Heat,', 'C...",...,2813.0,108.0,Apr-22,Apr-22,26.04,7.90,19.7,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90120,820691,Odean Smith,West Indies,Odean Fabian Smith,01/11/96,"St. Elizabeth, Jamaica",Alive,,23.0,"['West Indies,', 'Jamaica Tallawahs,', 'Jamaic...",...,290.0,7.0,Mar-20,Mar-20,41.42,10.05,24.7,0.0,0.0,0.0
90133,53118,Dwayne Smith,West Indies,Dwayne Romel Smith,12/04/83,"Storey Gap, Codrington Hill, St Michael, Barbados",Alive,,36.0,"['West Indies,', 'Barbados,', 'Barbados Triden...",...,2965.0,105.0,04-Aug,04-Aug,28.23,8.13,20.8,2.0,0.0,0.0
90189,53116,Lendl Simmons,West Indies,Lendl Mark Platter Simmons,25/01/85,"Port of Spain, Trinidad",Alive,,34.0,"['West Indies,', 'Brisbane Heat,', 'Guyana Ama...",...,547.0,22.0,Apr-19,Apr-19,24.86,8.72,17.0,1.0,0.0,0.0
90253,230554,Javon Searles,West Indies,Javon Philip Ramon Scantlebury-Searles,21/12/86,"Durants Village, St James, Barbados",Alive,,33.0,"['Barbados,', 'Kolkata Knight Riders,', 'Trinb...",...,625.0,26.0,04-May,04-May,24.03,8.68,16.6,2.0,0.0,0.0


In [23]:
cols = np.array(matched_rows.columns)
cols

array(['ID', 'NAME', 'COUNTRY', 'Full name', 'Birthdate', 'Birthplace',
       'Died', 'Date_of_death', 'Age', 'Major teams', 'Batting style',
       'Bowling style', 'Other', 'AWARDS', 'BATTING_Tests_Mat',
       'BATTING_Tests_Inns', 'BATTING_Tests_NO', 'BATTING_Tests_Runs',
       'BATTING_Tests_HS', 'BATTING_Tests_Ave', 'BATTING_Tests_BF',
       'BATTING_Tests_SR', 'BATTING_Tests_100', 'BATTING_Tests_50',
       'BATTING_Tests_4s', 'BATTING_Tests_6s', 'BATTING_Tests_Ct',
       'BATTING_Tests_St', 'BATTING_ODIs_Mat', 'BATTING_ODIs_Inns',
       'BATTING_ODIs_NO', 'BATTING_ODIs_Runs', 'BATTING_ODIs_HS',
       'BATTING_ODIs_Ave', 'BATTING_ODIs_BF', 'BATTING_ODIs_SR',
       'BATTING_ODIs_100', 'BATTING_ODIs_50', 'BATTING_ODIs_4s',
       'BATTING_ODIs_6s', 'BATTING_ODIs_Ct', 'BATTING_ODIs_St',
       'BATTING_T20Is_Mat', 'BATTING_T20Is_Inns', 'BATTING_T20Is_NO',
       'BATTING_T20Is_Runs', 'BATTING_T20Is_HS', 'BATTING_T20Is_Ave',
       'BATTING_T20Is_BF', 'BATTING_T20Is_SR', 'BAT

In [72]:
for i, value in enumerate(matched_rows['Major teams']):
    if isinstance(value, str):
        matched_rows.at[i, 'Major teams'] = [value]

In [81]:
def calc_chemistry(pA, pB):
    points = 0
    if pA['COUNTRY'] == pB['COUNTRY']:
        points+=5
    
    tList_A = pA['Major teams']
    tList_B = pB['Major teams']
    
    print(type(tList_A))
    for i in range(len(tList_A)):
        for j in range(len(tList_B)):
            if tList_A[i] == tList_B[j]:
                points += 2
    if (pA['Batting style'] is not None) and (pB['Batting style'] is not None) and (pA['Batting style'] == pB['Batting style']):
        points+=1
    return points

In [82]:
chem = calc_chemistry(matched_rows.iloc[0], matched_rows.iloc[1])
chem

<class 'str'>


59