In [52]:
import pandas as pd 
data = pd.read_csv('total_data_na.csv')
data.head()

Unnamed: 0,PLAYER,Mat.x,Inns.x,NO,Runs.x,HS,Avg.x,BF,SR.x,X100,...,Ov,Runs.y,Wkts,BBI,Avg.y,Econ,SR.y,X4w,X5w,y
0,Aaron Finch,10,9,1,134,46,16.75,100,134.0,0,...,0.0,0,0,0,0,0.0,0,0,0,0
1,AB de Villiers,12,11,2,480,90,53.33,275,174.54,0,...,0.0,0,0,0,0,0.0,0,0,0,0
2,Abhishek Sharma,3,3,2,63,46,63.0,33,190.9,0,...,0.0,0,0,0,0,0.0,0,0,0,0
3,Ajinkya Rahane,15,14,1,370,65,28.46,313,118.21,0,...,0.0,0,0,0,0,0.0,0,0,0,0
4,Alex Hales,6,6,0,148,45,24.66,118,125.42,0,...,0.0,0,0,0,0,0.0,0,0,0,0


In [53]:
def preprocess_column(column):
    # Remove non-numeric characters using regular expression
    column = column.str.replace(r'[^0-9.]', '')
    # Convert data type to float
    column = pd.to_numeric(column, errors='coerce')  # Set errors='coerce' to handle non-numeric values
    return column


In [54]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import numpy as np

# data['SR.x'] = preprocess_column(data['SR.x'])
# Select relevant features for clustering
features = data[['Runs.x', 'HS', 'Avg.x', 'BF', 'SR.x', 'X100', 'X50', 'Runs.y', 'Wkts', 'BBI', 'Avg.y', 'Econ', 'SR.y']]

features = features.replace('-', np.nan)
# Impute missing values with the mean of the column
imputer = SimpleImputer(strategy='mean')
features = pd.DataFrame(imputer.fit_transform(features), columns=features.columns)


# Scale the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

print(features_scaled)

[[ 0.00943771  0.40323538 -0.03937894 ... -1.08126244 -1.3206071
  -1.1344344 ]
 [ 1.98807763  1.78437691  2.14671767 ... -1.08126244 -1.3206071
  -1.1344344 ]
 [-0.39658378  0.40323538  2.72461691 ... -1.08126244 -1.3206071
  -1.1344344 ]
 ...
 [-0.75685636 -1.04068531 -1.04039365 ... -0.04270068  0.36491249
   0.11820146]
 [-0.75685636 -1.04068531 -1.04039365 ...  0.53372595  2.31419659
  -0.23128002]
 [-0.75685636 -1.04068531 -1.04039365 ...  0.42191906  0.23624687
   0.82894469]]


In [55]:

kmeans = KMeans(n_clusters = 7, random_state=0, n_init= 1000)


kmeans.fit(features_scaled)

centroids = kmeans.cluster_centers_
print("Centroids:")
print(centroids)


labels = kmeans.labels_
print("\nLabels:")
print(labels)


Centroids:
[[ 2.12856926e-01  5.34174769e-01  3.46801745e-01  2.87231462e-01
   5.56035838e-01 -1.60046100e-01  5.53937194e-03 -8.81836435e-01
  -7.56989347e-01  0.00000000e+00 -1.01879256e+00 -1.19525003e+00
  -1.04636282e+00]
 [-7.25975857e-01 -9.65350321e-01 -9.44535284e-01 -7.37308902e-01
  -1.08904849e+00 -1.60046100e-01 -4.89541995e-01  1.35066055e+00
   1.60817827e+00  0.00000000e+00  1.78726656e-01  4.23991449e-01
   3.31934911e-01]
 [ 2.19130832e+00  1.57913734e+00  1.73081962e+00  2.20415868e+00
   7.59147856e-01 -1.60046100e-01  2.49617948e+00 -8.96203829e-01
  -7.86099102e-01  0.00000000e+00 -1.08126244e+00 -1.32060710e+00
  -1.13443440e+00]
 [ 2.57537514e-03  2.29023207e-01  2.25304080e-01 -4.90380239e-02
   9.41632659e-01 -1.60046100e-01 -3.16263517e-01  1.26739498e+00
   1.17517066e+00  0.00000000e+00  3.55828767e-01  4.67630538e-01
   5.06440044e-01]
 [ 2.40124739e+00  2.48279507e+00  1.59107980e+00  2.14174904e+00
   9.38847723e-01  5.56160197e+00  1.76307822e+00 -4.86

In [56]:
data['Cluster'] = labels
# Create the balanced cricket team
top_order_batsmen = data[data['Cluster'].isin([0, 1,2])]
middle_order_batsmen = data[data['Cluster'].isin([3,4,5])]
all_rounders = data[data['Cluster'].isin([1,4,5,6])]
bowlers = data[data['Cluster'].isin([5,6])]

# Select the top players based on the criteria for each role
top_order_batsmen = top_order_batsmen.nlargest(3, ['Runs.x'])
middle_order_batsmen = middle_order_batsmen.nlargest(2, ['Runs.x'])
all_rounders = all_rounders.nlargest(3, ['Runs.x', 'Wkts'])
bowlers = bowlers.nlargest(4, ['Wkts', 'BBI'])

# Create the balanced cricket team

balanced_cricket_team = pd.concat([top_order_batsmen, middle_order_batsmen, all_rounders, bowlers], ignore_index=True)


In [57]:
print("top_order_batsmen:", data.loc[top_order_batsmen.index])
print("middle_order_batsmen:", data.loc[middle_order_batsmen.index])
print("all_rounders:", data.loc[all_rounders.index])
print("bowlers:", data.loc[bowlers.index])

top_order_batsmen:               PLAYER  Mat.x  Inns.x  NO  Runs.x  HS  Avg.x   BF    SR.x  X100  \
42  Kane Williamson      17      17   3     735  84   52.5  516  142.44     0   
48      Lokesh Rahul     14      14   2     659  95  54.91  416  158.41     0   
40       Jos Buttler     13      13   3     548  95   54.8  353  155.24     0   

    ...  Runs.y  Wkts  BBI  Avg.y  Econ  SR.y  X4w  X5w  y Cluster  
42  ...       0     0    0      0   0.0     0    0    0  0       2  
48  ...       0     0    0      0   0.0     0    0    0  0       2  
40  ...       0     0    0      0   0.0     0    0    0  0       2  

[3 rows x 26 columns]
middle_order_batsmen:            PLAYER  Mat.x  Inns.x  NO  Runs.x   HS  Avg.x   BF    SR.x  X100  \
73   Rishabh Pant     14      14   1     684  128  52.61  394  173.60     1   
5   Ambati Rayudu     16      16   2     602  100     43  402  149.75     1   

    ...  Runs.y  Wkts  BBI  Avg.y  Econ  SR.y  X4w  X5w  y Cluster  
73  ...       0     0    0  

In [58]:

team_with_names = data.loc[balanced_cricket_team.index]
team_with_names


Unnamed: 0,PLAYER,Mat.x,Inns.x,NO,Runs.x,HS,Avg.x,BF,SR.x,X100,...,Runs.y,Wkts,BBI,Avg.y,Econ,SR.y,X4w,X5w,y,Cluster
0,Aaron Finch,10,9,1,134,46,16.75,100,134.0,0,...,0,0,0,0.0,0.0,0.0,0,0,0,0
1,AB de Villiers,12,11,2,480,90,53.33,275,174.54,0,...,0,0,0,0.0,0.0,0.0,0,0,0,2
2,Abhishek Sharma,3,3,2,63,46,63.0,33,190.9,0,...,0,0,0,0.0,0.0,0.0,0,0,0,0
3,Ajinkya Rahane,15,14,1,370,65,28.46,313,118.21,0,...,0,0,0,0.0,0.0,0.0,0,0,0,0
4,Alex Hales,6,6,0,148,45,24.66,118,125.42,0,...,0,0,0,0.0,0.0,0.0,0,0,0,0
5,Ambati Rayudu,16,16,2,602,100,43.0,402,149.75,1,...,0,0,0,0.0,0.0,0.0,0,0,0,4
6,Andre Russell,16,14,3,316,88,28.72,171,184.79,0,...,355,13,0,27.3,9.38,17.46,0,0,0,3
7,Andrew Tye,14,8,2,32,14,5.33,38,84.21,0,...,448,24,0,18.66,8.0,14.0,3,0,0,1
8,Axar Patel,9,8,2,80,19,13.33,69,115.94,0,...,218,3,0,72.66,8.38,52.0,0,0,0,6
9,Ben Cutting,9,6,2,96,37,24.0,58,165.51,0,...,168,2,0,84.0,9.88,51.0,0,0,0,6


In [59]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv("total_data_na.csv")
# data['SR.x'] = preprocess_column(data['SR.x'])
# Select relevant features for clustering
features = data[['Runs.x', 'HS', 'Avg.x', 'BF', 'SR.x', 'X100', 'X50', 'Runs.y', 'Wkts', 'BBI', 'Avg.y', 'Econ', 'SR.y']]

deatures = features.replace('-', np.nan, inplace=True)
# Impute missing values with the mean of the column
imputer = SimpleImputer(strategy='mean')
features = pd.DataFrame(imputer.fit_transform(features), columns=features.columns)


# Scale the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Perform K-means clustering
kmeans = KMeans(n_clusters=7, random_state=42, n_init=10)
kmeans.fit(features_scaled)

# Assign cluster labels to the original dataset
data['Cluster'] = kmeans.labels_

# Create the balanced cricket team
top_order_batsmen = data[data['Cluster'].isin([0, 1])]
middle_order_batsmen = data[data['Cluster'].isin([2, 3])]
all_rounders = data[data['Cluster'].isin([4, 5])]
bowlers = data[data['Cluster'].isin([6, 7, 8, 9])]

# Select the top players based on the criteria for each role
top_order_batsmen = top_order_batsmen.nlargest(3, ['Runs.x'])
middle_order_batsmen = middle_order_batsmen.nlargest(2, ['Runs.x'])
all_rounders = all_rounders.nlargest(3, ['Runs.x', 'Wkts'])
bowlers = bowlers.nlargest(4, ['Wkts'])

# Create the balanced cricket team
print("Balanced Cricket Team:")

balanced_cricket_team = pd.concat([top_order_batsmen, middle_order_batsmen, all_rounders, bowlers], ignore_index=True)
print(balanced_cricket_team)

Balanced Cricket Team:
               PLAYER  Mat.x  Inns.x  NO  Runs.x   HS  Avg.x   BF    SR.x  \
0        Rishabh Pant     14      14   1     684  128  52.61  394  173.60   
1       Ambati Rayudu     16      16   2     602  100     43  402  149.75   
2        Shane Watson     15      15   1     555  117  39.64  359  154.59   
3    Kane Williamson      17      17   3     735   84   52.5  516  142.44   
4        Lokesh Rahul     14      14   2     659   95  54.91  416  158.41   
5      Ajinkya Rahane     15      14   1     370   65  28.46  313  118.21   
6       Robin Uthappa     16      16   0     351   54  21.93  265  132.45   
7          Nitish Ra0     15      15   2     304   59  23.38  232  131.03   
8         Shivam Mavi      9       4   1      13    7   4.33   15   86.66   
9         Tim Southee      8       4   2      52   36     26   46  113.04   
10  Washington Sundar      7       6   3      65   35  21.66   38  171.05   
11         Axar Patel      9       8   2      80   19

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deatures = features.replace('-', np.nan, inplace=True)


In [60]:
print("top_order_batsmen:", top_order_batsmen)
print("middle_order_batsmen:", middle_order_batsmen)
print("all_rounders:", all_rounders)
print("bowlers:", bowlers)

top_order_batsmen:            PLAYER  Mat.x  Inns.x  NO  Runs.x   HS  Avg.x   BF    SR.x  X100  \
73   Rishabh Pant     14      14   1     684  128  52.61  394  173.60     1   
5   Ambati Rayudu     16      16   2     602  100     43  402  149.75     1   
80   Shane Watson     15      15   1     555  117  39.64  359  154.59     2   

    ...  Runs.y  Wkts  BBI  Avg.y  Econ  SR.y  X4w  X5w  y Cluster  
73  ...       0     0    0      0  0.00     0    0    0  0       1  
5   ...       0     0    0      0  0.00     0    0    0  0       1  
80  ...     251     6    0  41.83  8.96    28    0    0  0       1  

[3 rows x 26 columns]
middle_order_batsmen:               PLAYER  Mat.x  Inns.x  NO  Runs.x  HS  Avg.x   BF    SR.x  X100  \
42  Kane Williamson      17      17   3     735  84   52.5  516  142.44     0   
48      Lokesh Rahul     14      14   2     659  95  54.91  416  158.41     0   

    ...  Runs.y  Wkts  BBI  Avg.y  Econ  SR.y  X4w  X5w  y Cluster  
42  ...       0     0    0    