In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
#df = pd.read_csv()
import os
# Lazily build the full path of every file found under the input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

##finding available datasets

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the matches table from the Kaggle input directory
df = pd.read_csv('/kaggle/input/ipl-complete-dataset-20082020/matches.csv')

# Preview the first rows. A bare last expression is rendered by Jupyter as a
# rich table, whereas print() falls back to a wrapped plain-text dump.
df.head()

In [None]:
# List every column name in the dataset
df.columns.tolist()

In [None]:
# Print a summary of the frame: each column's dtype and non-null count
df.info()

In [None]:
# Summary statistics using describe()'s default column selection (numeric columns)
df.describe()

In [None]:
# Summary statistics for the object-dtype (text) columns
df.describe(include='object')

In [None]:
# Dataset dimensions as a (number of rows, number of columns) tuple
df.shape

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Share of missing values in each column, expressed as a percentage
df.isna().sum().div(len(df)).mul(100)

In [None]:
# Remove columns that are not needed for the analysis
unneeded_cols = ['match_type', 'target_overs', 'method']
df.drop(columns=unneeded_cols, inplace=True)
df.head()

In [None]:
# Remove a second batch of columns from the working frame
more_unneeded_cols = ['date', 'umpire1', 'umpire2', 'target_runs', 'result_margin']
df.drop(columns=more_unneeded_cols, inplace=True)
df.head()

In [None]:
# Encode the categorical columns as integer codes
from sklearn import preprocessing

categorical_cols = ['toss_decision', 'city', 'player_of_match', 'venue',
                    'team1', 'team2', 'toss_winner', 'winner', 'result', 'super_over']

# These four columns all hold team names. They must share ONE encoder so a
# given team gets the same code in every column; encoding them independently
# only lines up by accident (when each column contains the same set of teams).
team_cols = ['team1', 'team2', 'toss_winner', 'winner']

label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(pd.concat([df[col] for col in team_cols], ignore_index=True))

# Keep the fitted encoder of every column so the integer codes can be decoded
# later with encoders[col].inverse_transform(...). Re-fitting a single encoder
# inside the loop would throw each column's mapping away.
# NOTE(review): some of these columns may contain missing values; how NaN is
# encoded depends on the installed scikit-learn version — confirm.
encoders = {}
for col in categorical_cols:
    if col in team_cols:
        encoders[col] = label_encoder  # shared encoder, already fitted above
        df[col] = label_encoder.transform(df[col])
    else:
        encoders[col] = preprocessing.LabelEncoder()
        df[col] = encoders[col].fit_transform(df[col])

df.head()

In [None]:
# Re-check the per-column missing-value counts after encoding
df.isna().sum()

In [None]:
# For every toss-winning team: the fraction of its toss wins in which it also
# won the match. The previous `.value_counts(normalize=True).get(1, 0)` kept
# only the group for toss_winner == 1, so every team was mapped through that
# single team's winner distribution.
# NOTE(review): this compares integer codes across 'winner' and 'toss_winner';
# it is only meaningful if both columns use the same team-to-code mapping —
# confirm against the encoding step.
toss_win_impact = (
    df['winner'].eq(df['toss_winner'])
    .groupby(df['toss_winner'])
    .mean()
)
df['toss_win_impact'] = df['toss_winner'].map(toss_win_impact)
df.head()

In [None]:
# Re-check the per-column missing-value counts after adding toss_win_impact
df.isna().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing toss_winner entries with the column's most frequent value
imputer_toss = SimpleImputer(strategy='most_frequent')
toss_winner_filled = imputer_toss.fit_transform(df[['toss_winner']])
df['toss_winner'] = toss_winner_filled

# Number of missing toss_winner values left after imputation
remaining_missing = df['toss_winner'].isnull().sum()
print(remaining_missing)

In [None]:
from sklearn import preprocessing

# Replace each distinct toss_win_impact value with an integer code
label_encoder = preprocessing.LabelEncoder()
toss_win_impact_codes = label_encoder.fit_transform(df['toss_win_impact'])
df['toss_win_impact'] = toss_win_impact_codes
df.head()

In [None]:
# Separate the model inputs (X) from the target variable (y)
# NOTE(review): 'result' looks like a post-match outcome field — confirm it is
# legitimately known at prediction time, otherwise it leaks the target.
feature_cols = ['team1', 'team2', 'toss_winner', 'toss_decision', 'result']
X = df.loc[:, feature_cols]
y = df['winner']  # target variable

In [None]:
# Standardise the input features
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the transform — consider fitting on the
# training rows only.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)
X1 = scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the remaining 75% are used for training
X1_train, X1_test, y_train, y_test = train_test_split(
    X1,
    y,
    test_size=0.25,
    random_state=0,
)

# Report the size of the full frame and of each feature split
for label, shape in (
    ('Whole Data Shape', df.shape),
    ('X1 train shape', X1_train.shape),
    ('x1 test shape', X1_test.shape),
):
    print(label, shape)

In [None]:
# RandomForestClassifier must be imported here: this cell runs before the cell
# that trains the final model, so relying on that later import raises a
# NameError on a fresh "Restart & Run All".
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values; every combination is evaluated
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated search over the grid on all available cores.
# The forest is seeded so the search result is reproducible between runs.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X1_train, y_train)

print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Best Cross-validation Accuracy: {grid_search.best_score_}")

In [None]:
# Train a random forest classifier on the training split,
# predict the winner codes of the held-out test split,
# and report the test accuracy.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# fit() returns the fitted estimator, so construction and training can be chained
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X1_train, y_train)

y_pred = model.predict(X1_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Accuracy:", test_accuracy)

In [None]:
# Display the model's predicted winner codes for the test split
y_pred

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Take the label order from the fitted random forest (`model`). The name
# `logreg` used here before is never defined in this notebook and raised a
# NameError.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)

disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the test split
report = classification_report(y_test, y_pred)
print(report)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import label_binarize

# Plot ROC curves for the fitted random forest (`model`); the name `logreg`
# used here before is never defined in this notebook.
# RocCurveDisplay.from_estimator only accepts binary classifiers, so when the
# target has more than two classes draw one one-vs-rest curve per class.
if len(model.classes_) == 2:
    RocCurveDisplay.from_estimator(model, X1_test, y_test)
else:
    y_score = model.predict_proba(X1_test)  # columns follow model.classes_
    y_test_onehot = label_binarize(y_test, classes=model.classes_)

    fig, ax = plt.subplots(figsize=(10, 8))
    for class_idx, class_label in enumerate(model.classes_):
        positives = y_test_onehot[:, class_idx].sum()
        # A ROC curve needs both positive and negative samples for the class
        if positives == 0 or positives == len(y_test_onehot):
            continue
        RocCurveDisplay.from_predictions(
            y_test_onehot[:, class_idx],
            y_score[:, class_idx],
            name=f"class {class_label}",
            ax=ax,
        )
    ax.set_title('One-vs-rest ROC curves')
plt.show()

In [None]:
# Build the lookup between winner codes and team names.
# By this point df's team columns hold integer codes, so fitting an encoder on
# them only maps integers to themselves and the team names cannot be recovered.
# Re-read the raw 'winner' labels instead and pair them row by row with the
# encoded column (rows of df are never filtered or reordered above, only
# columns are dropped or overwritten).
raw_winner = pd.read_csv(
    '/kaggle/input/ipl-complete-dataset-20082020/matches.csv',
    usecols=['winner'],
)['winner'].fillna('No result')  # label for matches without a recorded winner

assert len(raw_winner) == len(df), "raw and encoded frames are no longer row-aligned"

# code -> team name, and team name -> code
reverse_mapping = dict(zip(df['winner'], raw_winner))
team_mapping = {name: code for code, name in reverse_mapping.items()}

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Count matches per winner code, then relabel the codes through reverse_mapping
team_wins = df['winner'].value_counts()
team_wins.index = team_wins.index.map(reverse_mapping)

# Bar chart of the win counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=team_wins.index, y=team_wins.values, ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Win counts by team')
ax.set_xlabel('Teams')
ax.set_ylabel('Number of Wins')
plt.show()