### In this problem, we are given a dataset that contains goal scoring statistics from Cristiano Ronaldo's career during the period 1996-2016.
### Our aim is to find the relationship between different variables and the target and also to predict whether a given shot will be a GOAL or NOT.  

In [7]:
import pandas as pd
import numpy as np
import warnings
import random
warnings.simplefilter(action='ignore')
df=pd.read_csv(r'C:\Machine learning\other datasets\ronaldo_data.csv')

#X_cols=['location_x','location_y','power_of_shot','match_id','area_of_shot','game_season','distance_of_shot','remaining_min','lat/lng','shot_basics','range_of_shot','type_of_shot']
X_cols=['location_x','location_y','area_of_shot','distance_of_shot','remaining_min','shot_basics','range_of_shot','type_of_shot']
cols_to_encode=['shot_basics','range_of_shot','type_of_shot','area_of_shot']
cols_to_scale_min=['distance_of_shot']
cols_to_scale_sc=['location_x','location_y']

X=df.loc[:,X_cols]

#preprocessing

# filling up missing values in area_of_shot
X.loc[(X.shot_basics == 'Goal Area')|(X.shot_basics == 'Goal Line'),'area_of_shot']=\
X.loc[(X.shot_basics == 'Goal Area')|(X.shot_basics == 'Goal Line'),'area_of_shot'].fillna(value='Center(C)')

X.loc[X.shot_basics == 'Left Corner','area_of_shot']=\
X.loc[X.shot_basics == 'Left Corner','area_of_shot'].fillna(value='Left Side(L)')

X.loc[X.shot_basics == 'Right Corner','area_of_shot']=\
X.loc[X.shot_basics == 'Right Corner','area_of_shot'].fillna(value='Right Side(R)')

p_total=df.loc[df.shot_basics == 'Mid Range','area_of_shot'].value_counts().sum()
p=df.loc[df.shot_basics == 'Mid Range','area_of_shot'].value_counts().to_list()/p_total
X.loc[X.shot_basics == 'Mid Range','area_of_shot']=\
X.loc[X.shot_basics == 'Mid Range','area_of_shot'].apply(lambda x:x if type(x) == str else\
                                                          np.random.choice(['Center(C)','Left Side(L)','Right Side Center(RC)',\
                                                                               'Left Side Center(LC)','Right Side(R)'],1,p=p)[0])

X.loc[X.shot_basics == 'Mid Ground Line','area_of_shot']=\
X.loc[X.shot_basics == 'Mid Ground Line','area_of_shot'].fillna(value='Mid Ground(MG)')

p_total=df.loc[df.shot_basics == 'Penalty Spot','area_of_shot'].value_counts().sum()
p=df.loc[df.shot_basics == 'Penalty Spot','area_of_shot'].value_counts().to_list()/p_total
X.loc[X.shot_basics == 'Penalty Spot','area_of_shot']=\
X.loc[X.shot_basics == 'Penalty Spot','area_of_shot'].apply(lambda x: x if type(x) == str else\
                                                             np.random.choice(['Center(C)','Right Side Center(RC)',\
                                                                               'Left Side Center(LC)','Mid Ground(MG)'],1,p=p)[0])

# replacing empty values for location coordinates with 0
X['location_x'].fillna(value=0,inplace=True)
X['location_y'].fillna(value=0,inplace=True)

# filling type of shot and remaining minutes with respective mode values
#X['power_of_shot'].fillna(value=X['power_of_shot'].mean(),inplace=True)
X['type_of_shot'].fillna(value=X['type_of_shot'].mode()[0],inplace=True)
X['remaining_min'].fillna(value=X['remaining_min'].mode()[0],inplace=True)

# filling up missing values in distance_of_shot
p=0
p=X.loc[X.range_of_shot =='Less Than 8 ft.','distance_of_shot'].value_counts().head(8).sum()
p_val=X.loc[X.range_of_shot =='Less Than 8 ft.','distance_of_shot'].value_counts().head(8).to_list()/p
X.loc[X.range_of_shot =='Less Than 8 ft.','distance_of_shot']=\
X.loc[X.range_of_shot =='Less Than 8 ft.','distance_of_shot'].apply(lambda x:x if type(x) is int\
else np.random.choice([20,27,26,21,22,25,23,24],1,p=p_val)[0])

p=X.loc[X.range_of_shot =='8-16 ft.','distance_of_shot'].value_counts().head(8).sum()
p_val=X.loc[X.range_of_shot =='8-16 ft.','distance_of_shot'].value_counts().head(8).to_list()/p
X.loc[X.range_of_shot =='8-16 ft.','distance_of_shot']=\
X.loc[X.range_of_shot =='8-16 ft.','distance_of_shot'].apply(lambda x:x if type(x) is int\
else np.random.choice([35,34,33,32,30,31,28,29],1,p=p_val)[0])

p=X.loc[X.range_of_shot =='16-24 ft.','distance_of_shot'].value_counts().head(8).sum()
p_val=X.loc[X.range_of_shot =='16-24 ft.','distance_of_shot'].value_counts().head(8).to_list()/p
X.loc[X.range_of_shot =='16-24 ft.','distance_of_shot']=\
X.loc[X.range_of_shot =='16-24 ft.','distance_of_shot'].apply(lambda x:x if type(x) is int\
else np.random.choice([37,36,38,39,40,41,42,43],1,p=p_val)[0])

p=X.loc[X.range_of_shot =='24+ ft.','distance_of_shot'].value_counts().head(6).sum()
p_val=X.loc[X.range_of_shot =='24+ ft.','distance_of_shot'].value_counts().head(6).to_list()/p
X.loc[X.range_of_shot =='24+ ft.','distance_of_shot']=\
X.loc[X.range_of_shot =='24+ ft.','distance_of_shot'].apply(lambda x:x if type(x) is int\
else np.random.choice([45,44,46,47,43,48],1,p=p_val)[0])

X.loc[X.range_of_shot =='Back Court Shot','distance_of_shot']=\
X.loc[X.range_of_shot =='Back Court Shot','distance_of_shot'].fillna(random.choice([78,94,82,87,85]))

# conditional filling for range_of_shot
X.loc[(X.shot_basics == 'Goal Area') | (X.shot_basics == 'Goal Line'), 'range_of_shot'] =\
X.loc[(X.shot_basics == 'Goal Area') | (X.shot_basics == 'Goal Line'), 'range_of_shot'].fillna(value='Less Than 8 ft.')

X.loc[(X.shot_basics == 'Mid Range'), 'range_of_shot'] =\
X.loc[(X.shot_basics == 'Mid Range'), 'range_of_shot'].fillna(value=random.choice(['8-16 ft.','16-24 ft.']))

X.loc[(X.shot_basics == 'Penalty Spot')|(X.shot_basics == 'Left Corner')|(X.shot_basics == 'Right Corner'), 'range_of_shot'] =\
X.loc[(X.shot_basics == 'Penalty Spot')|(X.shot_basics == 'Left Corner')|(X.shot_basics == 'Right Corner'), 'range_of_shot'].\
fillna(value='24+ ft.')

X.loc[(X.shot_basics == 'Mid Ground Line'), 'range_of_shot'] =\
X.loc[(X.shot_basics == 'Mid Ground Line'), 'range_of_shot'].fillna(value='Back Court Shot')

# filling empty values for shot basics
X.loc[X.range_of_shot=='Less Than 8 ft.','shot_basics']=X.loc[X.range_of_shot=='Less Than 8 ft.','shot_basics']\
.fillna(value=random.choice(['Goal Area','Goal Line']))

X.loc[X.range_of_shot=='24+ ft.','shot_basics']=X.loc[X.range_of_shot=='24+ ft.','shot_basics']\
.fillna(value=random.choice(['Penalty Spot','Left Corner','Right Corner']))

X.loc[(X.range_of_shot=='8-16 ft.')|(X.range_of_shot=='16-24 ft.'),'shot_basics']\
=X.loc[(X.range_of_shot=='8-16 ft.')|(X.range_of_shot=='16-24 ft.'),'shot_basics']\
.fillna(value='Mid Range')

# using forward fill in case there are still any missing values.
X['shot_basics'].fillna(method='ffill',inplace=True)
X['range_of_shot'].fillna(method='ffill',inplace=True)
X['distance_of_shot'].fillna(method='ffill',inplace=True)
X['area_of_shot'].fillna(method='ffill',inplace=True)

# end of preprocessing


# setting target variable y and dropping rows where y has missing values
y=df.loc[:,'is_goal']
y=y.dropna()
dropped_rows=df[~df.index.isin(y.index)].values[:,0]
X=X.drop(dropped_rows)
X.reset_index(drop=True,inplace=True)

#X.isna().sum()

# encoding the categorical variables
from sklearn.preprocessing import MinMaxScaler,LabelEncoder,StandardScaler
for i in cols_to_encode:
    le=LabelEncoder()
    X[i]=le.fit_transform(X[i])

# splitting of data to train and test set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.2,random_state=21)


# standardize all values in X before training
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)


#disc_features=[X.dtypes == int]
# from sklearn.feature_selection import mutual_info_classif
# def mutual_info(X,y):
#     mi=mutual_info_classif(X,y)
#     mi=pd.Series(mi,name='miscore',index=X.columns)
#     mi=mi.sort_values(ascending=False)
#     return mi
# mi_sc=mutual_info(X,y)
# print(mi_sc)


#from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
#from sklearn.svm import SVC
#model=RandomForestClassifier(n_estimators=50,criterion='entropy',random_state=0)
#model=LogisticRegression(random_state=1)
model=DecisionTreeClassifier(criterion='entropy',splitter='best',min_samples_leaf=4,max_features=6,max_leaf_nodes=5)
#model=SVC(kernel='rbf',C=2,gamma=0.01,random_state=2)
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

from sklearn.metrics import accuracy_score,confusion_matrix
print(confusion_matrix(y_test,y_pred),end='\n')
print(accuracy_score(y_test,y_pred))


[[2397  351]
 [1511  627]]
0.6189111747851003
