### Workflow Stages:
1. Goals
2. EDA
3. Models

### Goals:
1. Analyze the data set.
2. Discover knowledge about the data set.
3. Build a good-quality classifier for shot selection.

## 1. Analyze data set

In [None]:
# Loading Libraries

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings(action = 'ignore')

In [None]:
# Load the raw shot log and let pandas display every column of wide frames.

DATA_PATH = '../input/kobe-bryant-shot-selection/data.csv.zip'

pd.options.display.max_columns = None
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

In [None]:
# Dataset dimensions as (rows, columns)

n_rows, n_cols = df.shape
n_rows, n_cols

In [None]:
# Per-column dtype next to the number of missing values in that column
column_dtypes = df.dtypes.rename("featurs")
missing_counts = df.isnull().sum().rename("is null")
pd.concat([column_dtypes, missing_counts], axis = 1)

In [None]:
# Split dataset: rows without a label are the shots to predict, the rest are training data.
# Explicit copies make both frames independent of `df`, so later column
# assignments do not operate on a slice (pandas' SettingWithCopyWarning case).

is_labelled = df['shot_made_flag'].notna()
Shots_to_predict = df[~is_labelled].copy()
Shots_to_train = df[is_labelled].copy()

In [None]:
# Summary statistics of the numeric features in the training rows

numeric_summary = Shots_to_train.describe()
numeric_summary

In [None]:
# Summary of the categorical (object-typed) features in the training rows

train_categoricals = Shots_to_train.select_dtypes(include = 'object')
train_categoricals.describe()

In [None]:
# Same categorical summary for the rows that have to be predicted
predict_categoricals = Shots_to_predict.select_dtypes(include = 'object')
predict_categoricals.describe()

In [None]:
# Distribution of the shot_made_flag values, as fractions of all labelled shots

made_flag_share = Shots_to_train['shot_made_flag'].value_counts(normalize = True)
made_flag_share

In [None]:
# Mean, median and skewness of the numeric columns only: the text columns
# cannot be averaged and make `agg` raise a TypeError on recent pandas versions.
Shots_to_train.select_dtypes(include = 'number').agg(['mean','median','skew'])

## 2. Discovering knowledge about data set

In [None]:
# Kobe's shot percentage against each opponent

# `height` is the current name of FacetGrid's deprecated `size` argument.
f = sns.FacetGrid(Shots_to_train, height = 8)
f.map(sns.barplot,'shot_made_flag','opponent',ci = None, color = 'green', alpha = 0.2);

In [None]:
# Find out whether playing in front of LA fans mattered for Kobe's efficiency

# Derive the home/away label on a separate frame. Overwriting `matchup` in place
# made this cell non-idempotent: on a second run no value starts with 'LAL vs'
# any more, so every game was labelled 'away'.
is_home_game = Shots_to_train['matchup'].str.startswith('LAL vs', na = False)
venue_shots = Shots_to_train.assign(matchup = np.where(is_home_game, 'home', 'away'))

# `height` is the current name of the deprecated `size` argument; an explicit
# `order` keeps each bar at the same position in every facet.
g = sns.FacetGrid(venue_shots,row = 'shot_type', col = 'playoffs' ,height = 4)
g.map(sns.barplot,'matchup','shot_made_flag', order = ['home','away'], ci =None, alpha = 0.3);

In [None]:
# Percentage of made shots in each period
# `height` is the current name of FacetGrid's deprecated `size` argument.
period = sns.FacetGrid(Shots_to_train, height = 5)
period.map(sns.barplot, 'period','shot_made_flag',ci = None, color = [0.2,0.3,0.6]);

In [None]:
# Kobe's shot percentage by court area, one facet per value of the `playoffs` flag

# Fix the category order so every facet lists the zones in the same sequence;
# `height` is the current name of the deprecated `size` argument.
zone_order = sorted(Shots_to_train['shot_zone_area'].dropna().unique())
s = sns.FacetGrid(Shots_to_train, row = 'playoffs', height= 4)
s.map(sns.barplot, "shot_made_flag",'shot_zone_area', order = zone_order, ci = None, color = [0.2,0.45,0.23]);

In [None]:
# Kobe's shot percentage over the seasons
# `height` is the current name of FacetGrid's deprecated `size` argument.
seasons = sns.FacetGrid(Shots_to_train, height = 5)
seasons.map(sns.barplot, 'shot_made_flag','season', ci = None);

In [None]:
# Success rate and number of attempts per action type, most frequent first
made_rate_by_action = Shots_to_train[['action_type','shot_made_flag']].groupby('action_type').mean()
attempts_per_action = Shots_to_train['action_type'].value_counts()
(
    made_rate_by_action
    .assign(number_of_occurance = attempts_per_action)
    .sort_values(by = 'number_of_occurance', ascending = False)
)

In [None]:
# The training rows contain 6 more unique action types than the rows to predict.
# Keep only the training rows whose action type also occurs in the prediction set.

actions_to_predict = Shots_to_predict['action_type'].unique()
seen_in_prediction = Shots_to_train['action_type'].isin(actions_to_predict)
Shots_to_train = Shots_to_train[seen_in_prediction]
Shots_to_train.shape

### Conclusions for analysis:

1. To avoid overfitting, I chose a few features that I believe have the most impact on shot success.
2. Action type seems to be a feature that strongly determines whether an action ends with points.
3. The 2-point field goal percentage is around 15 percentage points higher than the 3-point field goal percentage.

## 3. Build a good-quality classifier for shot selection

In [None]:
def create_dataset(in_data):
    """Build the model feature matrix from a frame of shot records.

    The result holds the numeric columns ``shot_distance`` and
    ``minutes_remaining`` followed by one-hot encodings of ``action_type``,
    ``shot_type``, ``shot_zone_area``, ``playoffs`` and ``period``
    (in that order).

    Parameters
    ----------
    in_data : pandas.DataFrame
        Shot records containing the seven columns named above.

    Returns
    -------
    pandas.DataFrame
        Feature frame with the same index as ``in_data``. The dummy columns
        depend on the category values present in ``in_data``, so frames with
        different categories produce different column sets.

    Notes
    -----
    ``in_data`` is not modified. A ``time_remaining`` column used to be written
    into the caller's frame here, but it was never part of the returned
    features, so that side effect has been removed.
    """
    # (source column, dummy prefix) pairs. get_dummies joins prefix and value
    # with "_", so an action type becomes e.g. "__Jump Shot".
    encoded_columns = [
        ('action_type', '_'),
        ('shot_type', 'shot_type'),
        ('shot_zone_area', 'zone'),
        ('playoffs', 'offs'),
        ('period', 'Q'),
    ]

    pieces = [in_data[['shot_distance','minutes_remaining']]]
    pieces.extend(
        pd.get_dummies(in_data[column], prefix = prefix)
        for column, prefix in encoded_columns
    )

    return pd.concat(pieces, axis = 1)


In [None]:
X = create_dataset(Shots_to_train)
# 1-D target: scikit-learn estimators expect shape (n_samples,), not a one-column frame.
y = Shots_to_train['shot_made_flag']
# Give the submission features exactly the training columns, in the same order:
# dummy columns that never occur in training are dropped and any training column
# missing here is added as all zeros. This replaces dropping one hard-coded column.
X_sub = create_dataset(Shots_to_predict).reindex(columns = X.columns, fill_value = 0)

In [None]:
# Hold out 30% of the labelled shots, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 1,
    stratify = y,
)

In [None]:
# Logistic regression on min-max scaled features
Pipe_lr = Pipeline(steps = [("scaler", MinMaxScaler(feature_range = (0, 1))),
                            ("lr", LogisticRegression())])

# Mean score over 10 cross-validation folds of the training split
lr_score = cross_val_score(Pipe_lr, X_train, y_train, cv = 10)
lr_score = round(lr_score.mean(), 4)

# Score on the held-out split; this also leaves Pipe_lr fitted on X_train
Pipe_lr.fit(X_train, y_train)
lr_score_test = round(Pipe_lr.score(X_test, y_test), 4)

In [None]:
# Decision Tree
# A fixed seed makes the reported scores reproducible between runs
# (same seed value as the train/test split).
tree = DecisionTreeClassifier(random_state = 1)


# Mean score over 10 cross-validation folds of the training split
tree_score = cross_val_score(estimator=tree,
                            X= X_train,
                            y = y_train,
                            cv = 10)

tree_score = round(tree_score.mean(),ndigits=4)
# Score on the held-out split; this also leaves `tree` fitted on X_train
tree_score_test = round(tree.fit(X_train,y_train).score(X_test,y_test), ndigits=4)

In [None]:
# Random forest
# A fixed seed makes the reported scores reproducible between runs
# (same seed value as the train/test split).
forest = RandomForestClassifier(random_state = 1)
# Mean score over 10 cross-validation folds of the training split
forest_score = cross_val_score(estimator=forest,
                              X= X_train,
                              y = y_train,
                              cv = 10)

forest_score = round(forest_score.mean(),ndigits=4)
# Score on the held-out split; this also leaves `forest` fitted on X_train
forest_score_test = round(forest.fit(X_train,y_train).score(X_test,y_test), ndigits=4)

In [None]:
# Gradient-boosted trees (XGBoost) on min-max scaled features
pipe_xgb = Pipeline(steps = [("scaler", MinMaxScaler(feature_range = (0, 1))),
                             ("xgb", xgb.XGBClassifier())])

# Mean score over 10 cross-validation folds of the training split
xgb_score = cross_val_score(pipe_xgb, X_train, y_train, cv = 10)
xgb_score = round(xgb_score.mean(), 4)

# Score on the held-out split; this also leaves pipe_xgb fitted on X_train
pipe_xgb.fit(X_train, y_train)
xgb_score_test = round(pipe_xgb.score(X_test, y_test), 4)

In [None]:
# Soft-voting ensemble of the logistic-regression and XGBoost pipelines
ensemble_members = [('lr', Pipe_lr), ('xgb', pipe_xgb)]
clf1 = VotingClassifier(estimators = ensemble_members, voting = 'soft')

# Mean score over 10 cross-validation folds of the training split
clf1_score = cross_val_score(clf1, X_train, y_train, cv = 10)
clf1_score = round(clf1_score.mean(), 4)

# Score on the held-out split; this also leaves clf1 fitted on X_train
clf1.fit(X_train, y_train)
clf1_score_test = round(clf1.score(X_test, y_test), 4)

In [None]:
# Summary of all model results, best cross-validation score first
model_names = ['Logisti Regresion', 'Decision Tree','Random Forest','XGB','Clf1']
cv_scores = [lr_score, tree_score, forest_score, xgb_score, clf1_score]
test_scores = [lr_score_test, tree_score_test, forest_score_test, xgb_score_test, clf1_score_test]

models = pd.DataFrame({
    'Model': model_names,
    'Cross Validation Score': cv_scores,
    'Test set Score': test_scores,
})
models.sort_values(by = 'Cross Validation Score', ascending = False)

In [None]:
## Hyperparameter optimization

# Logistic Regression
# The default solver of recent scikit-learn releases (lbfgs) rejects the l1
# penalty, so every l1 candidate would fail to fit. liblinear supports both
# penalties, so it is set explicitly.
random_param = {"lr__solver":["liblinear"],
                "lr__penalty":["l1","l2"],
                "lr__C":np.logspace(-4,4,50)
               }

model = Pipe_lr
# The space above has only 2 * 50 = 100 distinct combinations. Asking for more
# iterations than that only triggers a warning and tries each combination once,
# so request exactly 100. The seed fixes the sampling order between runs.
random_search = RandomizedSearchCV(estimator = model,
                                   param_distributions=random_param,
                                   n_iter = 100,
                                   scoring = 'accuracy',
                                   cv = 10,
                                   random_state = 1
                                  )

random_search.fit(X_train,y_train)
best_params = random_search.best_params_
print(best_params)

In [None]:
# Test-set score of the tuned model (the score did not improve)
tuned_test_score = random_search.score(X_test, y_test)
tuned_test_score

In [None]:
# Create Submission

# Refit the chosen pipeline on every labelled shot. Until this point Pipe_lr had
# only been fitted on the 70% training split, as a side effect of computing its
# test score. np.ravel gives the 1-D target scikit-learn expects.
Pipe_lr.fit(X, np.ravel(y))

# Make sure the submission features carry exactly the training columns, in order.
X_sub_aligned = X_sub.reindex(columns = X.columns, fill_value = 0)

# Column 1 of predict_proba is the probability of the second (larger) class label.
prob = Pipe_lr.predict_proba(X_sub_aligned)[:,1]

# The id header reuses the source column's own name, 'shot_id'.
# NOTE(review): confirm against the competition's sample submission file.
submission = pd.DataFrame({'shot_id':Shots_to_predict['shot_id'],
                           'shot_made_flag':prob})

submission.to_csv("Submission_Kobe.csv",index = False)