In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import seaborn as sns
from sklearn.model_selection import train_test_split

## Kobe Bryant  shot selection

Cсылка на соревнование: https://www.kaggle.com/c/kobe-bryant-shot-selection

Goal: Fun and education

Using 20 years of data on Kobe's swishes and misses, can you predict which shots will find the bottom of the net? This competition is well suited for practicing classification basics, feature engineering, and time series analysis. Practice got Kobe an eight-figure contract and 5 championship rings. What will it get you?

This data contains the location and circumstances of every field goal attempted by Kobe Bryant took during his 20-year career. Your task is to predict whether the basket went in (shot_made_flag).

We have removed 5000 of the shot_made_flags (represented as missing values in the csv file). These are the test set shots for which you must submit a prediction. You are provided a sample submission file with the correct shot_ids needed for a valid prediction.

In [2]:
data = pd.read_csv('kobe.csv')

In [3]:
data.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,...,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5


In [4]:
target = 'shot_made_flag'

**Задания:**

1. Провести анализ данных. Много хороших примеров анализа можно посмотреть здесь https://www.kaggle.com/c/kobe-bryant-shot-selection/kernels
2. Подготовить фичи для обучения модели - нагенерить признаков, обработать пропущенные значения, проверить на возможные выбросы, обработать категориальные признаки и др.
3. Обучить линейную модель, Lasso, Ridge (с встроеной регуляризацией) на тех же признаках - построить сравнительную таблицу или график коэффициентов, сделать заключения о том, как меняется величина коэффициентов, какие зануляются. 

**Дополнительно**
4. Сравнить результаты на тестовом наборе данных - сделать train_test_split в самом начале, подготовить переменные, сравнить результаты работы классификаторов (те же 3), метрика ROC AUC
5. Построить PCA на подготовленных признаках, посмотреть, какие компоненты составляют наибольшую часть дисперсии целевой переменной

Решение

Пункты 1 и 2 - анализирую, чищу и дорабатываю данные.

In [5]:
df = data[data.shot_made_flag.notna()] # уберем данные без целевых значений (флагов)
df.reset_index(inplace=True)
# теперь все значения не null
data.describe()

Unnamed: 0,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,seconds_remaining,shot_distance,shot_made_flag,team_id,shot_id
count,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,25697.0,30697.0,30697.0
mean,249.1908,24764070.0,33.953192,7.110499,91.107535,-118.26269,4.885624,2.519432,0.146562,28.365085,13.437437,0.446161,1610613000.0,15349.0
std,150.003712,7755175.0,0.087791,110.124578,87.791361,0.110125,3.449897,1.153665,0.353674,17.478949,9.374189,0.497103,0.0,8861.604943
min,2.0,20000010.0,33.2533,-250.0,-44.0,-118.5198,0.0,1.0,0.0,0.0,0.0,0.0,1610613000.0,1.0
25%,110.0,20500080.0,33.8843,-68.0,4.0,-118.3378,2.0,1.0,0.0,13.0,5.0,0.0,1610613000.0,7675.0
50%,253.0,20900350.0,33.9703,0.0,74.0,-118.2698,5.0,3.0,0.0,28.0,15.0,0.0,1610613000.0,15349.0
75%,368.0,29600470.0,34.0403,95.0,160.0,-118.1748,8.0,3.0,0.0,43.0,21.0,1.0,1610613000.0,23023.0
max,659.0,49900090.0,34.0883,248.0,791.0,-118.0218,11.0,7.0,1.0,59.0,79.0,1.0,1610613000.0,30697.0


In [6]:
df = data[data.shot_made_flag.notna()] # уберем данные без целевых значений (флагов)
df.reset_index(inplace=True)
# теперь все значения не null
df.describe()


Unnamed: 0,index,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,seconds_remaining,shot_distance,shot_made_flag,team_id,shot_id
count,25697.0,25697.0,25697.0,25697.0,25697.0,25697.0,25697.0,25697.0,25697.0,25697.0,25697.0,25697.0,25697.0,25697.0,25697.0
mean,15327.166946,249.348679,24741090.0,33.953043,7.148422,91.257345,-118.262652,4.886796,2.5208,0.146243,28.311554,13.457096,0.446161,1610613000.0,15328.166946
std,8860.462397,149.77852,7738108.0,0.088152,110.073147,88.152106,0.110073,3.452475,1.151626,0.353356,17.523392,9.388725,0.497103,0.0,8860.462397
min,1.0,2.0,20000010.0,33.2533,-250.0,-44.0,-118.5198,0.0,1.0,0.0,0.0,0.0,0.0,1610613000.0,2.0
25%,7645.0,111.0,20500060.0,33.8843,-67.0,4.0,-118.3368,2.0,1.0,0.0,13.0,5.0,0.0,1610613000.0,7646.0
50%,15335.0,253.0,20900340.0,33.9703,0.0,74.0,-118.2698,5.0,3.0,0.0,28.0,15.0,0.0,1610613000.0,15336.0
75%,22975.0,367.0,29600270.0,34.0403,94.0,160.0,-118.1758,8.0,3.0,0.0,43.0,21.0,1.0,1610613000.0,22976.0
max,30696.0,653.0,49900090.0,34.0883,248.0,791.0,-118.0218,11.0,7.0,1.0,59.0,79.0,1.0,1610613000.0,30697.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25697 entries, 0 to 25696
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               25697 non-null  int64  
 1   action_type         25697 non-null  object 
 2   combined_shot_type  25697 non-null  object 
 3   game_event_id       25697 non-null  int64  
 4   game_id             25697 non-null  int64  
 5   lat                 25697 non-null  float64
 6   loc_x               25697 non-null  int64  
 7   loc_y               25697 non-null  int64  
 8   lon                 25697 non-null  float64
 9   minutes_remaining   25697 non-null  int64  
 10  period              25697 non-null  int64  
 11  playoffs            25697 non-null  int64  
 12  season              25697 non-null  object 
 13  seconds_remaining   25697 non-null  int64  
 14  shot_distance       25697 non-null  int64  
 15  shot_made_flag      25697 non-null  float64
 16  shot

In [8]:
# Генерация признаков (по смыслу)
# Кажется, самые важные параметры - расстояние до кольца, азимут на щит
# арктангенс loc_y/loc_x - новый столбец - azimuth

df['azimuth'] = np.arctan(df.loc_y/df.loc_x)
df.loc[df['azimuth'].isna(), 'azimuth'] = np.pi 
df.azimuth.isna().any() # проверяю, что все значения не NaN

# Оставшиеся минуты и секунды нужно объединить в переменную "оставшееся время" - time_remaining
df['time_remaining'] = df.minutes_remaining * 60 + df.seconds_remaining

# слишком много выходит - не так понял исходные значения Широту и долготу нужно объединить - получим координаты города/стадиона - их нужно использовать как один категориальный признак - point
# df['point'] = df.lon.astype(str) + " " + df.lat.astype(str) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [9]:
target_col = 'shot_made_flag'
useless_cols = ['index']
df.drop(labels = useless_cols, axis=1, inplace=True)
linear_columns = ['game_event_id', 'game_id', 'shot_id'] 
num_cat_cols = ['period', 'playoffs', 'team_id']
cat_feat = list(df.dtypes[df.dtypes == object].index)
cols_to_norm = [c for c in df.columns if c not in cat_feat + num_cat_cols + [target_col]]
print(cols_to_norm)
# scaler = StandardScaler()
# X = pd.DataFrame(scaler.fit_transform(X[cols_to_norm]),columns=cols_to_norm)

['game_event_id', 'game_id', 'lat', 'loc_x', 'loc_y', 'lon', 'minutes_remaining', 'seconds_remaining', 'shot_distance', 'shot_id', 'azimuth', 'time_remaining']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [10]:
# неудачная попытка анализа данных тут
# sns.pairplot(df, hue="shot_made_flag"
#  , diag_kws={'bw': 0.2}
# ) # работает больше десяти минут, фикс с bandwidth не помог, нет времени разбираться

In [11]:
X = df.drop(labels=['shot_made_flag'], axis=1)
y = df['shot_made_flag']

# Создаем дамми-переменные для категорий
X_dummies = pd.get_dummies(X[cat_feat + num_cat_cols], columns=cat_feat + num_cat_cols)
# dummy_test = pd.get_dummies(X_test[cat_feat], columns=cat_feat)
X_dummies = X_dummies[list(set(X_dummies))]
X = pd.concat([X[cols_to_norm],
                     X_dummies], axis=1)

In [23]:
from sklearn.linear_model import LinearRegression, RidgeClassifierCV, Lasso, LassoCV, RidgeClassifier


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [34]:
X_train[cols_to_norm]

Unnamed: 0,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,seconds_remaining,shot_distance,shot_id,azimuth,time_remaining
1119,41,20000936,33.9303,132,114,-118.1378,6,57,17,1335,0.712358,417
14464,207,21000610,33.9623,-2,82,-118.2718,3,41,8,17257,-1.546411,221
8725,230,20600598,33.8913,133,153,-118.1368,11,47,20,10437,0.855215,707
20743,414,29800452,34.0443,0,0,-118.2698,0,18,0,24767,3.141593,18
24844,489,41000231,33.8093,102,235,-118.1678,0,2,25,29675,1.161292,2
...,...,...,...,...,...,...,...,...,...,...,...,...
21575,466,29900821,34.0713,158,-27,-118.1118,1,24,16,25768,-0.169251,84
5390,131,20400145,33.5163,-128,528,-118.3978,0,0,54,6419,-1.332960,0
860,319,20000666,33.8733,132,171,-118.1378,3,14,21,1032,0.913407,194
15795,10,21100479,34.0793,54,-35,-118.2158,10,35,6,18816,-0.575072,635


In [37]:
lin = LinearRegression(n_jobs = -1)
lin.fit(X_train, y_train)   
lin.score(X_test, y_test)

-0.9172903287163581

In [27]:
rc = RidgeClassifierCV()
rc.fit(X_train, y_train)   
rc.score(X_test, y_test)

0.6095979247730221

In [28]:
rc_ = RidgeClassifier()
rc_.fit(X_train, y_train)   
rc_.score(X_test, y_test)

  overwrite_a=True).T


0.6531069449357387

In [29]:
lasso = LassoCV()
lasso.fit(X_train, y_train)   
lasso.score(X_test, y_test)

4.292096248803556e-05

In [30]:
lasso_ = Lasso()
lasso_.fit(X_train, y_train)   
lasso_.score(X_test, y_test)

0.01887722013533999