In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle
data=pd.read_csv("../input/data.csv")

In [None]:
data.head()

In [None]:
#将shot_made_flag是null值的归类为测试集 
test=data[data['shot_made_flag'].isnull()]
test.shape

In [None]:
#shot_made_flag是notnull的归类为测试集 
train=data[data['shot_made_flag'].notnull()]
train.shape

In [None]:
# Count missing values per column in each split and show only the columns
# that have at least one missing value in either split.
missing_in_train = train.isnull().sum()
missing_in_test = test.isnull().sum()
NAs = pd.concat([missing_in_train, missing_in_test], axis=1, keys=['Train', 'Test'])
has_missing = NAs.sum(axis=1) > 0
NAs[has_missing]

In [None]:
#各类别的数量
train.get_dtype_counts()

In [None]:
train.info()

In [None]:
#combined_shot_type VS shot_made_flag
train[['combined_shot_type', 'shot_made_flag']].groupby(['combined_shot_type'], as_index=False).mean().sort_values(by='shot_made_flag', ascending=False)

In [None]:
#action_type  VS shot_made_flag
train[['action_type', 'shot_made_flag']].groupby(['action_type'], as_index=False).mean().sort_values(by='shot_made_flag', ascending=False)

In [None]:
#赛季
train[['season', 'shot_made_flag']].groupby(['season'], as_index=False).mean().sort_values(by='shot_made_flag', ascending=False)

In [None]:
#几分球
train[['shot_type', 'shot_made_flag']].groupby(['shot_type'], as_index=False).mean().sort_values(by='shot_made_flag', ascending=False)

In [None]:
#投篮区域
train[['shot_zone_area', 'shot_made_flag']].groupby(['shot_zone_area'], as_index=False).mean().sort_values(by='shot_made_flag', ascending=False)

In [None]:
train[['shot_zone_basic', 'shot_made_flag']].groupby(['shot_zone_basic'], as_index=False).mean().sort_values(by='shot_made_flag', ascending=False)

In [None]:
#投篮距离
train[['shot_zone_range', 'shot_made_flag']].groupby(['shot_zone_range'], as_index=False).mean().sort_values(by='shot_made_flag', ascending=False)

In [None]:
#主客场
train[['matchup', 'shot_made_flag']].groupby(['matchup'], as_index=False).mean().sort_values(by='shot_made_flag', ascending=False)

In [None]:
#竞争对手
train[['opponent', 'shot_made_flag']].groupby(['opponent'], as_index=False).mean().sort_values(by='shot_made_flag', ascending=False)

In [None]:
#剔除team_name这两个无效字段
train = train.drop(['team_name','game_date'], axis=1)
test= test.drop(['team_name','game_date'], axis=1)

In [None]:
#各维度对shot made flag的相关度
train.head()

In [None]:
# Stack train and test under a 'train'/'test' outer index level so the
# encoding and scaling steps see the same columns for both splits.
# The target is left out on purpose: keeping shot_made_flag in here would put
# the label itself into the training matrix and an all-NaN column into the
# test matrix. `train` keeps its own shot_made_flag column; errors='ignore'
# makes this cell safe to re-run after that column has been popped.
features = pd.concat(
    [
        train.drop('shot_made_flag', axis=1, errors='ignore'),
        test.drop('shot_made_flag', axis=1, errors='ignore'),
    ],
    keys=['train', 'test'],
)

In [None]:
# Cast every categorical column to plain strings before the one-hot encoding.
categorical_columns = [
    'action_type',
    'combined_shot_type',
    'season',
    'shot_type',
    'shot_zone_area',
    'shot_zone_basic',
    'shot_zone_range',
    'matchup',
    'opponent',
]
for column in categorical_columns:
    features[column] = features[column].astype(str)

In [None]:
# Prints R2 and RMSE scores
def get_score(prediction, lables):
    """Print the R2 and RMSE of ``prediction`` measured against ``lables``.

    Args:
        prediction: Values predicted by a model.
        lables: Ground-truth target values, same length as ``prediction``.

    Returns:
        None. Both scores are written to stdout.
    """
    # scikit-learn metrics take (y_true, y_pred). R2 is not symmetric in its
    # arguments, so the ground truth must be passed first; RMSE is symmetric
    # but uses the same order for consistency.
    print('R2: {}'.format(r2_score(lables, prediction)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(lables, prediction))))

# Shows scores for train and validation sets
def train_test(estimator, x_trn, x_tst, y_trn, y_tst):
    """Print a fitted estimator and its scores on the train and validation data.

    Args:
        estimator: Already fitted model exposing ``predict``.
        x_trn: Feature matrix the scores of the first report are computed on.
        x_tst: Feature matrix the scores of the second report are computed on.
        y_trn: Targets matching ``x_trn``.
        y_tst: Targets matching ``x_tst``.

    Returns:
        None. The estimator's repr and the scores are written to stdout.
    """
    fitted_on_train = estimator.predict(x_trn)
    print(estimator)
    # First report: training data (printed without a heading).
    get_score(fitted_on_train, y_trn)

    fitted_on_holdout = estimator.predict(x_tst)
    # Second report: held-out data, introduced by a "Test" heading.
    print("Test")
    get_score(fitted_on_holdout, y_tst)

In [None]:
# Detach the target column. pop() removes shot_made_flag from `train` in place,
# so this cell cannot be run a second time without rebuilding `train`.
train_labels = train.pop('shot_made_flag')

In [None]:
# Plot the distribution of the target values.
ax = sns.distplot(train_labels)

In [None]:
## Standardizing numeric features
numeric_features = features.loc[:,['lat', 'loc_x', 'loc_y', 'lon','minutes_remaining','seconds_remaining','shot_distance']]
numeric_features_standardized = (numeric_features - numeric_features.mean())/numeric_features.std()

In [None]:
ax = sns.pairplot(numeric_features_standardized)

In [None]:
# One-hot encode every remaining non-numeric column in a single pass.
# Selecting by "not numeric" also catches string columns on pandas versions
# where they are not reported as object dtype.
categorical_cols = list(features.select_dtypes(exclude=[np.number, 'bool']).columns)
if categorical_cols:  # nothing left to encode when the cell is re-run
    # Cast to uint8: pandas >= 2.0 returns bool dummies, and bool columns are
    # dropped by the later select_dtypes(include=[np.number]) filter.
    dummies = pd.get_dummies(features[categorical_cols], columns=categorical_cols).astype(np.uint8)
    # Same layout as before: untouched columns first, then the dummy columns
    # grouped by source column, each prefixed with "<column>_".
    features = pd.concat([features.drop(categorical_cols, axis=1), dummies], axis=1)

In [None]:
# Sanity check: once the one-hot encoding cell above has run, no object-dtype
# column may remain in `features`. Fails loudly if the encoding was skipped.
leftover_object_cols = list(features.dtypes[features.dtypes == 'object'].index)
assert not leftover_object_cols, 'columns still not one-hot encoded: {}'.format(leftover_object_cols)

In [None]:
features_standardized = features.copy()

In [None]:
### Replacing numeric features by standardized values
features_standardized.update(numeric_features_standardized)

In [None]:
### Splitting features
train_features = features.loc['train'].drop('shot_id', axis=1).select_dtypes(include=[np.number]).values
test_features = features.loc['test'].drop('shot_id', axis=1).select_dtypes(include=[np.number]).values

In [None]:
### Splitting standardized features
train_features_st = features_standardized.loc['train'].drop('shot_id', axis=1).select_dtypes(include=[np.number]).values
test_features_st = features_standardized.loc['test'].drop('shot_id', axis=1).select_dtypes(include=[np.number]).values

In [None]:
### Shuffling train sets
train_features_st, train_features, train_labels = shuffle(train_features_st, train_features, train_labels, random_state = 5)

In [None]:
### Splitting
x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1, random_state=200)
x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(train_features_st, train_labels, test_size=0.1, random_state=200)

In [None]:
# Elastic net with alpha and l1_ratio picked by the estimator's built-in
# cross-validation, fitted on the standardized training split.
enet_alphas = [0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10]
enet_l1_ratios = [.01, .1, .5, .9, .99]
ENSTest = linear_model.ElasticNetCV(alphas=enet_alphas, l1_ratio=enet_l1_ratios, max_iter=5000)
ENSTest.fit(x_train_st, y_train_st)
train_test(ENSTest, x_train_st, x_test_st, y_train_st, y_test_st)

In [None]:
# Average R2 score and spread (two standard deviations) of 5-fold cross-validation.
# cross_val_score uses the estimator's own score() when no scoring is given,
# which is R2 for a regressor, so the printed label is R2, not accuracy.
scores = cross_val_score(ENSTest, train_features_st, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Gradient boosting on the unstandardized training split.
# max_features='sqrt' makes each split consider a random subset of features,
# so the fit is stochastic; random_state pins it like the shuffle and split.
GBest = ensemble.GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=200,
).fit(x_train, y_train)
train_test(GBest, x_train, x_test, y_train, y_test)

In [None]:
# Average R2 score and spread (two standard deviations) of 5-fold cross-validation.
# GBest is fitted and evaluated on the unstandardized features everywhere
# else, so cross-validate it on the same representation.
scores = cross_val_score(GBest, train_features, train_labels, cv=5)
# The default regressor score is R2, so label it as such rather than accuracy.
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Retraining models on the complete labelled set.
# fit() refits the estimator in place and returns it, so each *_model name
# refers to the same object as the estimator it was fitted from.
GBest.fit(train_features, train_labels)
GB_model = GBest
ENSTest.fit(train_features_st, train_labels)
ENST_model = ENSTest

In [None]:
## Predicted shot_made_flag for the unlabeled (test) shots
# The elastic net is a regressor, so its raw output is unbounded, while the
# target is a 0/1 flag; clip to [0, 1] so every value is a usable probability.
Final_labels = np.clip(ENST_model.predict(test_features_st), 0, 1)

In [None]:
## Saving to CSV
# The identifier column of this dataset is `shot_id` (there is no `Id`
# attribute on `test`), and the predicted column is `shot_made_flag`.
# `columns=` fixes the column order regardless of the pandas/Python version.
submission = pd.DataFrame(
    {'shot_id': test['shot_id'].values, 'shot_made_flag': Final_labels},
    columns=['shot_id', 'shot_made_flag'],
)
submission.to_csv('2017-03-27.csv', index=False)