In [64]:
import numpy as np
import pandas as pd
import math

#import import_ipynb
import nbimporter
from Scoring_Function import score

# Read csv into pandas dataframe
df = pd.read_csv('train_fixed.csv', low_memory=False)
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
plays = pd.unique(df['PlayId'])
plays

array([20170907000118, 20170907000139, 20170907000189, ...,
       20181230154082, 20181230154135, 20181230154157])

In [3]:
A_id = 20170907000118
B_id = 20170907000139

In [4]:
def calc_similar(A_id, B_id):
    A = df[df['PlayId'] == 20170907000118]
    B = df[df['PlayId'] == 20170907000139]
    
    # Set up Play A for matching
    players_A = pd.DataFrame(columns = A.columns)
    count_A = 0
    rush_index_A = 0
    
    for i in range(len(A)):
        count_A += 1
        if A.iloc[i]['NflId'] == A.iloc[i]['NflIdRusher']:
            rush_index_A = count_A
            rusher_A = pd.DataFrame(columns = A.columns)
            rusher_A = rusher_A.append(A.iloc[i])
        else:
            players_A = players_A.append(A.iloc[i])
            
    if rush_index_A < 12:
        offense_A = players_A[:10]
        defense_A = players_A[10:]
    else:
        offense_A = players_A[11:]
        defense_A = players_A[:11]
        
    offense_A = offense_A[offense_A['Position'].isin(['WR', 'TE', 'RB', 'HB'])]
    
    # Set up Play B for matching
    players_B = pd.DataFrame(columns = B.columns)
    count_B = 0
    rush_index_B = 0
    
    for i in range(len(B)):
        count_B += 1
        if B.iloc[i]['NflId'] == B.iloc[i]['NflIdRusher']:
            rush_index_B = count_B
            rusher_B = pd.DataFrame(columns = B.columns)
            rusher_B = rusher_B.append(B.iloc[i])
        else:
            players_B = players_B.append(B.iloc[i])
            
    if rush_index_B < 12:
        offense_B = players_B.iloc[:10]
        defense_B = players_B.iloc[10:]
    else:
        offense_B = players_B.iloc[11:]
        defense_B = players_B.iloc[:11]
        
    offense_B = offense_B[offense_B['Position'].isin(['WR', 'TE', 'RB', 'HB'])]
    
    # Check if offensive personnel amount is matching
    if len(offense_A) != len(offense_B):
        return 1000 # Return large error
    
    # Find players' positions relative to the rusher for Play A
    if offense_A['PlayDirection'].iloc[0] == 'left':
        offense_A['X_scaled'] = offense_A['X'] - rusher_A['X'].iloc[0]
        defense_A['X_scaled'] = defense_A['X'] - rusher_A['X'].iloc[0]
    else:
        offense_A['X_scaled'] = rusher_A['X'].iloc[0] - offense_A['X']
        defense_A['X_scaled'] = rusher_A['X'].iloc[0] - defense_A['X']
        
    offense_A['Y_scaled'] = rusher_A['Y'].iloc[0] - offense_A['Y']
    defense_A['Y_scaled'] = rusher_A['Y'].iloc[0] - defense_A['Y']
    
    # Find players' positions relative to the rusher for Play B
    if offense_B['PlayDirection'].iloc[0] == 'left':
        offense_B['X_scaled'] = offense_B['X'] - rusher_B['X'].iloc[0]
        defense_B['X_scaled'] = defense_B['X'] - rusher_B['X'].iloc[0]
    else:
        offense_B['X_scaled'] = rusher_B['X'].iloc[0] - offense_B['X']
        defense_B['X_scaled'] = rusher_B['X'].iloc[0] - defense_B['X']
        
    offense_B['Y_scaled'] = rusher_B['Y'].iloc[0] - offense_B['Y']
    defense_B['Y_scaled'] = rusher_B['Y'].iloc[0] - defense_B['Y']
    
    
    # Initialize error counter
    error = 0
    # Match up and calculate error for offense
    while len(offense_A) > 0:
        curr = offense_A.iloc[0]
        play_error = 1000
        play_index = None
        for i in range(len(offense_B)):
            dist = math.sqrt((curr['X_scaled'] - offense_B['X_scaled'].iloc[i])**2 + 
                             (curr['Y_scaled'] - offense_B['Y_scaled'].iloc[i])**2)
            if dist < play_error:
                play_error = dist
                play_index = i
            
        error += play_error
        offense_B = offense_B.drop(offense_B.index[play_index])
        
        offense_A = offense_A.drop(offense_A.index[0])
    
    return error
    

In [5]:
calc_similar(A_id, B_id)

26.140444491203255

In [65]:
df = df[:220]

In [70]:
plays_yards = df[['PlayId', 'Yards']].drop_duplicates()

from sklearn.model_selection import train_test_split
train , test = train_test_split(plays_yards, test_size = 0.3)

x_train = train.drop(['Yards'], axis=1)
y_train = train['Yards']

x_test = test.drop(['Yards'], axis = 1)
y_test = test['Yards']

In [71]:
from sklearn import neighbors
from sklearn.metrics import mean_squared_error 

from sklearn.model_selection import GridSearchCV
params = {'n_neighbors': [*range(1, 2, 1)]}

knn = neighbors.KNeighborsRegressor(metric=calc_similar)

model = GridSearchCV(knn, params, cv=3)
model.fit(x_train,y_train)
model.best_params_

{'n_neighbors': 1}

In [72]:
model = neighbors.KNeighborsRegressor(n_neighbors = model.best_params_['n_neighbors'])
model.fit(x_train, y_train)  #fit the model
pred=model.predict(x_test) #make prediction on test set
pred_round = np.around(pred)
pred_round

array([9., 7., 8.])

In [73]:
sample_sub = pd.read_csv('sample_submission.csv').rename(columns={"Unnamed: 0": "PlayId"}) #bring in sample sumbission
sub = sample_sub.set_index('PlayId').drop([0])
plays = pd.unique(test['PlayId'])

for i in range(plays.size):
    dist = np.histogram(pred_round[i], bins = 199, range = (-99, 99))[0] #find probability for each bin
    cumul_sum = np.cumsum(dist)
    sub.loc[plays[i]] = cumul_sum
sub = sub.reset_index()

In [74]:
score(sub, df)

0.031825795644891124