In [None]:
# The imports...
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Data handling and analysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Models
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)


# Import and Look at Data

In [None]:
# Read summary.csv from the Kaggle input directory into df and preview its first rows.
summary_path = '/kaggle/input/performance-prediction/summary.csv'
df = pd.read_csv(summary_path)
df.head()

In [None]:
# Summary statistics for the numeric columns, used to eyeball value ranges before modelling.
df.describe()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Code in this section from https://www.kaggle.com/sachinsharma1123/kernel439a1a3a5b
# Replace the missing 3-point percentages with the mean of the column.
three_point_mean = df['3PointPercent'].mean()
df['3PointPercent'] = df['3PointPercent'].fillna(three_point_mean)

In [None]:
# We do not need names, so we will drop the column.
# errors='ignore' keeps this cell re-runnable: on a second run 'Name' is already gone
# from df, and a plain drop would raise KeyError.
df = df.drop(columns=['Name'], errors='ignore')

#  Developing the Model and Finding the Best Solution

In [None]:
# Separate the predictor columns (X) from the label column (y)
X = df.drop(columns=['Target'])
y = df['Target']

In [None]:
# Use ANOVA to select best features
# Since the data is small enough, we will check many models and features to be comprehensive.
#
# NOTE: the feature selector is fitted on every row before the train/test split, and the
# winning configuration is picked by its score on the test split, so bestAcc is an
# optimistic estimate rather than an unbiased one.

bestAcc = 0
numFeatures = 0
KNN_size = 0      # neighbour count of the most recent KNN that set a new best accuracy
bestModel = None  # name of the winning model; kept apart from the estimator objects so
                  # that fitting a model can never overwrite the "who won" bookkeeping

# Check different models and their accuracies
# Loop over 1 through 12 ANOVA-selected features, capped at the number of columns in X
# because the selector cannot pick more features than exist.
maxFeatures = min(12, X.shape[1])
for i in range(1, maxFeatures + 1):
    # Select the best i features
    fvalue_selector = SelectKBest(f_classif, k=i)
    newX = fvalue_selector.fit_transform(X, y)

    X_train, X_test, y_train, y_test = train_test_split(newX, y, random_state=0, test_size=0.3)

    # Candidates as (name, n_neighbors or 0, estimator), evaluated in a fixed order:
    # logistic regression, KNN with 1-9 neighbours, naive Bayes, SVM.
    candidates = [("logistic", 0, LogisticRegression(max_iter=10000))]
    candidates += [("KNN", j, KNeighborsClassifier(n_neighbors=j)) for j in range(1, 10)]
    candidates += [("naive", 0, GaussianNB()), ("SVM", 0, svm.SVC())]

    for name, neighbors, model in candidates:
        prediction = model.fit(X_train, y_train).predict(X_test)
        score = accuracy_score(y_test, prediction)
        # Strict '>' means that on a tie the earlier configuration is kept
        if score > bestAcc:
            bestAcc = score
            numFeatures = i
            bestModel = name
            if name == "KNN":
                KNN_size = neighbors

# Boolean flags derived from the single source of truth above
logistic = bestModel == "logistic"
SVM = bestModel == "SVM"
KNN = bestModel == "KNN"
naive = bestModel == "naive"
tree = False  # no tree model is evaluated; the name is kept so existing references still resolve

print("The best accuracy was", round(bestAcc, 5), ", using this many features:", numFeatures)


if logistic:
    print("Logistic was the best model.")
elif SVM:
    print("SVM was the best model.")
elif naive:
    print("Naive Bayes was the best model.")
else:
    print("KNN was the best model")

In [None]:
# See which features were important
# Code modified from https://stackoverflow.com/questions/39839112/the-easiest-way-for-getting-feature-names-after-running-selectkbest-in-scikit-le

# Refit the ANOVA selector on the full data with the winning feature count
selector = SelectKBest(f_classif, k=numFeatures)
selector.fit(X, y)
# Get columns to keep and create new dataframe with those only
cols = selector.get_support(indices=True)
topFeatures = X.iloc[:, cols]

# Report the actual number of selected columns instead of a hard-coded count,
# so the message stays correct if the search above picks a different value.
print("The top", topFeatures.shape[1], "most important features were as follows")
list(topFeatures.columns)

# Conclusion

First off, a huge **THANK YOU** for taking the time to read my notebook.

After checking various models with various numbers of selected features, I found that the ideal number of features was 8 for predicting whether a player has been playing for more than or less than 5 years. My model was able to predict this with 0.7288 accuracy. 

The most important features were
* Games Played
* Minutes Played
* Field Goals Made
* Free Throws Made
* Free Throw Attempts
* Offensive Rebounds
* Rebounds

The first two make the most sense, as they would logically have the strongest correlation with how long a player stays in the league. I am not a basketball pro, so I do not know what field goals are, but it makes sense that free throws are correlated, since many players take them regardless of position; the same goes for rebounds. I suspect that if we had a feature for player position, we could develop a model with higher accuracy, as the stats likely vary heavily by position. Ignoring position lumps all players' stats together, which makes the model more general.