# Task for Today  

***

## Basketball Performance Prediction  

Given *data about basketball players*, let's try to predict the **length of career** for a given player.

We will use a logistic regression model to make our predictions, then explore the training data with PCA and k-means clustering.

# Getting Started

In [None]:
import numpy as np
import pandas as pd
# Show every column when displaying wide DataFrames. The fully qualified key is
# required: the short form 'max_columns' also matches 'styler.render.max_columns'
# on newer pandas and raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# Read the player summary CSV into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='../input/performance-prediction/summary.csv',
)

In [None]:
# Preview the loaded DataFrame (rendered as the cell's output).
data

In [None]:
# Print column names, dtypes and non-null counts; useful for spotting
# columns with missing values before preprocessing.
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df, train_size=0.7, random_state=1):
    """Turn the raw player table into scaled train/test features and targets.

    Steps: drop the ``Name`` column, separate the ``Target`` column from the
    features, split into train and test sets, impute missing
    ``3PointPercent`` values, and standardize every feature column.

    Both the imputation mean and the scaler statistics are computed from the
    training rows only, so no information from the test rows leaks into the
    preprocessing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Name``, ``3PointPercent`` and ``Target``.
        All remaining columns are passed to ``StandardScaler``, so they are
        expected to be numeric. The caller's frame is not modified.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split``.
    random_state : int, default 1
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features, keeping the original row index and column names.
    y_train, y_test : pandas.Series
        ``Target`` values for the matching rows.

    Raises
    ------
    KeyError
        If ``Name``, ``Target`` or ``3PointPercent`` is missing from ``df``.
    """
    # Split df into X and y; drop() returns a new frame, so df is left untouched
    y = df['Target']
    X = df.drop(columns=['Name', 'Target'])

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Fill missing values with the mean of the *training* rows only; using the
    # mean of the full dataset would leak test-set information into training.
    fill_values = {'3PointPercent': X_train['3PointPercent'].mean()}
    X_train = X_train.fillna(fill_values)
    X_test = X_test.fillna(fill_values)

    # Scale X (statistics fitted on the training rows, then applied to both splits)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the train/test features and targets from the loaded data.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Inspect the training features returned by preprocess_inputs.
X_train

In [None]:
# Inspect the training targets returned by preprocess_inputs.
y_train

# Training

In [None]:
# Fit a logistic regression classifier (default settings) on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression().fit(X_train, y_train)

# score() reports mean accuracy on the held-out test split.
acc = model.score(X_test, y_test)
print("Test Accuracy: {:.2f}%".format(100 * acc))

# Clustering (k-means)

In [None]:
# Project the training features onto their first two principal components.
# The projection is used only to draw the clusters in 2-D.
pca = PCA(n_components=2)
X_reduced = pd.DataFrame(pca.fit_transform(X_train), index=X_train.index, columns=["PC1", "PC2"])

# Cluster in the full feature space (not in the 2-D projection).
# random_state pins the centroid initialization so cluster labels and the plot
# are reproducible across runs; n_init=10 is set explicitly because the default
# number of initializations differs between scikit-learn versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=1)
kmeans.fit(X_train)
clusters = pd.Series(kmeans.labels_, name="Cluster", index=X_train.index)

# Map the cluster centers into the same PCA plane for plotting. They are wrapped
# in a DataFrame with the training column names because pca was fitted on a
# DataFrame, and passing a bare array triggers a feature-name mismatch warning.
centroids = pca.transform(pd.DataFrame(kmeans.cluster_centers_, columns=X_train.columns))

# Combine the 2-D coordinates, the target and the cluster label per training row.
X_reduced = pd.concat([X_reduced, y_train, clusters], axis=1)
X_reduced

In [None]:
# Draw the training rows in the PCA plane, colored by k-means cluster,
# with the cluster centers overlaid as large red markers.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(X_reduced['PC1'], X_reduced['PC2'], c=X_reduced['Cluster'])
ax.scatter(centroids[:, 0], centroids[:, 1], color='red', s=300)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("k-means Clustering")
plt.show()

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/p3uysBjzoeA