In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score , classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the training and test sets from CSV files in the working directory.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Preview a random sample of training rows. random_state is fixed so the
# preview is the same on every Restart & Run All.
train_data.sample(20, random_state=42)

Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs,Id
589,David Lee,67,16.9,5.1,2.0,3.4,59.6,0.0,0.0,0.0,1.1,1.8,57.7,1.6,2.9,4.5,0.6,0.5,0.3,0.8,1.0,870
77,Keith Askins,39,6.8,2.2,0.9,2.1,42.0,0.2,0.6,24.0,0.3,0.6,48.0,0.8,1.0,1.7,0.5,0.4,0.3,0.3,1.0,114
79,Richard Coffey,52,6.2,1.3,0.5,1.4,37.3,0.0,0.0,0.0,0.2,0.4,54.5,0.8,0.7,1.5,0.1,0.1,0.1,0.1,0.0,116
912,David Wesley,60,9.0,3.1,1.1,2.9,36.8,0.2,0.8,23.4,0.7,0.9,83.0,0.2,0.6,0.7,2.1,0.6,0.1,0.9,1.0,1300
404,Damian Lillard,82,38.6,19.0,6.7,15.7,42.9,2.3,6.1,36.8,3.3,3.9,84.4,0.5,2.6,3.1,6.5,0.9,0.2,3.0,1.0,585
889,Chris Webber,76,32.1,17.5,7.5,13.6,55.2,0.0,0.2,0.0,2.5,4.7,53.2,4.0,5.1,9.1,3.6,1.2,2.2,2.7,1.0,1270
857,Grant Hill,70,38.3,19.9,7.3,15.2,47.7,0.1,0.4,14.8,5.3,7.3,73.2,1.8,4.6,6.4,5.0,1.8,0.9,2.9,1.0,1228
2,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,1.3,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1.0,4
438,Chris Singleton,66,21.7,4.6,1.8,4.7,37.2,0.7,1.9,34.6,0.5,0.7,68.2,0.7,2.9,3.5,0.7,1.1,0.5,0.6,0.0,636
773,Roshown McLeod,34,10.2,4.8,1.8,4.8,38.0,0.0,0.3,10.0,1.1,1.3,82.2,0.4,1.1,1.5,0.4,0.1,0.0,0.7,0.0,1114


In [None]:
# Summarize each column's dtype and non-null count to spot columns with gaps.
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         943 non-null    object 
 1   GP           943 non-null    int64  
 2   MIN          943 non-null    float64
 3   PTS          943 non-null    float64
 4   FGM          943 non-null    float64
 5   FGA          943 non-null    float64
 6   FG%          943 non-null    float64
 7   3P Made      943 non-null    float64
 8   3PA          943 non-null    float64
 9   3P%          937 non-null    float64
 10  FTM          943 non-null    float64
 11  FTA          943 non-null    float64
 12  FT%          943 non-null    float64
 13  OREB         943 non-null    float64
 14  DREB         943 non-null    float64
 15  REB          943 non-null    float64
 16  AST          943 non-null    float64
 17  STL          943 non-null    float64
 18  BLK          943 non-null    float64
 19  TOV     

In [None]:
# Count missing values per column.
train_data.isnull().sum()

Unnamed: 0,0
Name,0
GP,0
MIN,0
PTS,0
FGM,0
FGA,0
FG%,0
3P Made,0
3PA,0
3P%,6


In [None]:
# Target: the TARGET_5Yrs label column.
y = train_data['TARGET_5Yrs']

# Features: every column except the player name, the row Id and the target.
X = train_data.drop(columns=['Name', 'Id', 'TARGET_5Yrs'])

# Preprocessing steps: median imputation of missing values, then
# standardization with StandardScaler. Both objects are kept so the same
# fitted transforms can be applied to other data later.
imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()

X_imputed = imputer.fit_transform(X)
X_scaled = scaler.fit_transform(X_imputed)

# Sanity check: report (rows, features) of the processed matrix.
print("Shape of Processed Data:", X_scaled.shape)

Shape of Processed Data: (943, 19)


In [None]:
# Initialize and train the logistic regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_scaled, y)

logistic_model.coef_


array([[ 0.58031008, -0.59865103,  0.13516286, -0.05633936,  0.22249597,
         0.18187449,  0.44650256, -0.40063327,  0.09527223,  0.19206321,
        -0.04554616,  0.1619579 ,  0.61518205, -0.24837912,  0.15259498,
         0.4698793 , -0.03411783,  0.25790957, -0.15067096]])

In [None]:
# In-sample accuracy: the model is scored on the same rows it was fitted on,
# so this figure is optimistic and is labelled as training accuracy.
y_pred = logistic_model.predict(X_scaled)
accuracy = accuracy_score(y, y_pred)
print(f"Training accuracy: {accuracy * 100:.2f}%")

# Hold-out estimate of generalization. The raw features are split first, and
# a fresh imputer, scaler and model are fitted on the fit split only, so no
# statistics from the validation rows leak into preprocessing or training.
# The split is stratified on the label and seeded for reproducibility.
X_fit, X_val, y_fit, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
val_imputer = SimpleImputer(strategy="median")
val_scaler = StandardScaler()
X_fit_scaled = val_scaler.fit_transform(val_imputer.fit_transform(X_fit))
X_val_scaled = val_scaler.transform(val_imputer.transform(X_val))

val_model = LogisticRegression(max_iter=1000).fit(X_fit_scaled, y_fit)
y_val_pred = val_model.predict(X_val_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Hold-out accuracy: {val_accuracy * 100:.2f}%")

# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_val, y_val_pred))

Accuracy: 71.47%


In [None]:
# Build the test feature matrix by removing the non-feature columns.
data_test = test_data.drop(columns=['Name', 'Id'])

# Apply the imputer and scaler that were already fitted on the training
# features; transform() (not fit_transform) is used so nothing is refitted
# on the test rows.
data_test_imputed = imputer.transform(data_test)
data_test_scaled = scaler.transform(data_test_imputed)

In [None]:
# Predict the label for every test row and keep the result in a variable
# instead of discarding it after display.
test_predictions = logistic_model.predict(data_test_scaled)

# Pair each prediction with its row Id so the output is traceable, and show a
# short preview rather than dumping the whole array.
test_results = pd.DataFrame(
    {"Id": test_data["Id"], "TARGET_5Yrs": test_predictions}
)
test_results.head(10)

array([0., 1., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0.,
       0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1.,
       1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
       1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0.,
       0., 1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0.,
       0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1.,
       1., 1., 1., 1., 1.