In [1]:
import math , copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Load the NBA stats/salary table; the path is relative to the working directory.
csv_path = 'vised_zscaled_nba_stats_salaries_2017_2018.csv'
nba = pd.read_csv(csv_path)

nba

Unnamed: 0,Player,Stats_from,Age,MPG,PPG,PER,TOPG,PFG,2017_2018 Salary
0,Quincy Acy,2016.0,-0.287304,-0.963811,-0.834424,0.026657,-1.081728,-0.262381,-0.885161
1,Steven Adams,2016.0,-1.014346,0.263455,-0.348183,0.194631,-0.337766,1.331825,1.737947
2,Arron Afflalo,2016.0,0.924431,1.240197,0.510925,-0.771221,-0.205984,0.126687,-0.806942
3,Alexis Ajinca,2016.0,0.197390,-0.994023,-0.699399,-0.162314,-0.506993,0.541693,-0.474272
4,Cole Aldrich,2016.0,0.197390,-1.143741,-0.787815,1.412445,-0.316834,0.611292,-0.178865
...,...,...,...,...,...,...,...,...,...
340,Joe Young,2016.0,-0.771999,-1.615213,-1.090642,-0.981189,-0.645615,-1.814225,-0.915249
341,Nick Young,2016.0,0.924431,-0.454942,-0.470467,-1.149164,-0.958740,-1.517006,-0.445188
342,Thaddeus Young,2016.0,0.197390,1.190077,0.916875,0.614567,0.683300,0.881372,0.768220
343,Cody Zeller,2016.0,-0.771999,0.159638,-0.208376,0.320612,-0.486582,1.342569,0.488747


In [3]:
# Drop the player-name and Stats_from columns, which are not used as features.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
nba = nba.drop(columns=['Player', 'Stats_from'], errors='ignore')

nba

Unnamed: 0,Age,MPG,PPG,PER,TOPG,PFG,2017_2018 Salary
0,-0.287304,-0.963811,-0.834424,0.026657,-1.081728,-0.262381,-0.885161
1,-1.014346,0.263455,-0.348183,0.194631,-0.337766,1.331825,1.737947
2,0.924431,1.240197,0.510925,-0.771221,-0.205984,0.126687,-0.806942
3,0.197390,-0.994023,-0.699399,-0.162314,-0.506993,0.541693,-0.474272
4,0.197390,-1.143741,-0.787815,1.412445,-0.316834,0.611292,-0.178865
...,...,...,...,...,...,...,...
340,-0.771999,-1.615213,-1.090642,-0.981189,-0.645615,-1.814225,-0.915249
341,0.924431,-0.454942,-0.470467,-1.149164,-0.958740,-1.517006,-0.445188
342,0.197390,1.190077,0.916875,0.614567,0.683300,0.881372,0.768220
343,-0.771999,0.159638,-0.208376,0.320612,-0.486582,1.342569,0.488747


In [4]:
# Target vector as a plain NumPy array (like X_train), so that integer
# indexing such as y_train[i] is always positional and never a label lookup
# on the DataFrame index.
y_train = nba['2017_2018 Salary'].to_numpy()

y_train.shape

(345,)

In [5]:
# Feature matrix: every column of nba except the salary target.
nba_X = nba.drop(columns=['2017_2018 Salary'])
X_train = nba_X.to_numpy()

X_train.shape

(345, 6)

# Cost Function:

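$$J(\mathbf{w},b) = \frac{1}{2m}\sum_{i=0}^{m-1}\left(f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)}\right)^2$$

ve $f_{\mathbf{w},b}(\mathbf{x}^{(i)}) = \mathbf{w}\cdot\mathbf{x}^{(i)} + b$ olmak üzere ($m$ = örnek sayısı):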

In [6]:
def compute_cost(X, y, w, b):
    """Compute the squared-error cost of a linear model.

    Evaluates ``J(w, b) = 1/(2m) * sum_i (X[i] . w + b - y[i])**2`` over the
    ``m`` rows of ``X``.

    Args:
        X: Array-like of shape (m, n), one example per row.
        y: Array-like of ``m`` target values. A pandas Series is accepted and
            is read positionally, so its index labels are irrelevant.
        w: Array-like of ``n`` weights.
        b: Scalar bias.

    Returns:
        The cost as a Python float.

    Raises:
        ZeroDivisionError: If ``X`` has no rows.
    """
    X = np.asarray(X, dtype=float)
    # np.asarray discards any pandas index, so targets are matched to rows by
    # position only (a Series with a non-default index used to raise KeyError).
    y = np.asarray(y, dtype=float).reshape(-1)
    m = X.shape[0]

    # Residual of every example at once instead of a Python-level loop.
    residuals = np.dot(X, w) + b - y

    # float(...) keeps the division in plain Python, so an empty X still
    # raises ZeroDivisionError instead of silently producing NaN.
    return float(np.dot(residuals, residuals)) / (2 * m)

In [7]:
# Sanity-check compute_cost on the training data with hand-picked parameters.
w_test = np.array([0.1, 0.2, 0.22, 0.5, -0.025, -0.01])
b_test = 0.2
compute_cost(X_train, y_train, w=w_test, b=b_test)

0.2898014793280912

# Gradient Descent Algoritması:

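Yakınsama sağlanana kadar tekrar et: {

$$w_j := w_j - \alpha \frac{\partial J(\mathbf{w},b)}{\partial w_j}, \qquad j = 0, \dots, n-1$$

($n$ = özellik (feature) sayısı, $\alpha$ = öğrenme oranı / learning rate)

$$b := b - \alpha \frac{\partial J(\mathbf{w},b)}{\partial b}$$

($w_j$ ve $b$ eş zamanlı olarak (simultaneously) güncellenir.)

}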

ve 

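$$\frac{\partial J(\mathbf{w},b)}{\partial w_j} = \frac{1}{m}\sum_{i=0}^{m-1}\left(f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)}\right)x_j^{(i)}$$

$$\frac{\partial J(\mathbf{w},b)}{\partial b} = \frac{1}{m}\sum_{i=0}^{m-1}\left(f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)}\right)$$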

In [8]:
# Computes the derivative terms that appear in the gradient descent update rules.

def compute_gradient_derivatives(X, y, w, b):
    """Compute the partial derivatives of the squared-error cost.

    With ``error_i = X[i] . w + b - y[i]``::

        dJ/dw_j = 1/m * sum_i error_i * X[i, j]
        dJ/db   = 1/m * sum_i error_i

    Args:
        X: Array-like of shape (m, n), one example per row.
        y: Array-like of ``m`` target values. A pandas Series is accepted and
            is read positionally, so its index labels are irrelevant.
        w: Array-like of ``n`` weights.
        b: Scalar bias.

    Returns:
        Tuple ``(dj_db, dj_dw)``: the derivative with respect to ``b`` as a
        float, and the derivatives with respect to ``w`` as an array of
        shape (n,). Note the order: bias first.

    Raises:
        ValueError: If ``X`` is not two-dimensional.
        ZeroDivisionError: If ``X`` has no rows.
    """
    X = np.asarray(X, dtype=float)
    # np.asarray discards any pandas index, so targets are matched to rows by
    # position only (a Series with a non-default index used to raise KeyError).
    y = np.asarray(y, dtype=float).reshape(-1)
    m, _ = X.shape  # the unpacking also enforces a 2-D X

    # Prediction error of every example at once.
    errors = np.dot(X, w) + b - y

    # Partial derivative of J with respect to each weight:
    # sum over examples of error_i * X[i, j], for every j.
    dj_dw = np.dot(errors, X) / m

    # Partial derivative of J with respect to the bias. float(...) keeps the
    # division in plain Python so an empty X raises instead of returning NaN.
    dj_db = float(errors.sum()) / m

    return dj_db, dj_dw

In [9]:
# Evaluate both partial derivatives on the training data at the test parameters.
compute_gradient_derivatives(X_train, y_train, w=w_test, b=b_test)

(0.19999999999999993,
 array([ 0.08080617, -0.05162175,  0.00049955,  0.17372699,  0.00215126,
        -0.00789069]))

In [10]:
# Finds the w, b values that minimise J via gradient descent, printing progress
# along the way and returning the recorded cost history.

def gradient_descent(X, y, w_in, b_in, cost_function, compute_gradient_derivatives,
                     alpha, num_iters, max_history=100000):
    """Minimise a cost function with batch gradient descent.

    Args:
        X: Training inputs; passed unchanged to both callbacks.
        y: Training targets; passed unchanged to both callbacks.
        w_in: Initial weights. Copied, so the caller's object is not modified.
        b_in: Initial bias.
        cost_function: Callable ``(X, y, w, b) -> cost``.
        compute_gradient_derivatives: Callable
            ``(X, y, w, b) -> (dj_db, dj_dw)``.
        alpha: Learning rate.
        num_iters: Number of update steps to perform.
        max_history: Maximum number of per-iteration costs stored in the
            returned history. Defaults to 100000, the previously hard-coded
            limit.

    Returns:
        Tuple ``(w, b, J_history)``: the final weights, the final bias, and
        the list of costs recorded after each of the first ``max_history``
        iterations.
    """
    J_history = []
    w = copy.deepcopy(w_in)  # do not modify the caller's weight vector
    b = b_in

    # Report roughly ten times over the whole run; never use a zero interval.
    report_every = max(1, math.ceil(num_iters / 10))

    for i in range(num_iters):

        # Evaluate both derivatives at the current parameters ...
        dj_db, dj_dw = compute_gradient_derivatives(X, y, w, b)

        # ... then update w and b simultaneously from those same derivatives.
        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        record = i < max_history          # history is capped to bound memory
        report = i % report_every == 0

        # The cost is evaluated only when it is stored or printed, and the
        # printed value is always the cost of THIS iteration (previously the
        # last stored entry was printed, which was stale once the cap was hit).
        if record or report:
            cost = cost_function(X, y, w, b)
            if record:
                J_history.append(cost)
            if report:
                print(f"Iteration {i}: Cost {cost}   ")

    return w, b, J_history

In [11]:
# Start from all-zero parameters. The weight vector is sized from the data
# instead of hard-coding the number of feature columns.
w_init = np.zeros(X_train.shape[1])
b_init = 0
iterations = 1000
alpha = 0.01

w_final, b_final, J_hist = gradient_descent(
    X_train, y_train, w_init, b_init,
    compute_cost, compute_gradient_derivatives,
    alpha, iterations,
)

print(f"b,w found by gradient descent: {b_final},{w_final} ")

# Compare predictions with targets for the first rows (at most 10, fewer if
# the training set is smaller).
for i in range(min(10, X_train.shape[0])):
    print(f"prediction: {np.dot(X_train[i], w_final) + b_final}, target value: {y_train[i]}")

Iteration 0: Cost 0.4834028031299344   
Iteration 100: Cost 0.2498703343180957   
Iteration 200: Cost 0.24493996922060038   
Iteration 300: Cost 0.24261342491483115   
Iteration 400: Cost 0.2413145875892021   
Iteration 500: Cost 0.24053721665385489   
Iteration 600: Cost 0.24005626085250842   
Iteration 700: Cost 0.23975374027910432   
Iteration 800: Cost 0.23956151788779184   
Iteration 900: Cost 0.2394383346302813   
b,w found by gradient descent: 5.5335768173744554e-17,[ 0.01529997  0.28408847  0.34926467  0.22092324 -0.05261841 -0.00259832] 
prediction: -0.5061484898768139, target value: -0.8851608008511882
prediction: -0.004972407630803005, target value: 1.7379468038539274
prediction: 0.3850463165261524, target value: -0.8069422504090998
prediction: -0.5342350732074809, target value: -0.4742719130844986
prediction: -0.26993466784820846, target value: -0.1788646519326076
prediction: 1.1308720468609708, target value: 1.6102302078119914
prediction: -0.06206944698790311, target value

# Sonuç

    Veri lineer regresyona pek uygun olmasa da devam etmek ve özelliklerin katsayılarını yorumlamak istedim. Farklı öğrenme oranları (learning rate) ve farklı iterasyon sayıları denememe rağmen, beklendiği üzere çok isabetli tahminler elde edemedim.

    Pozitif katsayılar büyükten küçüğe : PPG > MPG > PER > Age --> Buradan maaşlara en çok etkisi olan özelliklerin sırasıyla 'Maç başına atılan sayı' , 'Maç başına alınan süre' , 'Oyuncu verimlilik puanı' ve 'Yaş' olduğu söylenebilir.

    Negatif katsayılarda ise; 'TOPG' katsayısının mutlak değeri (≈ 0.053), 'PFG' katsayısının mutlak değerinden (≈ 0.003) daha büyük olduğu için, maç başına yapılan top kayıplarının maaşlara maç başına yapılan faullere göre daha çok (olumsuz) etki ettiği söylenebilir.