In [1]:
import pandas as pd
import numpy as np
import scipy.stats as spst
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
import glob

## 1. 활용할 데이터셋 생성
### 1-1. EDA

In [4]:
# Season CSV files to scan (adjust the glob pattern to the real data location).
file_paths = glob.glob("year/*.csv")

# Player name -> accumulated position / injury history, for players that are
# listed more than once inside a single season file.
duplicate_players = {}

for file_path in file_paths:
    df = pd.read_csv(file_path)

    # Only the name, position and injury columns are needed.
    players = df[['Name', 'Pos', 'Injury / Surgery']]

    # keep=False flags every row of a repeated name, not only the later repeats.
    repeated_mask = players.duplicated(subset='Name', keep=False)
    duplicate_names = players[repeated_mask]

    # Append each repeated row to that player's running history.
    for player_name, pos, injury in zip(
        duplicate_names['Name'],
        duplicate_names['Pos'],
        duplicate_names['Injury / Surgery'],
    ):
        history = duplicate_players.setdefault(player_name, {'pos': [], 'Injury': []})
        history['pos'].append(pos)
        history['Injury'].append(injury)

# One record per player, holding the full position and injury lists.
player_data = [
    {'Name': name, 'Pos': info['pos'], 'Injury': info['Injury']}
    for name, info in duplicate_players.items()
]

df_duplicate_players = pd.DataFrame(player_data)

# Show the result.
print(df_duplicate_players)


                Name               Pos  \
0       Luis Rengifo        [INF, INF]   
1        Chris Davis          [1B, 1B]   
2        Dillon Tate          [RP, RP]   
3       Carlos Rodón          [SP, SP]   
4        Jorge Soler  [OF, OF, OF, OF]   
..               ...               ...   
344  Vince Velasquez          [SP, SP]   
345    Lars Nootbaar          [OF, OF]   
346        Joey Bart            [C, C]   
347     Joc Pederson          [OF, OF]   
348        Alex Wood          [SP, SP]   

                                                Injury  
0             [Strained hamstring, Strained hamstring]  
1           [Patellar tendinitis, Patellar tendinitis]  
2                 [Forearm contusion, Sprained finger]  
3              [Tommy John surgery, Shoulder soreness]  
4    [Strained oblique, Strained oblique, Bilateral...  
..                                                 ...  
344             [Elbow inflammation, Elbow discomfort]  
345            [Thumb contusion, Lower 

In [7]:
# Iterating a DataFrame yields its column labels, so this lists the columns.
list(df_duplicate_players)

['Name', 'Pos', 'Injury']

In [9]:
# Ten most frequent position-history lists among the collected players.
df_duplicate_players['Pos'].value_counts().head(10)

[RP, RP]            85
[SP, SP]            66
[OF, OF]            52
[C, C]              14
[INF, INF]           9
[SP, SP, SP]         9
[INF/OF, INF/OF]     9
[OF, OF, OF]         9
[OF, OF, OF, OF]     8
[SP, SP, SP, SP]     8
Name: Pos, dtype: int64

In [11]:
import pandas as pd
import glob

# Season CSV files to scan (adjust the glob pattern to the real data location).
file_paths = glob.glob("year/*.csv")

# Position labels that mark a pitcher in the season files.
PITCHER_POSITIONS = frozenset({'SP', 'RP', 'SP/RP', 'RP/SP'})


def collect_duplicate_players(file_paths):
    """Gather position and injury histories of players repeated within a season file.

    :param file_paths: iterable of CSV paths; each file must provide the
        'Name', 'Pos' and 'Injury / Surgery' columns
    :return: dict mapping player name to {'pos': [...], 'Injury': [...]};
        entries are appended in file order, then row order
    """
    histories = {}
    for file_path in file_paths:
        season = pd.read_csv(file_path)
        players = season[['Name', 'Pos', 'Injury / Surgery']]

        # keep=False flags every row of a repeated name, not only the later repeats.
        repeated = players[players.duplicated(subset='Name', keep=False)]

        for name, pos, injury in zip(
            repeated['Name'], repeated['Pos'], repeated['Injury / Surgery']
        ):
            history = histories.setdefault(name, {'pos': [], 'Injury': []})
            history['pos'].append(pos)
            history['Injury'].append(injury)
    return histories


def is_pitcher_history(pos_list):
    """Return True when the list is non-empty and every entry is a pitcher position.

    This replaces an enumeration of exact position lists, which silently
    dropped any pitcher whose history had a different length or SP/RP mix.
    Every list the enumeration accepted is still accepted here.
    """
    return bool(pos_list) and all(pos in PITCHER_POSITIONS for pos in pos_list)


duplicate_players = collect_duplicate_players(file_paths)

# Keep pitchers only: one record per player with the full position/injury lists.
player_data = [
    {'Name': name, 'Pos': info['pos'], 'Injury': info['Injury']}
    for name, info in duplicate_players.items()
    if is_pitcher_history(info['pos'])
]

# Fixing the columns keeps the frame usable by later cells even when no
# pitcher matched (an empty list would otherwise give a frame with no columns).
df_duplicate_players = pd.DataFrame(player_data, columns=['Name', 'Pos', 'Injury'])

# Show the result.
print(df_duplicate_players)

                Name               Pos  \
0        Dillon Tate          [RP, RP]   
1       Carlos Rodón          [SP, SP]   
2       Zack Littell          [RP, RP]   
3     Yonny Chirinos          [SP, SP]   
4    Joely Rodríguez  [RP, RP, RP, RP]   
..               ...               ...   
186   Brandon Hughes          [RP, RP]   
187  Andrew Bellatti          [RP, RP]   
188   Germán Márquez          [SP, SP]   
189  Vince Velasquez          [SP, SP]   
190        Alex Wood          [SP, SP]   

                                                Injury  
0                 [Forearm contusion, Sprained finger]  
1              [Tommy John surgery, Shoulder soreness]  
2             [Strained hamstring, Elbow inflammation]  
3           [Triceps inflammation, Tommy John surgery]  
4    [Strained lat, Strained hamstring, Strained ob...  
..                                                 ...  
186             [Knee inflammation, Knee inflammation]  
187            [Strained shoulder, Tric

In [12]:
# Rich display of the pitcher-only frame built in the previous cell.
df_duplicate_players

Unnamed: 0,Name,Pos,Injury
0,Dillon Tate,"[RP, RP]","[Forearm contusion, Sprained finger]"
1,Carlos Rodón,"[SP, SP]","[Tommy John surgery, Shoulder soreness]"
2,Zack Littell,"[RP, RP]","[Strained hamstring, Elbow inflammation]"
3,Yonny Chirinos,"[SP, SP]","[Triceps inflammation, Tommy John surgery]"
4,Joely Rodríguez,"[RP, RP, RP, RP]","[Strained lat, Strained hamstring, Strained ob..."
...,...,...,...
186,Brandon Hughes,"[RP, RP]","[Knee inflammation, Knee inflammation]"
187,Andrew Bellatti,"[RP, RP]","[Strained shoulder, Triceps tendinitis]"
188,Germán Márquez,"[SP, SP]","[Strained forearm, Tommy John surgery]"
189,Vince Velasquez,"[SP, SP]","[Elbow inflammation, Elbow discomfort]"


### 1-2. CSV 생성

In [13]:
# Name and injury-history columns of the pitcher frame (input of the disabled export below).
pitchers = df_duplicate_players[['Name', 'Injury']]
# One-off export, currently disabled.
#pitchers.to_csv('pitchers.csv', index=False)
# Reload from disk (cp949-encoded); this rebinds `pitchers`, discarding the frame assigned above.
# NOTE(review): the file's first rows do not appear in df_duplicate_players, so it was
# presumably edited outside this notebook — confirm its provenance.
pitchers = pd.read_csv('pitchers.csv', encoding = 'cp949')

In [14]:
# Preview the first five rows of the reloaded pitcher table.
pitchers.head(5)

Unnamed: 0,Name,Injury
0,임찬규,"['Back tightness', 'Knee contusion' ,'Sprained..."
1,이우찬,['Elbow tendinitis']
2,최성훈,['Flexor tendon surgery']
3,백승현,['Strained oblique']
4,이민호,['Elbow inflammation']


### 1-3. 부상행렬 생성
##### pitchers의 injury 칼럼 추출 -> 변수 생성

In [20]:
import ast

# Each 'Injury' cell holds the text of a Python list; parse it back into a real
# list and keep the parsed histories as an object array, one entry per pitcher.
parsed_histories = pitchers['Injury'].apply(ast.literal_eval)
pitchers_injury = parsed_histories.to_numpy()
pitchers_injury

array([list(['Back tightness', 'Knee contusion', 'Sprained elbow']),
       list(['Elbow tendinitis']), list(['Flexor tendon surgery']),
       list(['Strained oblique']), list(['Elbow inflammation']),
       list(['Shoulder inflammation', 'Shoulder discomfort']),
       list(['Shoulder inflammation', 'Shoulder discomfort']),
       list(['Tommy John surgery', 'Shoulder soreness']),
       list(['Strained hamstring', 'Elbow inflammation']),
       list(['Triceps inflammation', 'Tommy John surgery']),
       list(['Strained lat', 'Strained hamstring', 'Strained oblique', 'Shoulder inflammation']),
       list(['Elbow discomfort', 'Tommy John surgery', 'Finger discomfort', 'Shoulder discomfort']),
       list(['Elbow inflammation', 'Elbow surgery (loose bodies)']),
       list(['Triceps tendinitis', 'Shoulder fatigue']),
       list(['Thumb surgery (laceration)', 'Strained lat']),
       list(['Back tightness', 'Strained forearm']),
       list(['Strained groin', 'Strained shoulder', 'El

In [17]:
# Injury history of the first pitcher, as a plain Python list.
list(pitchers_injury)[0]

['Back tightness', 'Knee contusion', 'Sprained elbow']

In [18]:
# Number of pitchers (one injury history each).
len(pitchers_injury)

197

In [19]:
# Plain Python list holding one injury list per pitcher.
injury_dataset = [history for history in pitchers_injury]
injury_dataset

[['Back tightness', 'Knee contusion', 'Sprained elbow'],
 ['Elbow tendinitis'],
 ['Flexor tendon surgery'],
 ['Strained oblique'],
 ['Elbow inflammation'],
 ['Shoulder inflammation', 'Shoulder discomfort'],
 ['Shoulder inflammation', 'Shoulder discomfort'],
 ['Tommy John surgery', 'Shoulder soreness'],
 ['Strained hamstring', 'Elbow inflammation'],
 ['Triceps inflammation', 'Tommy John surgery'],
 ['Strained lat',
  'Strained hamstring',
  'Strained oblique',
  'Shoulder inflammation'],
 ['Elbow discomfort',
  'Tommy John surgery',
  'Finger discomfort',
  'Shoulder discomfort'],
 ['Elbow inflammation', 'Elbow surgery (loose bodies)'],
 ['Triceps tendinitis', 'Shoulder fatigue'],
 ['Thumb surgery (laceration)', 'Strained lat'],
 ['Back tightness', 'Strained forearm'],
 ['Strained groin',
  'Strained shoulder',
  'Elbow inflammation',
  'Strained shoulder',
  'Strained shoulder'],
 ['Shoulder soreness', 'Arthroscopic shoulder surgery'],
 ['Elbow soreness', 'Arthroscopic elbow surgery'],

### 1-4. 인코딩 : TransactionEncoder

In [22]:
#!pip install mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [25]:
# Encode the injury lists as a boolean matrix: one row per pitcher,
# one column per distinct injury label.
te = TransactionEncoder()
te_result = te.fit_transform(injury_dataset)

In [28]:
# Wrap the encoded array in a DataFrame whose columns are the encoder's injury labels.
injury_matrix = pd.DataFrame(te_result, columns=te.columns_)

In [29]:
# Display the boolean pitcher-by-injury matrix.
injury_matrix

Unnamed: 0,Abscess removal procedure (thigh),Ankle contusion,Ankle discomfort,Ankle impingement,Ankle inflammation,Ankle surgery,Arm fatigue,Arthroscopic elbow surgery,Arthroscopic shoulder surgery,Back discomfort,...,Thumb surgery (torn ligament),Thumb weakness (left),Tommy John surgery,Torn flexor tendon,Torn knee ligament,Triceps inflammation,Triceps tendinitis,Triceps tightness,Ulnar nerve irritation,Wrist tendinitis
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
193,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
194,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
195,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [30]:
# Cast the boolean flags to integers (True -> 1, False -> 0).
injury_matrix = injury_matrix.astype(int)
injury_matrix

Unnamed: 0,Abscess removal procedure (thigh),Ankle contusion,Ankle discomfort,Ankle impingement,Ankle inflammation,Ankle surgery,Arm fatigue,Arthroscopic elbow surgery,Arthroscopic shoulder surgery,Back discomfort,...,Thumb surgery (torn ligament),Thumb weakness (left),Tommy John surgery,Torn flexor tendon,Torn knee ligament,Triceps inflammation,Triceps tendinitis,Triceps tightness,Ulnar nerve irritation,Wrist tendinitis
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
193,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
194,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
import numpy as np

# NumPy array of the 0/1 injury matrix; later bound to R as the ALS input.
arr1 = injury_matrix.values
arr1

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## 2. ALS(Alternating Least Squares)
### 2-1. AlternatingLeastSquares 클래스 생성

In [None]:
import numpy as np
from tqdm import tqdm_notebook as tqdm

class AlternatingLeastSquares():
    """Factorise a rating matrix as users · itemsᵀ by alternating ridge solves.

    Each sweep re-solves every user vector with the item vectors fixed, then
    every item vector with the user vectors fixed. The ratings of a row (or
    column) are also used as per-entry weights, so zero entries do not
    contribute to the fit.
    """

    def __init__(self, R, k, reg_param, epochs, verbose=False, random_state=None):
        """
        :param R: rating matrix, 2-D NumPy array (rows = users, columns = items)
        :param k: number of latent factors
        :param reg_param: ridge penalty added to the diagonal of every solve
        :param epochs: number of full user/item sweeps
        :param verbose: print the cost every 10 epochs
        :param random_state: optional seed for the latent-factor initialisation;
            None draws from NumPy's global random state, as before
        """
        self._R = R
        self._num_users, self._num_items = R.shape
        self._k = k
        self._reg_param = reg_param
        self._epochs = epochs
        self._verbose = verbose
        self._random_state = random_state


    def fit(self):
        """Run the alternating updates and record (epoch, cost) after every sweep."""
        # With no seed, keep using the global state so a caller's np.random.seed() still applies.
        if self._random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self._random_state)

        # init latent features
        self._users = rng.normal(size=(self._num_users, self._k))
        self._items = rng.normal(size=(self._num_items, self._k))

        # train while epochs
        self._training_process = []
        self._user_error = 0
        self._item_error = 0
        for epoch in range(self._epochs):
            for i, Ri in enumerate(self._R):
                self._users[i] = self.user_latent(i, Ri)
            # Only the value after the last user update was ever kept, so
            # evaluate the cost once per sweep instead of once per row.
            self._user_error = self.cost()

            for j, Rj in enumerate(self._R.T):
                self._items[j] = self.item_latent(j, Rj)
            # The cost after the item sweep is also the epoch cost.
            cost = self.cost()
            self._item_error = cost
            self._training_process.append((epoch, cost))

            # print status
            if self._verbose and ((epoch + 1) % 10 == 0):
                print("Iteration: %d ; cost = %.4f" % (epoch + 1, cost))


    def cost(self):
        """
        compute root mean square error over the non-zero entries of R
        :return: rmse cost; 0.0 when R has no non-zero entry
        """
        xi, yi = self._R.nonzero()
        if len(xi) == 0:
            # Nothing observed: avoid dividing by zero.
            return 0.0
        # Row-wise dot products give the predictions for all observed pairs at once.
        predicted = np.sum(self._users[xi] * self._items[yi], axis=1)
        residual = self._R[xi, yi] - predicted
        return np.sqrt(np.mean(residual ** 2))


    def user_latent(self, i, Ri):
        """
        Solve the weighted ridge system for one user with the item factors fixed.
        :param i: user index
        :param Ri: ratings of user i, also used as the per-item weights
        :return: updated latent vector of user i (length k)
        """
        # Scaling the rows of the item factors equals multiplying by diag(Ri)
        # without building an (items x items) matrix.
        weighted_items = self._items * np.asarray(Ri)[:, np.newaxis]
        lhs = self._items.T.dot(weighted_items) + self._reg_param * np.eye(self._k)
        rhs = weighted_items.T.dot(self._R[i])
        return np.linalg.solve(lhs, rhs)

    def item_latent(self, j, Rj):
        """
        Solve the weighted ridge system for one item with the user factors fixed.
        :param j: item index
        :param Rj: ratings of item j, also used as the per-user weights
        :return: updated latent vector of item j (length k)
        """
        # Same row-scaling trick as in user_latent, applied to the user factors.
        weighted_users = self._users * np.asarray(Rj)[:, np.newaxis]
        lhs = self._users.T.dot(weighted_users) + self._reg_param * np.eye(self._k)
        rhs = weighted_users.T.dot(self._R[:, j])
        return np.linalg.solve(lhs, rhs)


    def get_prediction(self, i, j):
        """
        get predicted rating: user_i, item_j
        :return: prediction of r_ij
        """
        return self._users[i, :].dot(self._items[j, :].T)


    def get_complete_matrix(self):
        """
        :return: complete matrix R^ (users · itemsᵀ)
        """
        return self._users.dot(self._items.T)

# Bind the 0/1 injury array as the rating matrix R for the ALS cells.
# The guard is true when this runs as a notebook cell or script, so R is set there.
if __name__ == "__main__":
    R = arr1

### 2-2. ALS 활용 학습

In [None]:
# als = AlternatingLeastSquares(R = R, reg_param = 0.01, epochs=100, verbose=True, k=3)
# als.fit()

In [None]:
## 학습시킨 matrix 저장
# arr = als.get_complete_matrix()

## array형태를 df으로 변환
# injury_recsys = pd.DataFrame(arr)
# injury_recsys.columns = injury_matrix.columns.tolist()  # injury labels; `df` is the last season CSV

##스케일링
## scaler = MinMaxScaler()

##각 열 Min-Max 스케일링 적용 (열로 스케일링 진행한 이유 -> 각 질병마다 발병 최대 가능성과 최소가능성으로 스케일링 )
##scaled_df = pd.DataFrame(scaler.fit_transform(injury_recsys.values), columns = injury_recsys.columns)

# print(injury_recsys)

In [None]:
#injury_list = injury_recsys.iloc[5].sort_values(ascending =False).head(3)
#list(injury_list.index[0:3])

### 2-3. 부상 확률 매트릭스 csv 저장

In [None]:
# 정리한 injury_recsys
# injury_recsys.to_csv('injury_recsys.csv', index=False)

### 2-4. 부상 확률 매트릭스 불러오기

In [32]:
# Load the saved score matrix from disk instead of retraining.
# NOTE(review): presumably the file written by the disabled export cell — confirm.
injury_recsys = pd.read_csv('injury_recsys.csv')

In [33]:
# Three highest-scoring columns for the row at position 5.
injury_list = injury_recsys.iloc[5].sort_values(ascending =False).head(3)

In [34]:
# Column label with the highest score for that row.
injury_list.index[0]

'Elbow discomfort'