In [1]:
import os
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# TODO: data preparation
# Build the working `matches` frame in one pass:
#   1. load the match records from ./data/82/matches.csv,
#   2. keep only rows with status == 8 (finished matches, per the original note),
#   3. discard the columns that are not used further down,
#   4. drop every row that still contains a NaN.
matches = (
    pd.read_csv(os.getcwd() + '/data/82/matches.csv')
    .loc[lambda frame: frame['status'] == 8]
    .drop(
        columns=[
            'status',
            'competition_id',
            'home_half_score',
            'home_red',
            'home_yellow',
            'home_corner',
            'away_half_score',
            'away_red',
            'away_yellow',
            'away_corner',
        ]
    )
    .dropna()
)

In [3]:
# Tally won / drawn / lost counts for a match's home team over its earlier matches.
def statisticalResults(data, dataType, source=None, limit=15):
    """Count wins, draws and losses of ``data['home_team']`` in earlier matches.

    Parameters
    ----------
    data : mapping or pandas.Series
        One match; must provide ``open_date``, ``home_team`` and ``away_team``.
    dataType : str
        ``'history'`` restricts the sample to earlier meetings with the same
        home/away pairing. Any other value (the notebook passes ``'recent'``)
        uses every earlier match the home team played, at home or away.
    source : pandas.DataFrame, optional
        Matches to search; needs ``open_date``, ``home_team``, ``away_team``,
        ``home_score`` and ``away_score``. Defaults to the notebook-level
        ``matches`` frame.
    limit : int, optional
        Only the ``limit`` latest qualifying matches are counted (default 15).

    Returns
    -------
    tuple of int
        ``(won, drawn, lost)`` from the point of view of ``data['home_team']``.
    """
    pool = matches if source is None else source

    openDate = data['open_date']
    home = data['home_team']
    away = data['away_team']

    earlier = pool['open_date'] < openDate
    if dataType == 'history':
        mask = earlier & (pool['home_team'] == home) & (pool['away_team'] == away)
    else:
        mask = earlier & ((pool['home_team'] == home) | (pool['away_team'] == home))

    # sort_values returns a new frame, so nothing below writes into a slice of
    # `pool` (the previous version assigned a column onto a .loc selection).
    latest = pool.loc[mask].sort_values(by=['open_date'], ascending=False).iloc[:limit]

    # Score each match from the team's own side: when it played away, its goals
    # are in away_score. Comparing home_score > away_score for every row would
    # count the team's away wins as losses.
    played_home = latest['home_team'] == home
    scored = latest['home_score'].where(played_home, latest['away_score'])
    conceded = latest['away_score'].where(played_home, latest['home_score'])

    won = int((scored > conceded).sum())
    drawn = int((scored == conceded).sum())
    lost = int((scored < conceded).sum())
    return won, drawn, lost
    

In [4]:
# Per-match tallies produced by statisticalResults for each row's home team.
# Each call returns three counts, unpacked into the *_won / *_drawn / *_lost columns:
#   history_* : earlier meetings with the same home/away pairing
#   recent_*  : the home team's earlier matches, whether at home or away
# statisticalResults only looks at the 15 latest qualifying matches.
# (The original notes also list "home-team scoring performance"; it is not computed here.)
(
    matches['history_won'],
    matches['history_drawn'],
    matches['history_lost'],
) = zip(*matches.apply(statisticalResults, axis=1, args=('history',)))

(
    matches['recent_won'],
    matches['recent_drawn'],
    matches['recent_lost'],
) = zip(*matches.apply(statisticalResults, axis=1, args=('recent',)))

In [None]:
# Preview matches with the history_* / recent_* count columns added by the previous cell.
matches

In [None]:
# Convert the head-to-head and recent-form tallies into home-team win rates:
# rate = won / (won + drawn + lost). A total of 0 gives 0 / 0 = NaN.
matches['history_total'] = (
    matches['history_won']
    .add(matches['history_drawn'])
    .add(matches['history_lost'])
)
matches['history_rate'] = matches['history_won'].div(matches['history_total'])

matches['recent_total'] = (
    matches['recent_won']
    .add(matches['recent_drawn'])
    .add(matches['recent_lost'])
)
matches['recent_rate'] = matches['recent_won'].div(matches['recent_total'])

In [None]:
# Drop rows with missing values. This also removes rows whose rate is NaN
# because the corresponding total was 0 (0 / 0).
matches = matches.dropna()

# Remove matches that have no earlier head-to-head or recent fixtures at all.
# The second filter must test recent_total: filtering on recent_rate == 0 would
# instead throw away every match whose home team simply has no recent wins.
matches = matches[matches['history_total'] != 0]
matches = matches[matches['recent_total'] != 0]

# Drop the helper columns and keep the two derived features
# (history_rate and recent_rate) for modelling.
matches = matches.drop(
    columns=[
        'open_date',
        'history_won',
        'history_drawn',
        'history_lost',
        'history_total',
        'recent_won',
        'recent_drawn',
        'recent_lost',
        'recent_total',
    ])

In [None]:
# Preview matches after the cleanup and helper-column removal in the previous cell.
matches

In [None]:
# Prepare the intelligence (pre-match info) data from ./data/82/info.csv.
intelligences = pd.read_csv(os.getcwd() + '/data/82/info.csv')

# Keep only the first of any rows that repeat the same
# match_id / info_type / team_info / level / content combination.
intelligences = intelligences.drop_duplicates(
    subset=['match_id', 'info_type', 'team_info', 'level', 'content'],
    keep='first',
)

# Discard level-0 items.
intelligences = intelligences.loc[intelligences['level'].ne(0)]

# Count the items in each (match_id, info_type, level) group.
# NOTE(review): the original note says the split is by home/away, good/bad and
# level, but team_info is not one of the grouping keys — confirm this is intended.
intelligences = (
    intelligences
    .groupby(['match_id', 'info_type', 'level'])['content']
    .size()
    .reset_index(name='count')
)

# Weight of each group = number of items * level.
intelligences['weights'] = intelligences['level'] * intelligences['count']

In [None]:
# TODO 構造特徵

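# Home team name
# Away team name
# Full-time goals scored by the home team
# Full-time goals scored by the away team
# Match result (H = home win, D = draw, A = away win)
# Match result (scoreline)
# Home-team goal difference
# Away-team goal difference
# Home-team cumulative points
# Away-team cumulative points
# Poisson probability of each scoreline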

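# Track each team's form over its last three matches:
# HM(n) is the result (win/draw/loss) of the home team's n-th most recent match,
# AM(n) is the result (win/draw/loss) of the away team's n-th most recent match.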

# 加入比賽輪次為特徵（第幾個輪次MW）

In [None]:
# TODO 清洗資料
# Every match's features are computed from earlier matches, so the first few records have too little history to compute them and must be discarded.

In [None]:
# TODO 分析敘述資料、解決樣本不均衡(暫定)
# 依照先前統計發現主場獲勝的比例接近 50% ，對於三分類(主、客、和)，標籤比例是不均衡的。
# 因此需簡化為二分類問題，也就是主場球隊會不會勝利。

In [None]:
# TODO 將數據拆分為「特徵值」與「標籤值」

In [None]:
# TODO 數據標準化
# Home/away goal difference and home/away cumulative points span too wide a range, so they need to be normalized.

In [None]:
# TODO 轉換特徵數據型態、預覽

In [None]:
# TODO 數據關聯圖形化描述
# 皮爾遜相關熱力圖

In [None]:
# TODO 再次清潔數據
# Remove uninformative features based on the correlation heatmap.

In [None]:
# TODO 切割數據
# 將數據集隨機分成為訓練集和測試集

In [None]:
# TODO 建立模型
# 支持向量機(SVM)

In [None]:
# TODO 結果評估

In [None]:
# TODO 參數調整

In [None]:
# TODO 結論

In [None]:
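# Graveyard 2 (dead code, kept for reference only): merging the per-match history/recent frames back into matches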
# # 合併整合資料
# consolidationDF = pd.merge(matches, historyDF, how='outer', on=['match_id']) 
# consolidationDF = pd.merge(consolidationDF, recentDF, how='outer', on=['match_id']) 
# consolidationDF = consolidationDF.dropna()

In [None]:
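# Graveyard (dead code, kept for reference only): row-by-row (iterrows) version of the history/recent tallies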
# # 各球隊作為主場球隊時 進球表現
# # 各球隊歷史交鋒(相同主客-近15場) 
# # 各球隊近期對戰(近15場) 
# history = []
# recent = []
# for index, row in matches.iterrows():
#     matchId = row['match_id']
#     openDate = row['open_date']
#     home = row['home_team']
#     away = row['away_team']
#     df = matches.loc[
#         (matches.open_date < openDate) & 
#         (matches.home_team == home) &
#         (matches.away_team == away)] 
    
#     df['result'] = np.where(
#         (df['home_score'] > df['away_score']), 1, np.where(
#         (df['home_score'] == df['away_score']), 0, 2))
    
#     df = df.sort_values(by=['open_date'],ascending=False).iloc[:15]

#     history.append({
#         'match_id': matchId,
#         'history_won': df.loc[df.result == 1].shape[0],
#         'history_drawn': df.loc[df.result == 0].shape[0],
#         'history_lost': df.loc[df.result == 2].shape[0]
#     })

#     df = pd.concat([
#         matches.loc[(matches.open_date < openDate) & (matches.home_team == home)],
#         matches.loc[(matches.open_date < openDate) & (matches.away_team == home)],
#         ], axis=0)

#     df['result'] = np.where(
#         (df['home_score'] > df['away_score']), 1, np.where(
#         (df['home_score'] == df['away_score']), 0, 2))

#     df = df.sort_values(by=['open_date'],ascending=False).iloc[:15]

#     recent.append({
#         'match_id': matchId,
#         'recent_won': df.loc[df.result == 1].shape[0],
#         'recent_drawn': df.loc[df.result == 0].shape[0],
#         'recent_lost': df.loc[df.result == 2].shape[0]
#     })

# historyDF = pd.DataFrame.from_dict(history)
# historyDF['history_rate'] = historyDF['history_won'] / (historyDF['history_won'] + historyDF['history_drawn'] + historyDF['history_lost'])

# recentDF = pd.DataFrame.from_dict(recent)
# recentDF['recent_rate'] = recentDF['recent_won'] / (recentDF['recent_won'] + recentDF['recent_drawn'] + recentDF['recent_lost'])