In [0]:
# !pip install optuna

In [0]:
import pyspark.sql.functions as F

In [0]:
import pandas as pd
# import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import pickle

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay, f1_score, roc_auc_score
# from sklearn.pipeline import Pipeline
# from sklearn.decomposition import PCA
from sklearn import svm

# import optuna

In [0]:
# pd.set_option('display.max_columns', None)

# Loading data

In [0]:
# # PATH = '../data/'
# PATH = 'https://raw.githubusercontent.com/rafabandoni/nfl-predict/refs/heads/main/data/'
# MODEL_PATH = '../model/'

## Score data
This data is the one we will use as input to our prediction

In [0]:
# Load the raw game results and keep only the columns used downstream.
score_columns = [
    'schedule_date',
    'schedule_season',
    'schedule_week',
    'schedule_playoff',
    'team_home',
    'score_home',
    'score_away',
    'team_away',
    'stadium_neutral'
]
score_historical = spark.sql(
    'select * from workspace.nfl_data.spreadspoke_scores'
).select(*score_columns)
display(score_historical)

Normalize team names so every franchise appears under a single, consistent name.

In [0]:
# The Washington franchise appears under three names in the history;
# collapse them into the current one so its games stay in a single series.
commanders_name = [
    'Washington Commanders',
    'Washington Football Team',
    'Washington Redskins'
]

# Same rewrite for the home and the away column.
for team_column in ('team_home', 'team_away'):
  score_historical = score_historical.withColumn(
    team_column,
    F.when(
      F.col(team_column).isin(commanders_name),
      F.lit('Washington Commanders')
    ).otherwise(F.col(team_column))
  )

In [0]:
def get_first_name_team(df, home_or_away):
    """Replace a full team name with its last space-separated word.

    Despite the function name, the element kept is the LAST word of the
    string (index -1 of the split), e.g. 'Washington Commanders' becomes
    'Commanders'.

    Args:
        df: Spark DataFrame holding the column to rewrite.
        home_or_away: name of the column to overwrite ('team_home' or
            'team_away' in this notebook).

    Returns:
        A new DataFrame with `home_or_away` replaced by the extracted word.
    """
    name_parts = F.split(F.col(home_or_away), ' ')
    return df.withColumn(home_or_away, F.element_at(name_parts, -1))

# Apply the same shortening to both team columns.
for side_column in ('team_home', 'team_away'):
    score_historical = get_first_name_team(score_historical, side_column)

Create the target column, which will be used to train our model.

In [0]:
# Binary target: 1 when the home team scored strictly more than the away team.
# Every other case (away win, tie) falls into the `otherwise` branch and gets 0.
home_scored_more = F.col('score_home') > F.col('score_away')
score_historical = score_historical.withColumn(
    'is_home_winner',
    F.when(home_scored_more, F.lit(1)).otherwise(F.lit(0))
)

This is our history to enrich our input. Just like the input, but with scores.

In [0]:
# Preview the cleaned game history (shortened team names plus the `is_home_winner` target).
display(score_historical)

## Stats Data
This data will enrich our input to improve our prediction.

In [0]:
# Names of the per-category stats tables in `workspace.nfl_data`.
# Iterating over this list avoids loading each dataframe individually.
# The order matters: the unpacking cell below relies on it.
data_list = [
  'defense_downs',
  'defense_fumbles',
  'defense_interceptions',
  'defense_passing',
  'defense_receiving',
  'defense_rushing',
  'defense_scoring',
  'defense_tackles',
  'offense_downs',
  'offense_passing',
  'offense_receiving',
  'offense_rushing',
  'offense_scoring',
  'special_teams_field_goals',
  'special_teams_kickoff_returns',
  'special_teams_kickoffs',
  'special_teams_punt_returns',
  'special_teams_punting',
  'special_teams_scoring',
]

In [0]:
# Load every stats table into a dict keyed by table name.
# The helpers that consume these frames (`df.columns = ...`, `.copy()`,
# `.merge(on=...)`, `df[col] / 100`) use the pandas API, so each Spark result
# is converted to pandas here.
# NOTE(review): the loop variable `data` stays bound to the last table after
# the loop. `get_df_name` appears to resolve that table through this alias,
# which would explain the `data_*` column names dropped further down —
# confirm before renaming or removing this variable.
dataframe_dict = {}
for item in data_list:
  data = spark.sql(f'select * from workspace.nfl_data.{item}').toPandas()
  dataframe_dict[item] = data

In [0]:
# Unpack dict into each of the dataframe variables.
# This relies on `dataframe_dict` preserving the insertion order of
# `data_list`; the names below must stay in that same order.
(
  defense_downs,
  defense_fumbles,
  defense_interceptions,
  defense_passing,
  defense_receiving,
  defense_rushing,
  defense_scoring,
  defense_tackles,
  offense_downs,
  offense_passing,
  offense_receiving,
  offense_rushing,
  offense_scoring,
  special_teams_field_goals,
  special_teams_kickoff_returns,
  special_teams_kickoffs,
  special_teams_punt_returns,
  special_teams_punting,
  special_teams_scoring
) = tuple(dataframe_dict.values())

In [0]:
# Checking result
# Spot-check one of the unpacked tables.
display(offense_rushing)

In [0]:
def get_df_name(df):
  """Return the name of the first global variable bound to `df`.

  The lookup compares by identity (`is`) and walks this module's globals in
  their iteration order, so when several globals point at the same object
  the earliest one wins.
  NOTE(review): any alias created before the "real" variable (for example a
  leftover loop variable) is returned instead of the intended name.

  Args:
    df: the object to look up.

  Returns:
    The global variable name as a string.

  Raises:
    IndexError: if no global variable is bound to `df`. The exception type
      is unchanged; only the message is now descriptive.
  """
  for name, value in globals().items():
    if value is df:
      return name
  raise IndexError('no global variable is bound to the given dataframe')

def change_dataframes_columns(dataframes_list):
  """Prefix each frame's columns with the frame's variable name, in place.

  The join keys 'year' and 'Team' keep their names; every other column
  becomes '<variable name>_<column>'. The prefix comes from `get_df_name`.
  Running this twice on the same frame prefixes the columns twice.

  Args:
    dataframes_list: frames whose `columns` attribute is overwritten.
  """
  keep_names = ['year', 'Team']
  for df in dataframes_list:
    prefix = get_df_name(df)
    df.columns = [
      column if column in keep_names else f'{prefix}_{column}'
      for column in df.columns
    ]

def merge_dataframes(dataframes_list):
  """Left-join a list of frames on ('year', 'Team').

  The first frame is copied and used as the base; each following frame is
  joined onto it, so only rows present in the first frame survive.

  Args:
    dataframes_list: non-empty sequence of frames sharing 'year' and 'Team'.

  Returns:
    A new merged frame; the inputs are left untouched.
  """
  join_keys = ['year', 'Team']
  merged = dataframes_list[0].copy()
  for right in dataframes_list[1:]:
    merged = merged.merge(right, how='left', on=join_keys)
  return merged

In [0]:
# All defense tables, merged into one frame keyed by ('year', 'Team').
defense_dfs = [
  defense_downs,
  defense_fumbles,
  defense_interceptions,
  defense_passing,
  defense_receiving,
  defense_rushing,
  defense_scoring,
  defense_tackles,
]

# Renames columns in place: re-running this cell prefixes them a second time.
change_dataframes_columns(defense_dfs)
defense_dataframe = merge_dataframes(defense_dfs)

In [0]:
# All offense tables, merged into one frame keyed by ('year', 'Team').
offense_dfs = [
  offense_downs,
  offense_passing,
  offense_receiving,
  offense_rushing,
  offense_scoring,
]

# Renames columns in place: re-running this cell prefixes them a second time.
change_dataframes_columns(offense_dfs)
offense_dataframe = merge_dataframes(offense_dfs)

In [0]:
# All special-teams tables, merged into one frame keyed by ('year', 'Team').
special_teams_dfs = [
  special_teams_field_goals,
  special_teams_kickoff_returns,
  special_teams_kickoffs,
  special_teams_punt_returns,
  special_teams_punting,
  special_teams_scoring
]

# Renames columns in place: re-running this cell prefixes them a second time.
change_dataframes_columns(special_teams_dfs)
special_teams_dataframe = merge_dataframes(special_teams_dfs)

In [0]:
# One row per ('year', 'Team') with defense, offense and special-teams stats.
# Left joins anchored on the defense frame: a team-season missing from it is dropped.
stats_dataframe = (
  defense_dataframe
  .merge(offense_dataframe, on=['year', 'Team'], how='left')
  .merge(special_teams_dataframe, on=['year', 'Team'], how='left')
)

Fix team names so they follow the same pattern as the score data.

In [0]:
# List the distinct team names to spot the ones that need translating.
stats_dataframe['Team'].sort_values().unique()

In [0]:
# Stats-table team names (keys) mapped to the names used in the score data (values).
teams_to_translate = {
    'Niners' : '49ers',
    'Redskins' : 'Commanders',
    'FootballTeam' : 'Commanders'
}

def translate_team_names(team_name, teams_to_translate):
  """Map `team_name` through `teams_to_translate`, leaving unknown names as is.

  Args:
    team_name: name to look up.
    teams_to_translate: dict of old name -> new name.

  Returns:
    The translated name, or `team_name` itself when it is not a key.
  """
  return teams_to_translate.get(team_name, team_name)

# Translate the 'Team' column value by value.
stats_dataframe['Team'] = stats_dataframe['Team'].map(
  lambda team_name: translate_team_names(team_name, teams_to_translate)
)

Lag the year so that last season's stats are used as features for the current season's games.

In [0]:
# Shift the stats forward one season so a game in season N joins to the stats of season N-1.
# This cell is not re-runnable: after the rename the 'year' column no longer exists.
stats_dataframe['year'] = stats_dataframe['year'] + 1 # the last year stats refers to this years game
stats_dataframe.rename(columns={'year' : 'stats_year'}, inplace=True)

This is our stats data so far.

In [0]:
# Preview the merged, lagged stats before cleaning.
stats_dataframe.head()

### Improving stats dataframe

In [0]:
# Let's make some functions to clean our stats dataframe
# Cleaning % data
def clean_percent_data(df):
  """Divide every column whose name contains '%' by 100, in place.

  Args:
    df: frame to modify; nothing is returned.
  """
  percent_columns = [name for name in df.columns if '%' in name]
  for name in percent_columns:
    df[name] = df[name] / 100

# Remove special characters from columns
def remove_special_char_columns(df):
  """Normalise column names in place.

  Each name is lower-cased, spaces become '_' and '%' becomes '_perc'
  (so 'FG %' turns into 'fg__perc').

  Args:
    df: frame whose columns are renamed; nothing is returned.
  """
  renames = {
    name: name.lower().replace(' ', '_').replace('%', '_perc')
    for name in df.columns
  }
  df.rename(columns=renames, inplace=True)

# Transform turnover in negative data
def negative_turnover_number(turnover_columns, df):
  """Convert the listed columns to integers in place, negating 'T'-flagged values.

  A string such as '45T' becomes -45 and '45' becomes 45. Values that are
  not strings (for example a column already parsed as integers) are passed
  through `int()` instead of failing on the `'T' in item` check.
  NOTE(review): in NFL stat tables a trailing 'T' on a "Lng" column usually
  marks a touchdown rather than a turnover — confirm that negating these
  values is intended.

  Args:
    turnover_columns: names of the columns to convert.
    df: frame to modify; nothing is returned.

  Raises:
    ValueError: if a value cannot be parsed as an integer.
  """
  for column in turnover_columns:
    converted = []
    for item in df[column]:
      if isinstance(item, str) and 'T' in item:
        converted.append(int(item.replace('T', '')) * -1)
      else:
        converted.append(int(item))
    df[column] = converted

In [0]:
# "Longest play" columns, named as they are AFTER `remove_special_char_columns`
# has lower-cased them — hence the call order below.
turnover_columns = [
    'defense_passing_lng',
    'defense_receiving_lng',
    'defense_rushing_lng',
    'defense_interceptions_lng',
    'offense_passing_lng',
    'offense_receiving_lng',
    'offense_rushing_lng',
    'special_teams_kickoff_returns_lng',
    'special_teams_punt_returns_lng',
    'special_teams_punting_lng'
]

# Order matters: percent columns are found by '%' in the original names,
# so they must be scaled before the rename removes that character.
clean_percent_data(stats_dataframe)
remove_special_char_columns(stats_dataframe)
negative_turnover_number(turnover_columns, stats_dataframe)

In [0]:
# Fix columns with A_M (attempts_made)
a_m_columns = [
    'special_teams_field_goals_1-19_>_a-m',
    'special_teams_field_goals_20-29_>_a-m',
    'special_teams_field_goals_30-39_>_a-m',
    'special_teams_field_goals_40-49_>_a-m',
    'special_teams_field_goals_50-59_>_a-m',
    'special_teams_field_goals_60+_>_a-m',
]

# Each value is split on '_' into "<attempts>_<made>". Storing made / attempts
# keeps a single column per distance range instead of two.
for column in a_m_columns:
  parts = stats_dataframe[column].str.split('_')
  attps_list = parts.str[0]
  matches_list = parts.str[1]

  # 0 attempts gives 0 / 0 = NaN, which is stored as a 0 success rate.
  percent_matches = (matches_list.astype('int') / attps_list.astype('int')).fillna(0)
  stats_dataframe[column] = percent_matches
  stats_dataframe.rename(
      columns={column : column.replace('a-m', 'percent_a_m')},
      inplace=True,
  )

In [0]:
# Removing non important columns (trash from the origin)
# NOTE(review): the `data_` prefix looks like it comes from `get_df_name`
# resolving one of the stats tables through the leftover loop variable `data`
# rather than from the source tables — confirm these columns are meant to go.
# This cell is not re-runnable: a second run raises KeyError.
columns_to_drop = [
    'data_fgm',
    'data_fg__perc',
    'data_xpm',
    'data_xp_pct',
    'data_kret_td',
    'data_pret_t'
]

stats_dataframe.drop(columns_to_drop,
                     axis=1,
                     inplace=True)

In [0]:
# Preview the cleaned stats frame.
stats_dataframe.head()

# Creating train, validation and test dataframe
Now we need to create a class that joins the stats with our input and creates new features where needed. The pipeline works as follows:
1. We read a dataframe with the data we want to predict;
2. We join the stats dataframe to that data;
3. We create a few more history-based features;
4. We send this data as input (one line of data).

In [0]:
# Name of the label column created on `score_historical`.
target = 'is_home_winner'

In [0]:
# train_data = score_historical[
#     (score_historical['schedule_season'] >= 2011) &
#     (score_historical['schedule_season'] < 2024)
# ]
# val_data = score_historical[score_historical['schedule_season'] == 2024]
# X_val = val_data.drop(target, axis=1)
# y_val = val_data[target]

In [0]:
# Up to here `score_historical` is a Spark DataFrame. Everything below
# (`drop(..., axis=1)`, scikit-learn, `EnrichInput`) uses the pandas API,
# so convert it once. The `hasattr` guard keeps the cell re-runnable.
if hasattr(score_historical, 'toPandas'):
    score_historical = score_historical.toPandas()

# Features are every column except the label.
X = score_historical.drop(target, axis=1)
y = score_historical[target]

In [0]:
# Random 75/25 split with a fixed seed.
# NOTE(review): the split is random rather than by season, so test games can
# pre-date training games — confirm this is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [0]:
class EnrichInput:
    """Enrich game rows with head-to-head history, last-season scoring and team stats.

    `run()` takes the games in `X`, joins the features derived from
    `historical_df` and `stats_df`, drops the identifier columns and returns
    a frame with missing values filled with 0.

    Args:
        historical_df: past games with at least 'schedule_date',
            'schedule_season', 'team_home', 'team_away', 'score_home',
            'score_away' and 'is_home_winner'. Head-to-head records are
            accumulated in row order, so the rows are assumed to be in
            chronological order — TODO confirm against the source table.
        stats_df: per-team stats with 'stats_year' and 'team' key columns.
        X: games to enrich, sharing the key columns of `historical_df`.
    """

    def __init__(self, historical_df, stats_df, X):
        self.historical_df = historical_df
        self.stats_df = stats_df
        self.X = X
        # (team_1, team_2) sorted pair -> running head-to-head record.
        self.historic = {}

    def calculate_historic(self, row):
        """Return the home team's head-to-head record BEFORE this game, then record the game.

        The returned values describe previous meetings only. The result of
        `row` itself is added to the running record after the values are
        computed, so the feature never contains the outcome it is used to
        predict, and the first meeting of a pair counts toward later games.

        Args:
            row: mapping with 'team_home', 'team_away' and 'is_home_winner'.

        Returns:
            Tuple (home win ratio over previous meetings, number of previous
            meetings). The ratio is 0.0 when the teams have not met before.
        """
        home, away = row['team_home'], row['team_away']
        team_1, team_2 = sorted([home, away])
        record = self.historic.setdefault(
            (team_1, team_2),
            {'victories_team_1': 0, 'victories_team_2': 0, 'matches': 0},
        )
        home_is_team_1 = home == team_1
        home_key = 'victories_team_1' if home_is_team_1 else 'victories_team_2'
        away_key = 'victories_team_2' if home_is_team_1 else 'victories_team_1'

        previous_matches = record['matches']
        previous_ratio = record[home_key] / previous_matches if previous_matches else 0.0

        # Anything that is not a home win (including a tie) is credited to the away team.
        record[home_key if row['is_home_winner'] else away_key] += 1
        record['matches'] += 1
        return previous_ratio, previous_matches

    def create_away_historic(self, row):
        """Return the away team's share of previous meetings, or 0 when there are none."""
        if row['matches'] > 0:
            return 1 - row['home_win_historic_confront']
        return 0

    def get_game_history(self, historical_df, X):
        """Left-join the head-to-head features onto `X`.

        Adds 'home_win_historic_confront', 'away_win_historic_confront' and
        'matches', joined on ('schedule_date', 'team_home', 'team_away').
        The running record is reset first so repeated calls give the same
        result, and `historical_df` is copied rather than modified.
        """
        self.historic = {}
        _historical_df = historical_df.copy()
        _historical_df[['home_win_historic_confront', 'matches']] = _historical_df.apply(self.calculate_historic, axis=1).to_list()
        _historical_df['away_win_historic_confront'] = _historical_df.apply(self.create_away_historic, axis=1)
        _historical_df = _historical_df[[
            'schedule_date',
            'team_home',
            'team_away',
            'home_win_historic_confront',
            'away_win_historic_confront',
            'matches',
        ]]
        return X.merge(
            _historical_df,
            on=['schedule_date', 'team_home', 'team_away'],
            how='left'
        )

    def _merge_last_season_scores(self, historic_df, X, agg, prefix):
        """Join per-team home/away score aggregates of the previous season onto `X`.

        For each side, scores are aggregated with `agg` per (season, team),
        shifted forward one season and left-joined as
        '<prefix>_score_<side>_last_season'.
        """
        enriched = X.copy()
        for side in ('home', 'away'):
            team_col, score_col = f'team_{side}', f'score_{side}'
            per_season = (
                historic_df[['schedule_season', team_col, score_col]]
                .groupby(['schedule_season', team_col], as_index=False)
                .agg(agg)
                .rename(columns={score_col: f'{prefix}_score_{side}_last_season'})
            )
            # Season N aggregates describe the games of season N + 1.
            per_season['schedule_season'] = per_season['schedule_season'] + 1
            enriched = enriched.merge(
                per_season,
                on=['schedule_season', team_col],
                how='left'
            )
        return enriched

    def average_points_per_season(self, historic_df, X):
        """Add 'avg_score_home_last_season' and 'avg_score_away_last_season' to `X`."""
        return self._merge_last_season_scores(historic_df, X, 'mean', 'avg')

    def total_points_last_season(self, historic_df, X):
        """Add 'total_score_home_last_season' and 'total_score_away_last_season' to `X`."""
        return self._merge_last_season_scores(historic_df, X, 'sum', 'total')

    def average_in_game_stats(self, stats_df, X):
        """Join `stats_df` twice, once for the home team and once for the away team.

        The second merge suffixes the overlapping stats columns with '_home_'
        and '_away_', which also produces the 'team_home_' and 'team_away_'
        key columns that `last_adjusts` removes.
        """
        X_w_stats = X.merge(
            stats_df,
            left_on=['schedule_season', 'team_home'],
            right_on=['stats_year', 'team'],
            how='left'
        ).merge(
            stats_df,
            left_on=['schedule_season', 'team_away'],
            right_on=['stats_year', 'team'],
            how='left',
            suffixes=('_home_', '_away_')
        )
        return X_w_stats

    def fix_schedule_week(self, row):
        """Collapse a week label to 'Regular' unless it is one of the playoff rounds."""
        if row not in ['Wildcard', 'Division', 'Conference', 'Superbowl']:
            return 'Regular'
        return row

    def fix_week_kind(self, X):
        """Return a copy of `X` with 'schedule_week' reduced to round names."""
        X_treated = X.copy()
        X_treated['schedule_week'] = X_treated['schedule_week'].apply(self.fix_schedule_week)
        return X_treated

    def last_adjusts(self, X):
        """Return `X` without the identifier columns that are not model features."""
        return X.drop(columns=[
            'team_home_',
            'team_away_',
            'schedule_date',
            'schedule_season',
            'team_home',
            'team_away',
        ])

    def run(self):
        """Build the feature frame for `self.X`.

        Returns:
            A new frame with the history, scoring and stats features, the
            identifier columns and 'schedule_week' removed, and NaN replaced
            by 0. The inputs passed to the constructor are not modified.
        """
        # Scores are the outcome; drop whichever of the two columns is present.
        _X = self.X.drop(columns=['score_home', 'score_away'], errors='ignore')
        _historic = self.historical_df.copy()
        _stats = self.stats_df.copy()
        final_X = self.get_game_history(_historic, _X)
        final_X = self.average_points_per_season(_historic, final_X)
        final_X = self.total_points_last_season(_historic, final_X)
        final_X = self.average_in_game_stats(_stats, final_X)
        final_X = self.fix_week_kind(final_X)
        final_X = self.last_adjusts(final_X)
        final_X = final_X.drop(columns='schedule_week')
        return final_X.fillna(0)



In [0]:
# Build the model features for the training games.
# History and stats come from the full `score_historical` / `stats_dataframe`.
X_train_enriched = EnrichInput(
    score_historical,
    stats_dataframe,
    X_train,
).run()
X_train_enriched.head()

In [0]:
# Same enrichment for the held-out test games (a fresh instance, so no state is shared).
X_test_enriched = EnrichInput(
    score_historical,
    stats_dataframe,
    X_test,
).run()
X_test_enriched.head()

# Model

In [0]:
# Carve a 20% validation set out of the training data.
# This rebinds `X_train_enriched` and `y_train`, so re-running the cell
# without re-running the cells above shrinks the training set again.
X_train_enriched, X_val, y_train, y_val = train_test_split(X_train_enriched, y_train, test_size=0.2, random_state=42)

In [0]:
def plot_results(y_test, y_pred, model):
  """Print classification metrics and draw the confusion matrix.

  Prints the F1 score, the ROC AUC computed from the predicted labels, and
  the classification report, then plots the confusion matrix.

  Args:
    y_test: true labels.
    y_pred: predicted labels.
    model: only its `classes_` attribute is read, for the matrix labels.
  """
  print(f'F1 score: {f1_score(y_test, y_pred)}')
  print(f'AUC ROC score: {roc_auc_score(y_test, y_pred)}')
  print('Classification report:')
  print(classification_report(y_test, y_pred))

  print('Confusion matrix:')
  matrix_display = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, y_pred),
    display_labels=model.classes_,
  )
  matrix_display.plot(values_format='')
  plt.show()

## Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression

In [0]:
# Try several regularisation strengths and score each on the validation set.
# Both metrics are computed from hard class predictions.
C_list = [0.001, 0.01, 0.1, 1, 10, 100]

f1_scores, auc_scores = [], []

for c in C_list:
  logistic = LogisticRegression(C=c, solver='liblinear', max_iter=250)
  logistic.fit(X_train_enriched, y_train)
  preds = logistic.predict(X_val)

  f1_scores.append(f1_score(y_val, preds))
  auc_scores.append(roc_auc_score(y_val, preds))

In [0]:
# Validation scores per C value, shown as the cell output.
pd.DataFrame({
    'C' : C_list,
    'f1' : f1_scores,
    'auc' : auc_scores,
})

In [0]:
# Refit with the chosen C and evaluate on the test set.
# C=10 is hard-coded, presumably picked from the table above — re-check it if the data changes.
logistic = LogisticRegression(max_iter=250, solver='liblinear', C=10)
logistic.fit(X_train_enriched, y_train)

logistic_predicts = logistic.predict(X_test_enriched)
plot_results(y_test, logistic_predicts, logistic)

## SVM

In [0]:
# Try several C values for an RBF SVM and score each on the validation set.
# NOTE(review): the features are not scaled here, and RBF kernels are
# sensitive to feature scale — consider fitting on standardised features.
C_list = [0.001, 0.01, 0.1, 1, 10, 100]

f1_scores = []
auc_scores = []

for c in C_list:
  svm_model = svm.SVC(kernel='rbf', C=c)
  svm_model.fit(X_train_enriched, y_train)

  preds = svm_model.predict(X_val)
  f1_scores.append(f1_score(y_val, preds))
  auc_scores.append(roc_auc_score(y_val, preds))

# The first column holds the C values (it was mislabelled 'Kernel'),
# matching the logistic regression table.
pd.DataFrame({
    'C' : C_list,
    'f1' : f1_scores,
    'auc' : auc_scores,
})

In [0]:
# Refit with the chosen C and evaluate on the test set.
# C=10 is hard-coded, presumably picked from the table above — re-check it if the data changes.
svm_model = svm.SVC(kernel='rbf', C=10)
svm_model.fit(X_train_enriched, y_train)

svm_predicts = svm_model.predict(X_test_enriched)
plot_results(y_test, svm_predicts, svm_model)

## LGBM

In [0]:
# TODO

## Neural Networks

In [0]:
# Number of input features, used as the network's input dimension.
features = len(X_train_enriched.columns)

In [0]:
# Show the feature count.
features

In [0]:
import torch

# Training configuration for the neural network.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
epochs = 35
# `batch_size` is only referenced by the commented-out DataLoader cells;
# `input_shape` is not referenced by any other cell in this notebook.
batch_size = 512
input_shape = [features]

In [0]:
def transform_tensor(df: pd.DataFrame, is_series=False):
  """Convert tabular data to a float32 torch tensor.

  Pandas objects are unwrapped to their NumPy array first, so a DataFrame,
  a Series and a plain array (such as the output of a scaler) all take the
  same path regardless of `is_series`.

  Args:
    df: pandas DataFrame/Series, or an array that supports `.astype`.
    is_series: kept for backward compatibility; it no longer changes the
      conversion.

  Returns:
    A `torch.float32` tensor with the same shape as the input.
  """
  values = df.to_numpy() if hasattr(df, 'to_numpy') else df
  return torch.tensor(values.astype('float64'), dtype = torch.float32)

In [0]:
# Fit the scaler on the training features only, then apply the SAME
# transformation to validation and test. Re-fitting on the test set
# (`fit_transform`) would standardise it with its own mean and variance, so
# the model would see test inputs on a different scale than it was trained on.
scaler = StandardScaler()
X_train_enriched_scaled = scaler.fit_transform(X_train_enriched)
X_val_scaled = scaler.transform(X_val)
X_test_enriched_scaled = scaler.transform(X_test_enriched)

In [0]:
# Scaled feature arrays as float32 tensors.
X_train_tensor = transform_tensor(X_train_enriched_scaled)
X_val_tensor = transform_tensor(X_val_scaled)
X_test_tensor = transform_tensor(X_test_enriched_scaled)

In [0]:
# Labels as float32 tensors; the training loop converts them to integer class indices.
y_train_tensor = transform_tensor(y_train, is_series=True)
y_val_tensor = transform_tensor(y_val, is_series=True)
y_test_tensor = transform_tensor(y_test, is_series=True)

In [0]:
from torch.utils.data import Dataset

class FeatureDataset(Dataset):
  """Dataset pairing a feature container with a label container by index."""

  def __init__(self, X, y):
    """Store the features `X` and the labels `y` as given."""
    self.X, self.y = X, y

  def __len__(self):
    """Number of samples, taken from the labels."""
    return len(self.y)

  def __getitem__(self, idx):
    """Return the (features, label) pair at position `idx`."""
    sample, label = self.X[idx], self.y[idx]
    return sample, label

In [0]:
# train_dataset = FeatureDataset(X_train_tensor, y_train_tensor)
# val_dataset = FeatureDataset(X_val_tensor, y_val_tensor)
# test_dataset = FeatureDataset(X_test_tensor, y_test_tensor)

In [0]:
# from torch.utils.data import DataLoader

# train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = False)
# val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False)
# test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False)

In [0]:
import torch.nn as nn

class Net(nn.Module):
  """Feed-forward classifier: input -> 128 -> 64 -> output, with ReLU between layers."""

  def __init__(self, input_dim, output_dim):
    """Create the layers.

    Args:
      input_dim: number of input features.
      output_dim: number of output logits (one per class).
    """
    super().__init__()
    self.fc1 = nn.Linear(input_dim, 128)
    self.fc2 = nn.Linear(128, 64)
    self.output = nn.Linear(64, output_dim)
    self.activation = nn.ReLU()

  def forward(self, X):
    """Return raw logits for a batch `X`; no softmax is applied."""
    hidden = self.activation(self.fc1(X))
    hidden = self.activation(self.fc2(hidden))
    return self.output(hidden)

In [0]:
from torch import optim

# Seed BEFORE building the model: the layer weights are drawn from the global
# RNG at construction time, so seeding only in the training cell would leave
# the initial weights different on every run.
torch.manual_seed(42)

# One output logit per distinct label, trained with cross-entropy.
nn_model = Net(input_dim=features, output_dim=len(y_train.unique())).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(nn_model.parameters())

In [0]:
def accuracy_fn(y_true, y_pred):
  """Return the percentage (0-100) of positions where the two tensors are equal.

  Args:
    y_true: tensor of true labels.
    y_pred: tensor of predicted labels.
  """
  matches = torch.eq(y_true, y_pred).sum().item()
  return (matches / len(y_pred)) * 100

In [0]:
torch.manual_seed(42)

# The model was moved to `device`, so the inputs and labels have to live
# there too; with CPU tensors the forward pass fails as soon as CUDA is used.
# The label conversion does not change between epochs, so it is done once.
X_train_device = X_train_tensor.to(device)
X_val_device = X_val_tensor.to(device)
labels = y_train_tensor.squeeze().long().to(device)
val_labels = y_val_tensor.to(device)

train_acc = []
val_acc = []

for epoch in range(1, epochs+1):
  nn_model.train()

  # Full-batch training: the whole training set goes through in one step per epoch.
  model_output = nn_model(X_train_device)
  y_pred = model_output.argmax(dim=1)
  acc = accuracy_fn(
      y_true=labels,
      y_pred=y_pred,
  )

  loss = criterion(model_output, labels)

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  # Validation accuracy after this epoch's update, without tracking gradients.
  nn_model.eval()
  with torch.inference_mode():
    y_val_logits = nn_model(X_val_device)
    y_val_pred = y_val_logits.argmax(dim=1)
    acc_val = accuracy_fn(
        y_true = val_labels,
        y_pred = y_val_pred
    )

  train_acc.append(acc)
  val_acc.append(acc_val)

  if epoch % 5 == 0:
    print(f'Epoch: {epoch} | Loss: {loss:.5f}, Acc: {acc:.2f}% | Val acc: {acc_val:.2f}%')

In [0]:
# Accuracy per epoch for the training and validation sets.
df = pd.DataFrame({
    'Epochs' : [i for i in range(1, epochs+1)],
    'Train Acc' : train_acc,
    'Val Acc' : val_acc,
})

plt.figure(figsize=(12,8))
sns.lineplot(data=df, x='Epochs', y='Train Acc', label='Train Acc')
sns.lineplot(data=df, x='Epochs', y='Val Acc', label='Val acc')
# Epochs are on the x axis and accuracy on the y axis (the labels were swapped).
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.show()

In [0]:
# Evaluate the network on the test set.
nn_model.eval()
with torch.inference_mode():
    # The input has to be on the same device as the model.
    y_nn_test_logits = nn_model(X_test_tensor.to(device))
    y_nn_test_pred = y_nn_test_logits.argmax(dim=1)

# The scikit-learn metrics expect host arrays, so bring the predictions back
# to the CPU. `plot_results` only reads `classes_` from its model argument;
# `svm_model` is passed because the network has no such attribute.
plot_results(y_test, y_nn_test_pred.cpu().numpy(), svm_model)

# Old

In [0]:
# def svc_objective(trial):
#     params = {
#         'C': trial.suggest_float('C', 0.001, 100),
#         'gamma': trial.suggest_float('gamma', 0.0001, 10),
#     }

#     acc_scores = []
#     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#     for train_index, test_index in cv.split(X_train_enriched, y_train):
#         X_train_fold, X_test_fold = X_train_enriched.iloc[train_index], X_train_enriched.iloc[test_index]
#         y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

#         pipe = Pipeline([
#             ('scaler', StandardScaler()),
#             ('pca', PCA()),
#             ('scv', svm.SVC(kernel='rbf', **params)),
#         ])

#         pipe.fit(X_train_fold, y_train_fold)
#         y_pred_fold = pipe.predict(X_test_fold)

#         acc = accuracy_score(y_test_fold, y_pred_fold)
#         acc_scores.append(acc)
#         # auc = roc_auc_score(y_test, y_pred_fold)
#     return np.mean(acc_scores)

# svc_study = optuna.create_study(direction='maximize', study_name='svc_objective')
# svc_study.optimize(svc_objective, n_trials=5)
# svc_best_params = svc_study.best_params
# print(f"Best parameters: {svc_best_params}")

In [0]:
# optuna.visualization.plot_optimization_history(svc_study).show()

In [0]:
# optuna.visualization.plot_param_importances(svc_study).show()

In [0]:
# optuna.visualization.plot_slice(svc_study).show()

In [0]:
# svc_model = Pipeline([
#     ('scaler', StandardScaler()),
#     ('pca', PCA()),
#     ('scv', svm.SVC(kernel='rbf', **svc_best_params)),
# ])

# svc_model.fit(X_train_enriched, y_train)
# y_pred = svc_model.predict(X_test_enriched)

# score = f1_score(y_test, y_pred)
# print(f'F1 score: {score}')
# print(f'AUC ROC score: {roc_auc_score(y_test, y_pred)}')
# print('Classification report:')
# print(classification_report(y_test, y_pred))

# print('Confusion matrix:')
# cf_matrix = confusion_matrix(y_test, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cf_matrix,
#                               display_labels=svc_model.classes_)
# disp.plot(values_format='')
# plt.show()

In [0]:
# # save
# with open(MODEL_PATH + 'model_clf.pkl','wb') as f:
#     pickle.dump(pipe_clf,f)