### Set up

In [None]:
import os 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, classification_report, confusion_matrix, roc_auc_score, accuracy_score, log_loss
from sklearn.datasets import make_classification
from sklearn.utils import shuffle

In [None]:
# Base directory for local computing: dataset paths are resolved against it
BASE_URL = os.getcwd()
# Single seed passed to every `random_state` argument in this notebook
RANDOM_SEED = 0

In [None]:
# Resolve the two CSV files of the dataset against BASE_URL
application_record_file_path, credit_record_file_path = (
    os.path.join(BASE_URL, relative_path)
    for relative_path in (
        'credit_card_approval/application_record.csv',
        'credit_card_approval/credit_record.csv',
    )
)

# Read both tables (applications first, then the monthly credit history)
application_record, credit_record = (
    pd.read_csv(csv_path)
    for csv_path in (application_record_file_path, credit_record_file_path)
)

### application_record dataset preprocessing

In [None]:
# The summary below shows that 'OCCUPATION_TYPE' has many null values.
application_record.info()

In [None]:
# In 'OCCUPATION_TYPE', replace NaN entries with 'Other'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: the in-place call operates on
# an intermediate object, which pandas warns about and which leaves the frame
# unchanged when Copy-on-Write is active.
application_record['OCCUPATION_TYPE'] = application_record['OCCUPATION_TYPE'].fillna('Other')


In [None]:
# The frame has 438,557 rows but only 438,510 distinct IDs:
# the dataset therefore contains 47 duplicates.
application_record['ID'].nunique()

In [None]:
# Order the rows by 'ID' so that repeated IDs end up next to each other
application_record = application_record.sort_values(by='ID')
# keep=False flags every occurrence of a repeated 'ID', not only the later ones
repeated_id_mask = application_record.duplicated(subset='ID', keep=False)
duplicates = application_record[repeated_id_mask]
duplicates

In [None]:
# The rows sharing an 'ID' do not seem to refer to the same customer, and the
# joint-account hypothesis can be excluded as well, because 'CNT_CHILDREN' and
# 'NAME_FAMILY_STATUS' do not coincide between them. Under these premises
# every row with a repeated 'ID' is removed (keep=False keeps none of them).
application_record = application_record.drop_duplicates(keep=False, subset='ID')

# 438,510 - 47 = 438,463 distinct IDs are left
application_record['ID'].nunique()

In [None]:
# Shorter, more readable column names (old name -> new name)
new_column_names = dict(
    CODE_GENDER='GENDER',
    AMT_INCOME_TOTAL='INCOME_TOTAL',
    FLAG_OWN_CAR='OWN_CAR',
    FLAG_OWN_REALTY='OWN_REALTY',
    NAME_INCOME_TYPE='INCOME_TYPE',
    NAME_EDUCATION_TYPE='EDUCATION_TYPE',
    NAME_FAMILY_STATUS='FAMILY_STATUS',
    NAME_HOUSING_TYPE='HOUSING_TYPE',
    CNT_FAM_MEMBERS='FAM_MEMBERS',
)

# Columns not listed above keep their current name
application_record.rename(inplace=True, columns=new_column_names)

In [None]:
# All the entries with 'INCOME_TYPE' = 'Pensioner' also show
# 'DAYS_EMPLOYED' = 365243. As this is the only positive value in the column,
# all the other customers are currently working, i.e. every customer here has
# a monthly income.

# 'AGE' in whole years: the day counts are negative, so flip the sign first
application_record['AGE'] = (
    application_record['DAYS_BIRTH'].mul(-1).div(365.25).astype(int)
)

# 'YEARS_EMPLOYED' in whole years; the positive placeholder used for pensioners
# turns into a negative number of years and is floored at 0
application_record['YEARS_EMPLOYED'] = (
    application_record['DAYS_EMPLOYED'].mul(-1).div(365.25).astype(int).clip(lower=0)
)

# The raw day counts are no longer needed
application_record = application_record.drop(columns=['DAYS_BIRTH', 'DAYS_EMPLOYED'])
application_record.head()

In [None]:
# Disabled alternative: the string below holds a LabelEncoder-based encoding of
# the categorical columns. It is an inert string literal (nothing in it runs);
# the notebook encodes the categories with explicit mappings instead.
"""
# Label Encoding Categorical Columns

from sklearn.preprocessing import LabelEncoder

def encode_label(dataframe, column):
    if column in dataframe.columns:
        label_encoder = LabelEncoder()
        dataframe[column] = label_encoder.fit_transform(dataframe[column])
    else:
        print(f"Column '{column}' not found in the DataFrame.")


categorical_columns = ['GENDER', 'OWN_CAR', 'OWN_REALTY', 'INCOME_TYPE', 'EDUCATION_TYPE', 'FAMILY_STATUS', 'HOUSING_TYPE', 'OCCUPATION_TYPE']

# Apply LabelEncoder to each categorical column
for column in categorical_columns:
    encode_label(application_record, column)

application_record.head()
"""


In [None]:
# Map categorical values
def replace_values(dataframe, column, value_mapping):
    """Recode one column of ``dataframe`` according to ``value_mapping``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; the column is overwritten on this object.
    column : str
        Name of the column to recode.
    value_mapping : dict
        Maps current values to their replacement. Values that are not keys of
        the mapping are left untouched.

    Returns
    -------
    None
        The frame is updated as a side effect.
    """
    # Assign the result back to the frame: calling replace(inplace=True) on
    # dataframe[column] acts on an intermediate object and does not update the
    # frame when pandas Copy-on-Write is active.
    # infer_objects() turns a fully recoded object column into a numeric dtype
    # and leaves columns that still contain strings unchanged.
    dataframe[column] = dataframe[column].replace(value_mapping).infer_objects()

# Binary flags
gender_mapping = {'M': 0, 'F': 1}
car_mapping = {'N': 0, 'Y': 1}
realty_mapping = {'N': 0, 'Y': 1}

# Integer codes for the categorical columns of 'application_record'
income_type_mapping = {
    'Student': 1, 'Commercial associate': 2,
    'Working': 3, 'Pensioner': 4, 'State servant': 5,
}
# 'Secondary / secondary special' and 'Higher education' share code 3
education_type_mapping = {
    'Lower secondary': 1, 'Incomplete higher': 2,
    'Secondary / secondary special': 3,
    'Higher education': 3, 'Academic degree': 4
}
# NOTE(review): 'Civil marriage' and 'Single / not married' share code 3 and
# no rationale is given for it -- confirm this is intended.
family_status_mapping = {
    'Separated': 1, 'Married': 2, 'Civil marriage': 3,
    'Single / not married': 3, 'Widow': 4
}
housing_type_mapping = {
    'Rented apartment': 1, 'Co-op apartment': 2,
    'Municipal apartment': 3, 'With parents': 4,
    'Office apartment': 5, 'House / apartment': 6
}
# 'Low-skill Laborers' used to be listed twice (codes 1 and 4). Python keeps
# the last value of a repeated key, so the effective code was always 4 and
# code 1 was never assigned. The shadowed entry is removed; every effective
# code (and the key order) is unchanged.
occupation_type_mapping = {
    'Low-skill Laborers': 4, 'Waiters/barmen staff': 2,
    'Sales staff': 3, 'Cleaning staff': 5,
    'Private service staff': 6, 'Cooking staff': 7, 'Security staff': 8,
    'Drivers': 9, 'HR staff': 10, 'Secretaries': 11, 'Core staff': 12,
    'Laborers': 13, 'Medicine staff': 14, 'Realty agents': 15,
    'Managers': 16, 'High skill tech staff': 17, 'IT staff': 18,
    'Accountants': 19, 'Other': 20,
}
# Weight of each monthly 'STATUS' code of 'credit_record'
status_mapping = {'C': 0, 'X': 0.5, '0': 1, '1': 2, '2': 4, '3': 8, '4': 16, '5': 32}

"""
'C': 0 - Paid off that month
'X': 0.5 - No loan for the month
'0': 1 - 1-29 days past due
'1': 2 - 30-59 days past due
'2': 4 - 60-89 days past overdue
'3': 8 - 90-119 days past overdue
'4': 16 - 120-149 days past overdue
'5': 32 - Overdue or bad debts, write-offs for more than 150 days

NOTE on 'STATUS' coefficients: the scores that will be created starting from
these weights will be normalized later in this work. For now, I decided
to assign arbitrary coefficients to each status, which double at every step
(geometric growth) as the customer doesn't repay his debt within a given time window.

NOTE on 'X': I decided to assign a coefficient of 0.5 to the status 'X'.
In fact, banks usually prefer clients that actually borrow money through the
credit card they've been given. Furthermore, if a client has no credit history
at all, banks will consider the release of a credit card more carefully.
For example, customer 'ID' 5001731 has a record made of 11 entries, in which the
'STATUS' is always 'X'. With the system here developed, the 'ID_SCORE' will be
(11 * 0.5) / (11) = 0.5. This won't affect the card release if all the other
parameters are good, but the client will be slightly penalized with respect to
one that already has a good credit history.

NOTE on INCOME_TYPE: I tried to encode the income types from the least stable
to the most reliable.

NOTE on EDUCATION_TYPE: I encoded the grades from the lowest to the highest,
assigning '3' to both 'Secondary / secondary special' and 'Higher education',
as they seem to refer to the same grade.
"""

# Column of 'application_record' -> mapping used to recode it (applied in order)
application_column_mappings = {
    'GENDER': gender_mapping,
    'OWN_CAR': car_mapping,
    'OWN_REALTY': realty_mapping,
    'INCOME_TYPE': income_type_mapping,
    'EDUCATION_TYPE': education_type_mapping,
    'FAMILY_STATUS': family_status_mapping,
    'HOUSING_TYPE': housing_type_mapping,
    'OCCUPATION_TYPE': occupation_type_mapping,
}
for column_name, column_mapping in application_column_mappings.items():
    replace_values(application_record, column_name, column_mapping)

# The monthly status codes of the credit history become numeric weights
replace_values(credit_record, 'STATUS', status_mapping)

application_record.head()


In [None]:
# Every column now holds numbers only: cast all of them to integers
application_record = application_record.astype(
    dict.fromkeys(application_record.columns, int)
)
application_record.dtypes

### credit_record dataset preprocessing

In [None]:
# Overview of the monthly credit history table
credit_record.info()

In [None]:
# 45,985 distinct IDs against 1,048,575 rows in total: 'MONTHS_BALANCE' holds
# one entry per month, starting from the month in which the credit card has
# been released to the customer.
credit_record['ID'].nunique()

In [None]:
# Account age in months: take the lowest 'MONTHS_BALANCE' entry of every 'ID',
# flip its sign and add 1 for the current month.
# `.min()` is used instead of `.agg(min)`: passing the Python builtin to `agg`
# is deprecated in recent pandas releases.
month_balance_sum = (
    credit_record.groupby(['ID'])['MONTHS_BALANCE']
    .min()
    .reset_index()
    .rename(columns={'MONTHS_BALANCE': 'ACCOUNT_AGE'})
)
# The lowest balance month is <= 0, so the result is a positive month count
month_balance_sum['ACCOUNT_AGE'] = -month_balance_sum['ACCOUNT_AGE'] + 1

# Attach 'ACCOUNT_AGE' to every monthly row of the same 'ID'
credit_record = pd.merge(month_balance_sum, credit_record, how='inner', on=['ID'])
# 'MONTHS_BALANCE' is not used afterwards
credit_record = credit_record.drop(columns=['MONTHS_BALANCE'])

credit_record.head()

In [None]:
# Pairwise correlations between the columns of 'credit_record'.
# Finding: there are no particular correlations between the 'ACCOUNT_AGE'
# and the debt 'STATUS'.
correlation_matrix = credit_record.corr()

# Annotated heatmap of the matrix
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Total of the 'STATUS' weights of every 'ID', stored as 'STATUS_SUM'
status_sum = (
    credit_record.groupby(['ID'])['STATUS']
    .sum()
    .reset_index()
    .rename(columns={'STATUS': 'STATUS_SUM'})
)
# Attach the total to every row of the same 'ID'; the per-month 'STATUS'
# is dropped right after the join
credit_record = pd.merge(status_sum, credit_record, how='inner', on=['ID']).drop(
    columns=['STATUS']
)

credit_record.head()

In [None]:
# Identical rows are collapsed into one
credit_record.drop_duplicates(inplace=True)

# 'CREDIT_SCORE' = 'STATUS_SUM' / 'ACCOUNT_AGE', rounded to two decimals.
# The lower the score, the better.
credit_record['CREDIT_SCORE'] = (
    credit_record['STATUS_SUM'].div(credit_record['ACCOUNT_AGE']).round(2)
)

credit_record.head()


In [None]:
# 'STATUS_SUM' was only needed to compute the score
credit_record = credit_record.drop(columns='STATUS_SUM')

# Highest (worst) scores first
sorted_credit_record = credit_record.sort_values('CREDIT_SCORE', ascending=False)
sorted_credit_record.head()

In [None]:
"""
NOTE: A first threshold has been set, corresponding to 'CREDIT_SCORE' = 1.
In particular, if a customer has 'CREDIT_SCORE' >= 1 (the weight originally assigned to
'STATUS' = '0': 1-29 days past due) he's going to be considered a bad customer.
Here are some examples of 'CREDIT_SCORE' and its corresponding normalized value:
1 = 0.034674
2 = 0.069348
3 = 0.104022
4 = 0.138696
If the coefficient is < 1, card will be released.
"""

### Creating new datasets

In [None]:
# Target column: 1 when 'CREDIT_SCORE' is below 1, 0 otherwise
credit_record['IS_GOOD'] = credit_record['CREDIT_SCORE'].lt(1).astype(int)
# The score itself is not kept
credit_record = credit_record.drop(columns=['CREDIT_SCORE'])
credit_record.head()


In [None]:
# Keep only the customers present in both tables (inner join on 'ID')
data = application_record.merge(credit_record, how='inner', on='ID')
data.head()

In [None]:
# Number of IDs that appear in both datasets
credit_ids = set(credit_record['ID'])
application_ids = set(application_record['ID'])
len(credit_ids & application_ids)

In [None]:
# 36457 unique ID values: the same number as the records that match between
# the two datasets.
data['ID'].nunique()

In [None]:
# Row count per target class ('IS_GOOD' is 1 when the credit score is below 1, else 0)
data['IS_GOOD'].value_counts()

In [None]:
# Percentage of rows with 'IS_GOOD' = 0. It is computed from `data` instead of
# hard-coded counts, so the figure stays correct when the input files or the
# preprocessing change. The run documented in this notebook reported
# 4862 of 36457 rows, i.e. about 13.34%: the dataset is highly unbalanced.
(data['IS_GOOD'] == 0).mean() * 100

In [None]:
# Correlation matrix of 'data'.
# 'ID' is left out when the column exists; errors='ignore' makes the drop a
# no-op when it does not, so the whole frame is used in that case.
data_corr = data.drop(columns=['ID'], errors='ignore')
correlation_matrix = data_corr.corr()

# Annotated heatmap of the matrix
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("'data' Correlation Matrix")
plt.show()


In [None]:
# 'FLAG_MOBIL' has been excluded from the correlation matrix, as the only
# value it takes is 1. For this reason the column is going to be dropped.
data['FLAG_MOBIL'].value_counts()

In [None]:
# Columns that are not useful for the model.
# 'ACCOUNT_AGE' is not part of the list and stays in 'data'.
unused_columns = [
    'GENDER', 'CNT_CHILDREN', 'FAMILY_STATUS', 'HOUSING_TYPE',
    'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL',
    'OCCUPATION_TYPE',
]
data = data.drop(columns=unused_columns)

In [None]:
# Save 'data' as 'data.csv' (path relative to the working directory), without the index
data.to_csv('data.csv', index=False)

In [None]:
# Bar chart with the number of rows per 'IS_GOOD' class
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=data, x='IS_GOOD', palette='Set2', ax=ax)
plt.show()

In [None]:
# Left anti join: keep the entries of 'application_record' that have no match
# in 'credit_record'. The predictions will be made on this dataset.
left_joined = application_record.merge(credit_record, on='ID', how='left', indicator=True)
df_pred = left_joined.query('_merge == "left_only"')

# Remove the '_merge' indicator, the columns not used by the model and the two
# columns coming from 'credit_record'
columns_to_discard = [
    '_merge', 'GENDER', 'CNT_CHILDREN', 'FAMILY_STATUS', 'HOUSING_TYPE',
    'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL',
    'ACCOUNT_AGE', 'OCCUPATION_TYPE', 'IS_GOOD',
]
df_pred = df_pred.drop(columns=columns_to_discard)

df_pred.head()

In [None]:
# Save 'df_pred' as 'df_pred.csv' (path relative to the working directory),
# without the index, then preview its first rows
df_pred.to_csv('df_pred.csv', index=False)
df_pred.head()

### Train-test split

In [None]:
# Train-test split (70% / 30%).
# `stratify=y` preserves the distribution of the target variable 'IS_GOOD' in
# both the training and the testing set. Shuffling is left at the
# train_test_split default, to avoid any possible ordering in the dataset.
feature_frame = data.drop(columns=['ID', 'IS_GOOD'])
x = feature_frame.to_numpy()
y = data['IS_GOOD'].to_numpy()

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=RANDOM_SEED,
)

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Given our primary objective, i.e. to understand why a customer's credit card
# request has not been accepted, the training set is rebalanced with SMOTE
# (Synthetic Minority Oversampling Technique). SMOTE generates synthetic
# minority class observations by interpolating between existing minority class
# samples, so the model can learn from a more representative set of examples.
# NOTE(review): the resampling runs on the unscaled features -- confirm that
# this ordering with respect to the StandardScaler step is intended.
smote = SMOTE(k_neighbors=3, random_state=RANDOM_SEED)
x_train, y_train = smote.fit_resample(x_train, y_train)

# Checking the new class counts
for class_label, caption in ((0, 'New class 0:'), (1, 'New class 1:')):
    print(caption, sum(y_train == class_label))


In [None]:
# Learn the scaling statistics on the training set only, then apply the same
# transformation to both splits
ss = StandardScaler().fit(x_train)
x_train = ss.transform(x_train)
x_test = ss.transform(x_test)

### Random Forest Classifier

In [None]:
"""
The model I chose to predict my results is the RandomForestClassifier. 
The confusion matrix shows this is the model that reduces false positives the most.
Indeed, a model that minimizes the risk of releasing the
credit card to a customer who is not actually a good one must be preferred.

Regarding the reasons for the denial, the feature 'YEARS_EMPLOYED' is definitely
the most influential on the decision, showing that a stable job position matches 
a higher chance to repay expenditures.
"""

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# Hyperparameters of the forest, collected in one place
random_forest_settings = dict(
    n_estimators=256,
    max_features='sqrt',
    max_depth=12,
    random_state=RANDOM_SEED,
    min_samples_split=4,
    min_samples_leaf=16,
    class_weight={0: 1, 1: 0.5},
    n_jobs=-1,
    criterion='entropy',
)
RanFor = RandomForestClassifier(**random_forest_settings)
RanFor.fit(x_train, y_train)

In [None]:
# Hard predictions and class probabilities on both splits
y_pred_train, y_proba_train = RanFor.predict(x_train), RanFor.predict_proba(x_train)
y_pred_test, y_proba_test = RanFor.predict(x_test), RanFor.predict_proba(x_test)

# One classification report per split
for report_title, y_true, y_predicted in (
    ("TRAIN REPORT - RandomForestClassifier", y_train, y_pred_train),
    ("TEST REPORT - RandomForestClassifier", y_test, y_pred_test),
):
    print(report_title)
    print(classification_report(y_true, y_predicted))

In [None]:
# Access the feature importances
feature_importances = RanFor.feature_importances_

# Take the feature names from the same selection that built the training
# matrix `x` (all columns of `data` except 'ID' and 'IS_GOOD', in that order).
# A hand-written list is easy to get out of sync with the training columns,
# and zip() would then silently pair every importance with the wrong label.
feature_names = data.drop(columns=['ID', 'IS_GOOD']).columns.tolist()

# zip() truncates silently, so make a length mismatch an explicit error
if len(feature_names) != len(feature_importances):
    raise ValueError(
        f"{len(feature_names)} feature names for "
        f"{len(feature_importances)} feature importances"
    )

# List of (feature name, importance) tuples, from highest to lowest importance
feature_importance_tuples = sorted(
    zip(feature_names, feature_importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the sorted feature importances
for feature, importance in feature_importance_tuples:
    print(f"{feature}: {importance}")

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the training data
cm_train = confusion_matrix(y_train, y_pred_train)
# Confusion matrix of the test data
cm_test = confusion_matrix(y_test, y_pred_test)

for caption, matrix in (
    ("Confusion Matrix (Train):\n", cm_train),
    ("Confusion Matrix (Test):\n", cm_test),
):
    print(caption, matrix)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# ROC curve and AUC on the test data, from the probability of class 1
positive_class_proba = y_proba_test[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_class_proba)
auc = roc_auc_score(y_test, positive_class_proba)

# Plot the curve with the AUC in the legend
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()


In [None]:
# Apply the model to the applicants without credit history ('df_pred')
df_pred_id = df_pred['ID']

# The model must receive the same columns, in the same order, as the training
# matrix `x` (all columns of `data` except 'ID' and 'IS_GOOD'). Passing
# 'df_pred' as is would feed 'ID' to the model as if it were a feature.
feature_columns = data.drop(columns=['ID', 'IS_GOOD']).columns
# Columns of the training layout that 'df_pred' does not have (it was built
# without 'ACCOUNT_AGE') are filled with 0.
# NOTE(review): 0 assumes "no account yet" for these applicants -- confirm that
# this is the intended value, or retrain the model without 'ACCOUNT_AGE'.
x_pred = df_pred.reindex(columns=feature_columns, fill_value=0).values
# The model was fitted on standardized features: reuse the fitted scaler
x_pred = ss.transform(x_pred)

df_pred_pred = RanFor.predict(x_pred)
df_pred_pred_proba = RanFor.predict_proba(x_pred)

# Creating a new dataframe for the predictions
df_predictions = pd.DataFrame({
    'ID': df_pred_id,
    'IS_GOOD': df_pred_pred,
    'Probability_0': df_pred_pred_proba[:, 0],
    'Probability_1': df_pred_pred_proba[:, 1]
})

# os.path.join inserts the path separator that plain string concatenation
# with BASE_URL was missing (the file used to land next to the working
# directory under a mangled name)
df_predictions.to_excel(os.path.join(BASE_URL, 'credit_card_predictions.xlsx'), index=False)

In [None]:
# Applicants predicted as bad customers ('IS_GOOD' equal to 0)
df_predictions.loc[df_predictions['IS_GOOD'].eq(0)]

### GradientBoosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Hyperparameters of the gradient boosting model
gradient_boosting_settings = dict(
    n_estimators=200,
    learning_rate=0.01,
    max_depth=8,
    random_state=RANDOM_SEED,
)
GradientBoost = GradientBoostingClassifier(**gradient_boosting_settings)
GradientBoost.fit(x_train, y_train)

In [None]:
# Hard predictions and class probabilities on both splits
y_pred_train, y_proba_train = GradientBoost.predict(x_train), GradientBoost.predict_proba(x_train)
y_pred_test, y_proba_test = GradientBoost.predict(x_test), GradientBoost.predict_proba(x_test)

# One classification report per split
for report_title, y_true, y_predicted in (
    ("TRAIN REPORT - GradientBoostingClassifier", y_train, y_pred_train),
    ("TEST REPORT - GradientBoostingClassifier", y_test, y_pred_test),
):
    print(report_title)
    print(classification_report(y_true, y_predicted))

### CatBoost Classifier

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost classifier created with verbose=50 and otherwise default settings
CatBst = CatBoostClassifier(verbose=50)

# NOTE(review): the test split is passed as eval_set for early stopping, so it
# takes part in choosing when training stops; the test metrics reported for
# this model are presumably optimistic -- consider a separate validation split.
CatBst.fit(x_train, y_train, eval_set = (x_test, y_test), early_stopping_rounds=10)

In [None]:
# Hard predictions and class probabilities on both splits
y_pred_train, y_proba_train = CatBst.predict(x_train), CatBst.predict_proba(x_train)
y_pred_test, y_proba_test = CatBst.predict(x_test), CatBst.predict_proba(x_test)

# One classification report per split
for report_title, y_true, y_predicted in (
    ("TRAIN REPORT - CatBoostClassifier", y_train, y_pred_train),
    ("TEST REPORT - CatBoostClassifier", y_test, y_pred_test),
):
    print(report_title)
    print(classification_report(y_true, y_predicted))

### SVM

In [None]:
from sklearn.svm import SVC

# Grid Search Cross-Validation over 'C' and 'gamma' for an RBF-kernel SVM.
# probability=True enables probability estimates (predict_proba).
param_grid = dict(C=[1.0, 10.0], gamma=[0.01, 0.1])
base_svc = SVC(kernel='rbf', probability=True)
grid_search = GridSearchCV(base_svc, param_grid, cv=3)
grid_search.fit(x_train, y_train)

# Estimator with the best parameter combination
svc = grid_search.best_estimator_

In [None]:
# Cross-validation scores collected by the grid search
cv_results = grid_search.cv_results_
mean_test_scores = cv_results['mean_test_score']
std_test_scores = cv_results['std_test_score']

# One line per hyperparameter combination: mean score, standard deviation, parameters
for position, params in enumerate(cv_results['params']):
    print(f"Mean Test Score: {mean_test_scores[position]:.4f}, Std Test Score: {std_test_scores[position]:.4f}, Params: {params}")


In [None]:
# Print the estimator stored in `svc`, i.e. `grid_search.best_estimator_`
print(svc)

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
# SVM with C=10.0 and gamma=0.1 (described as the grid-search best estimator)
svc_1 = SVC(kernel='rbf', probability=True, C=10.0, gamma=0.1)

# Perform k-fold cross-validation on the model defined in this cell.
# The estimator passed to cross_validate must be `svc_1`: passing `svc` would
# score the grid-search estimator again and ignore the settings above.
scoring_metrics = ['accuracy', 'neg_log_loss']
cv_results = cross_validate(svc_1, x_train, y_train, cv=5, scoring=scoring_metrics, return_train_score=True)

train_accuracy = cv_results['train_accuracy']
test_accuracy = cv_results['test_accuracy']
# The scorer returns the negated log loss: flip the sign back
train_log_loss = -cv_results['train_neg_log_loss']
test_log_loss = -cv_results['test_neg_log_loss']

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("Train Log Loss:", train_log_loss)
print("Test Log Loss:", test_log_loss)


In [None]:
# SVM with more generalized settings (C=1, gamma=0.01)
svc_2 = SVC(kernel='rbf', probability=True, C=1, gamma=0.01)

# Perform k-fold cross-validation on the model defined in this cell.
# The estimator passed to cross_validate must be `svc_2`: passing `svc` would
# score the grid-search estimator again and ignore the settings above.
scoring_metrics = ['accuracy', 'neg_log_loss']
cv_results = cross_validate(svc_2, x_train, y_train, cv=5, scoring=scoring_metrics, return_train_score=True)

train_accuracy = cv_results['train_accuracy']
test_accuracy = cv_results['test_accuracy']
# The scorer returns the negated log loss: flip the sign back
train_log_loss = -cv_results['train_neg_log_loss']
test_log_loss = -cv_results['test_neg_log_loss']

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("Train Log Loss:", train_log_loss)
print("Test Log Loss:", test_log_loss)

In [None]:
"""
The most notable difference is in the test log loss. The new model has considerably lower 
test log loss values compared to the previous model, which means better probability estimates.
"""

In [None]:
# SVM with more balanced settings (C=1, gamma=0.1)

svc_3 = SVC(kernel='rbf', probability=True, C=1, gamma=0.1)

# Perform k-fold cross-validation on the model defined in this cell.
# The estimator passed to cross_validate must be `svc_3`: passing `svc` would
# score the grid-search estimator again and ignore the settings above.
scoring_metrics = ['accuracy', 'neg_log_loss']
cv_results = cross_validate(svc_3, x_train, y_train, cv=5, scoring=scoring_metrics, return_train_score=True)

train_accuracy = cv_results['train_accuracy']
test_accuracy = cv_results['test_accuracy']
# The scorer returns the negated log loss: flip the sign back
train_log_loss = -cv_results['train_neg_log_loss']
test_log_loss = -cv_results['test_neg_log_loss']

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("Train Log Loss:", train_log_loss)
print("Test Log Loss:", test_log_loss)

In [None]:
# Fit svc_3 on the full (resampled and standardized) training set
svc_3.fit(x_train, y_train)


In [None]:
# Hard predictions and class probabilities on both splits
y_pred_train, y_proba_train = svc_3.predict(x_train), svc_3.predict_proba(x_train)
y_pred_test, y_proba_test = svc_3.predict(x_test), svc_3.predict_proba(x_test)

# One classification report per split
for report_title, y_true, y_predicted in (
    ("TRAIN REPORT - SVC", y_train, y_pred_train),
    ("TEST REPORT - SVC", y_test, y_pred_test),
):
    print(report_title)
    print(classification_report(y_true, y_predicted))