In [1]:
# Importowanie potrzebnych pakietów:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns

from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, scale, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.utils import class_weight
from sklearn.metrics import accuracy_score, roc_curve, f1_score, roc_auc_score, classification_report
from xgboost.sklearn import XGBClassifier
from scipy.stats import norm
from scipy import stats
from sklearn.svm import SVC

from warnings import filterwarnings
filterwarnings("ignore")

# Ustawienia wyświetlania data frame'ów:
pd.set_option('display.max_columns', 50) # żeby nie ucinało kolumn przy wyświetlaniu
pd.set_option('float_format', '{:.2f}'.format) # floaty mają się zaogrąglać do dwóch miejść po przecinku

In [2]:
URL = 'https://raw.githubusercontent.com/saimadhu-polamuri/DataHakthon3X/master/dataSet/Train.csv'

def extract(url: str) -> pd.DataFrame:
    if url[:8] == 'https://' and url[-4:] == '.csv':
        return pd.read_csv(url, encoding='latin1', skipinitialspace=True)
    else:
        raise UnsupportedFormatError

def split_data(data: pd.DataFrame) -> pd.DataFrame:
    y = data.Disbursed
    X = data.drop(['Disbursed'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
    return X_train, X_test, y_train, y_test

def print_df_description(df: pd.DataFrame) -> pd.DataFrame:
    df_size = df.shape[0]
    col_types = [df[column].dtype for column in df.columns]
    count_nan = [df[column].isna().sum(axis=0) for column in df.columns]
    nan_percentage = [n / df_size * 100 for n in count_nan]
    count_zeros = [(df.shape[0] - len(df[column].nonzero()[0])) for column in df.columns]
    zeros_percentage = [z / df_size * 100 for z in count_zeros]
    unique_values_num = [np.unique(df[column].astype(str)).size for column in df.columns]
    unique_values = [np.unique(df[column].astype(str)) for column in df.columns]
    df_describe = pd.DataFrame({
        'Column_names': df.columns, 'Column_types': col_types, 
        'Num_of_NaN': count_nan, 'NaN_%': nan_percentage, 
        'Num_of_zeros': count_zeros, 'Zeros_%': zeros_percentage, 
        'Unique_val_amt': unique_values_num, 'Unique_values': unique_values})
    return df_describe

In [3]:
data = extract(URL) # wczytanie danych z linku poprzez funkcję własną
X_train, X_test, y_train, y_test = split_data(data) # podział danych na zbiór treningowy i testowy

In [4]:
def get_unique_values(column: pd.DataFrame) -> dict:
    """
    Funkcja, która dla danej zmiennej w postaci pd.DataFrame tworzy słownik unikalnych wartości w taki sposób, 
    że kluczami będą kolejne liczby, zaś wartościami kolejne unikalne wartości zmiennej.
    Przykład.
    Dla zmiennej określającej płeć (Female / Male) wynik będzie następujący: {0: 'Female', 1: 'Male'}
    """
    unique_values = np.unique(column)
    wynik = {}
    for val in range(unique_values.size):
        wynik[val] = [unique_values[val]]
    return wynik

def get_groups(groups: dict) -> dict:
    """
    Funkcja, która stworzy schemat, dzięki któremu zmienna w pd.DataFrame będzie mogła zostać "zmapowana".
    Schemat tworzony jest na podstawie słownika, będącego wynikiem funkcji `get_unique_values`. 
    """
    wynik = {}
    for key, value in groups.items():
        for name in value:
            wynik[name] = int(key)
    return wynik

In [5]:
# schematy do mapowania zmiennych kategorycznych
gender_schema = get_groups(get_unique_values(X_train['Gender']))
mobile_verified_schema = get_groups(get_unique_values(X_train['Mobile_Verified']))
var1_schema = get_groups(get_unique_values(X_train['Var1']))
filled_form_schema = get_groups(get_unique_values(X_train['Filled_Form']))
device_type_schema = get_groups(get_unique_values(X_train['Device_Type']))
var2_schema = get_groups(get_unique_values(X_train['Var2']))

# tworzę ręcznie słownik dla zmapowania miesięcy, które w zmiennej są wpisane jako pierwsze 3 litery miesiąca
MONTH_GROUPS = {1: ['Jan'], 2: ['Feb'], 3: ['Mar'], 4: ['Apr'], 5: ['May'], 6: ['Jun'], 
                7: ['Jul'], 8: ['Aug'], 9: ['Sep'], 10: ['Oct'], 11: ['Nov'], 12: ['Dec'],}
months_schema = get_groups(MONTH_GROUPS)

In [6]:
def transform(df: pd.DataFrame) -> pd.DataFrame:
    YEAR = datetime.datetime.now().year

    df['Gender'].replace(gender_schema, inplace=True)
    df['Mobile_Verified'].replace(mobile_verified_schema, inplace=True)
    df['Var1'].replace(var1_schema, inplace=True)
    df['Filled_Form'].replace(filled_form_schema, inplace=True)
    df['Device_Type'].replace(device_type_schema, inplace=True)
    df['Var2'].replace(var2_schema, inplace=True)

    df.DOB = df.DOB.apply(lambda x: YEAR - int('19' + str(x[-2:])))

    df.Lead_Creation_Date = df.Lead_Creation_Date.apply(lambda x: (x[3:-3]))
    df['Lead_Creation_Date'].replace(months_schema, inplace=True)
    df.Lead_Creation_Date = df.Lead_Creation_Date.astype(int)

    df.Source = df.Source.apply(lambda x: x[1:]).astype(int)
    
    df.City = df.City.astype(str)
    df.City = df.City.apply(lambda x: x.upper()).astype(str)

    df.Employer_Name = df.Employer_Name.astype(str)
    df.Employer_Name = df.Employer_Name.apply(lambda x: x.upper())

    df.Salary_Account = df.Salary_Account.astype(str)
    df.Salary_Account = df.Salary_Account.apply(lambda x: x.upper())

    df.drop(['ID', 'LoggedIn'], axis=1, inplace=True)
    
    return df

In [7]:
# obrabiam wstępnie dane treningowe oraz wyświetlam opis danych
transform(X_train)
# print_df_description(X_train)

Unnamed: 0,Gender,City,Monthly_Income,DOB,Lead_Creation_Date,Loan_Amount_Applied,Loan_Tenure_Applied,Existing_EMI,Employer_Name,Salary_Account,Mobile_Verified,Var5,Var1,Loan_Amount_Submitted,Loan_Tenure_Submitted,Interest_Rate,Processing_Fee,EMI_Loan_Submitted,Filled_Form,Device_Type,Var2,Source,Var4
83041,1,CHENNAI,210000,36,7,200000.00,0.00,0.00,ALKEM LABORATORIES LTD,STATE BANK OF INDIA,1,17,13,200000.00,4.00,,,,0,1,6,122,3
34474,0,DELHI,10500,25,6,100000.00,2.00,3500.00,PAISLEY EXPORT PVT LTD,NAN,0,0,13,,,,,,0,1,1,133,1
48453,1,MUMBAI,48000,29,6,0.00,0.00,0.00,BNP PARIBAS INDIA SOLUTIONS PVT LTD,AXIS BANK,1,8,2,730000.00,4.00,14.85,14600.00,20260.98,1,0,2,122,5
27735,1,DELHI,23243,30,6,0.00,0.00,0.00,EXL SERVICE.COM INDIA PVT LTD,ICICI BANK,1,3,1,360000.00,5.00,16.75,7200.00,8898.61,1,0,2,133,5
82812,1,MUMBAI,129500,47,7,1000000.00,5.00,38000.00,TYPE SLOWLY FOR AUTO FILL,ICICI BANK,1,13,13,1000000.00,5.00,,,,0,1,6,122,3
41151,0,INDORE,38500,40,6,300000.00,5.00,18600.00,STI INDIA LTD,AXIS BANK,0,0,13,,,,,,0,1,1,133,1
23024,0,NAGPUR,14500,40,5,100000.00,3.00,0.00,AARPEE ENTERPRISES,NAN,0,0,13,,,,,,0,1,1,133,1
83206,0,HYDERABAD,19000,27,7,0.00,0.00,0.00,TECH MAHINDRA LTD,ICICI BANK,1,2,13,320000.00,4.00,,,,0,0,6,122,3
45644,0,MUMBAI,35000,37,6,300000.00,1.00,20000.00,INGRAM MICRO INDIA LTD,HDFC BANK,0,0,13,,,,,,0,1,1,133,1
62154,0,MUMBAI,12000,28,7,70000.00,1.00,5000.00,SUBHASH BHAGATE,ICICI BANK,0,0,13,,,,,,0,1,6,122,1


In [8]:
# tworzę listy najczęściej występujących miast i banków
df_cities = pd.DataFrame(X_train.City.value_counts())
often_cities_list = (df_cities[(df_cities['City'] > 250)].index)

df_salary_accounts = pd.DataFrame(X_train.Salary_Account.value_counts())
often_banks_list = (df_salary_accounts[(df_salary_accounts['Salary_Account'] > 250)].index)

In [9]:
# obsługa outlier'ów - w pierwszej kolejności wypiszę rozkłądy ćwiartkowe i podstawowe statystyki
outliers = ['Monthly_Income', 'Loan_Amount_Applied', 'Existing_EMI', 'Loan_Amount_Submitted', 
            'Interest_Rate', 'Processing_Fee', 'EMI_Loan_Submitted']
Q1 = X_train.quantile(0.25)
Q2 = X_train.quantile(0.5)
Q3 = X_train.quantile(0.75)
IQR = Q3 - Q1

In [10]:
mins = np.min(X_train) # minimalne wartości zmiennych
maxs = np.max(X_train) # maksymalne -||-
means = np.mean(X_train) # średnie -||-
left = Q1 - 1.5 * IQR # wyliczam lewy brzeg zakresu, który należy uznać za normalny dla zmiennych
right = Q3 + 1.5 * IQR # analogicznie wyliczam prawy brzeg
std = np.std(X_train) # odchylenie standardowe

In [11]:
# wyświetlam powyższe wyliczenia dla zmiennych z outlier'ami
pd.DataFrame({'Q1': Q1[outliers], 'Q2': Q2[outliers], 'Q3': Q3[outliers], 'IQR': IQR[outliers], 
              'min': mins[outliers], 'max': maxs[outliers], 'std': std[outliers], 'mean': means[outliers],
              'left (Q1 - 1.5x IQR)': left[outliers], 'right (Q3 + 1.5x IQR)': right[outliers]})

Unnamed: 0,Q1,Q2,Q3,IQR,min,max,std,mean,left (Q1 - 1.5x IQR),right (Q3 + 1.5x IQR)
Monthly_Income,16500.0,25000.0,40000.0,23500.0,0.0,444554443.0,2428931.76,62512.21,-18750.0,75250.0
Loan_Amount_Applied,0.0,100000.0,300000.0,300000.0,0.0,10000000.0,355472.19,230043.25,-450000.0,750000.0
Existing_EMI,0.0,0.0,3500.0,3500.0,0.0,10000000.0,44280.53,3740.95,-5250.0,8750.0
Loan_Amount_Submitted,200000.0,300000.0,500000.0,300000.0,50000.0,3000000.0,308807.21,395413.81,-250000.0,950000.0
Interest_Rate,15.25,18.0,20.0,4.75,11.99,37.0,5.84,19.2,8.12,27.12
Processing_Fee,2000.0,4000.0,6250.0,4250.0,200.0,50000.0,4752.86,5155.66,-4375.0,12625.0
EMI_Loan_Submitted,6492.52,9431.45,12970.28,6477.76,1176.41,144748.28,7578.72,11028.62,-3224.12,22686.92


In [12]:
# tworzę granice "normalności", które przyjmę dla zmiennych odstających
right_MonInc = right['Monthly_Income']
right_LoanAmtApp = right['Loan_Amount_Applied']
right_ExEMI = right['Existing_EMI']
right_LoanAmtSub = right['Loan_Amount_Submitted']
right_IntRate = right['Interest_Rate']
left_IntRate = left['Interest_Rate']
right_ProcFee = right['Processing_Fee']
right_EMISub = right['EMI_Loan_Submitted']

In [13]:
# grupy do zmapowania pozostałych zmiennych kategorycznych na numeryczne wg wcześniej ustalonej listy unikatów
city_schema = get_groups(get_unique_values(often_cities_list))
city_schema['OTHERS'] = 30
salary_schema = get_groups(get_unique_values(often_banks_list))
salary_schema['OTHERS'] = 30

#### **Tutaj dla braków danych zastosuję średnią zamiast zer oraz usunę zmienne, o których niewiele wiemy (np. Var1).**

In [14]:
def transform1(df: pd.DataFrame) -> pd.DataFrame:
    df.City = df.City.apply(lambda x: x if x in often_cities_list else 'OTHERS')
    df['City'].replace(city_schema, inplace=True)

    df.Salary_Account = df.Salary_Account.apply(lambda x: x if x in often_banks_list else 'OTHERS')
    df['Salary_Account'].replace(salary_schema, inplace=True)
    
    df.Interest_Rate.fillna(means.Interest_Rate, inplace=True) # zmienna jest obsługiwana poniżej, ale uzupełnienie 0 nie działało
    df.Loan_Tenure_Applied.fillna(means.Loan_Tenure_Applied, inplace=True)
    df.Loan_Tenure_Submitted.fillna(means.Loan_Tenure_Submitted, inplace=True)

    df.Monthly_Income = df.Monthly_Income.apply(
        lambda x: means.Monthly_Income if x == np.nan else x if x < right_MonInc else right_MonInc)
    df.Loan_Amount_Applied = df.Loan_Amount_Applied.apply(
        lambda x: means.Loan_Amount_Applied if x == np.nan else x if x < right_LoanAmtApp else right_LoanAmtApp)
    df.Existing_EMI = df.Existing_EMI.apply(
        lambda x: means.Existing_EMI if x == np.nan else x if x < right_ExEMI else right_ExEMI)
    df.Loan_Amount_Submitted = df.Loan_Amount_Submitted.apply(
        lambda x: means.Loan_Amount_Submitted if x == np.nan else x if x < right_LoanAmtSub else right_LoanAmtSub)
    df.Interest_Rate = df.Interest_Rate.apply(
        lambda x: right_IntRate if x > right_IntRate else left_IntRate if x < left_IntRate else x)
    df.Processing_Fee = df.Processing_Fee.apply(
        lambda x: means.Processing_Fee if x == np.nan else x if x < right_ProcFee else right_ProcFee)
    df.EMI_Loan_Submitted = df.EMI_Loan_Submitted.apply(
        lambda x: means.EMI_Loan_Submitted if x == np.nan else x if x < right_EMISub else right_EMISub)
    
    df.drop(['Employer_Name', 'Var5', 'Var1', 'Var2', 'Var4', 'Source'], axis=1, inplace=True)

    return df

In [15]:
# dokonuję finalnej transformacji danych
transform1(X_train)
# print_df_description(X_train)

Unnamed: 0,Gender,City,Monthly_Income,DOB,Lead_Creation_Date,Loan_Amount_Applied,Loan_Tenure_Applied,Existing_EMI,Salary_Account,Mobile_Verified,Loan_Amount_Submitted,Loan_Tenure_Submitted,Interest_Rate,Processing_Fee,EMI_Loan_Submitted,Filled_Form,Device_Type
83041,1,4,75250.00,36,7,200000.00,0.00,0.00,25,1,200000.00,4.00,19.20,12625.00,22686.92,0,1
34474,0,7,10500.00,25,6,100000.00,2.00,3500.00,19,0,950000.00,3.89,19.20,12625.00,22686.92,0,1
48453,1,19,48000.00,29,6,0.00,0.00,0.00,1,1,730000.00,4.00,14.85,12625.00,20260.98,1,0
27735,1,7,23243.00,30,6,0.00,0.00,0.00,11,1,360000.00,5.00,16.75,7200.00,8898.61,1,0
82812,1,19,75250.00,47,7,750000.00,5.00,8750.00,11,1,950000.00,5.00,19.20,12625.00,22686.92,0,1
41151,0,13,38500.00,40,6,300000.00,5.00,8750.00,1,0,950000.00,3.89,19.20,12625.00,22686.92,0,1
23024,0,20,14500.00,40,5,100000.00,3.00,0.00,19,0,950000.00,3.89,19.20,12625.00,22686.92,0,1
83206,0,12,19000.00,27,7,0.00,0.00,0.00,11,1,320000.00,4.00,19.20,12625.00,22686.92,0,0
45644,0,19,35000.00,37,6,300000.00,1.00,8750.00,9,0,950000.00,3.89,19.20,12625.00,22686.92,0,1
62154,0,19,12000.00,28,7,70000.00,1.00,5000.00,11,0,950000.00,3.89,19.20,12625.00,22686.92,0,1


In [16]:
# analogicznie transformuję dane testowe
transform(X_test)
transform1(X_test)
# print_df_description(X_test)

Unnamed: 0,Gender,City,Monthly_Income,DOB,Lead_Creation_Date,Loan_Amount_Applied,Loan_Tenure_Applied,Existing_EMI,Salary_Account,Mobile_Verified,Loan_Amount_Submitted,Loan_Tenure_Submitted,Interest_Rate,Processing_Fee,EMI_Loan_Submitted,Filled_Form,Device_Type
66075,1,1,23000.00,27,7,200000.00,3.00,4000.00,11,1,200000.00,3.00,19.20,12625.00,22686.92,0,1
23787,0,1,75250.00,36,5,750000.00,5.00,8750.00,7,0,950000.00,3.89,19.20,12625.00,22686.92,0,1
82963,0,12,18000.00,30,7,300000.00,5.00,0.00,2,0,950000.00,3.89,19.20,12625.00,22686.92,0,1
65127,1,8,75250.00,35,7,200000.00,3.00,0.00,18,1,200000.00,3.00,19.20,12625.00,22686.92,0,1
25243,0,11,30000.00,24,5,0.00,0.00,0.00,19,0,950000.00,3.89,19.20,12625.00,22686.92,0,0
20845,0,19,19800.00,32,5,50000.00,2.00,0.00,19,0,950000.00,3.89,19.20,12625.00,22686.92,0,1
61037,0,16,10000.00,34,7,0.00,0.00,0.00,19,0,950000.00,3.89,19.20,12625.00,22686.92,0,1
84264,1,30,37000.00,30,7,200000.00,5.00,0.00,0,1,200000.00,5.00,19.20,12625.00,22686.92,0,1
19047,0,7,17000.00,44,5,150000.00,3.00,5000.00,20,0,950000.00,3.89,19.20,12625.00,22686.92,0,1
2651,0,1,28000.00,29,5,300000.00,2.00,0.00,8,0,950000.00,3.89,19.20,12625.00,22686.92,0,1


In [19]:
# tworzę klasę do selekcji zmiennych wg nazw
class ColumnSelectorByName(TransformerMixin, BaseEstimator):
    
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.columns]

# wypisuję listy zmiennych, które oryginalnie były kategoryczne i użyję na nich 
# OneHotEncoder'a, a także zmienne numeryczne, które zestandaryzuję
originally_categorical_features = ['Gender', 'City', 'DOB', 'Lead_Creation_Date', 
                                   'Salary_Account', 'Mobile_Verified', 'Filled_Form', 'Device_Type']
numerical_features = ['Monthly_Income', 'Loan_Amount_Applied', 'Loan_Tenure_Applied', 
                      'Existing_EMI', 'Interest_Rate', 'Loan_Amount_Submitted', 'Loan_Tenure_Submitted', 
                      'Processing_Fee', 'EMI_Loan_Submitted']
    
# tworzę pipeline'y, które użyję do transformatora
categorical_pipeline = Pipeline([
    ('selector', ColumnSelectorByName(originally_categorical_features)),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

numeric_pipeline = Pipeline([
    ('selector', ColumnSelectorByName(numerical_features)),
    ('scaler', StandardScaler()),
])

# transformator - preprocessor, który agreguje powyższe pipeline'y
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, originally_categorical_features),
    ]
)

In [20]:
# tworzę np.array mapujący wagi dla y_train - niektóre modele (np. XGBoost) nie przujmuje wag jako słowników
df_weights = y_train.copy()
weight_0 = 0.5
weight_1 = 10
df_weights = df_weights.apply(lambda x: weight_0 if x == 0 else weight_1)
class_weights0510 = df_weights.values

In [21]:
# analogicznię robię listę dla innych wag, 0.5 / 5:
df_weights = y_train.copy()
weight_0 = 0.5
weight_1 = 5
df_weights = df_weights.apply(lambda x: weight_0 if x == 0 else weight_1)
class_weights055 = df_weights.values

# Zagregowany GridSearch

In [29]:
names_agg = ['LogisticRegression', 'XGBoost', 'RandomForestClassifier',]

models_agg = [[('scaler', preprocessor), ('model', LogisticRegression())], 
              [('model', XGBClassifier())], 
              [('model', RandomForestClassifier())]]

param_grids = [
    {'model__C': [0.1, 0.01, 0.001], 
     'model__class_weight': [{0: 0.5, 1: 5}, {0: 0.5, 1:10}],
     'model__penalty': ['l1', 'l2'],
     'model__tol': [0.1, 0.01, 0.001],
    },
    {'model__learning_rate': [0.1, 0.01], 
     'model__max_depth': [50, 75, 100],
     'model__n_estimators': [50, 75],
    },
    {'model__class_weight': [{0: 0.5, 1: 5}, {0: 0.5, 1: 10}],
     'model__max_depth': [20, 50, 75, 100],
     'model__min_samples_leaf': [400, 500, 750],
     'model__n_estimators': [20, 50, 75, 100],
    },
]           

best_models_agg = []
best_params_agg = []

for name, pipe, params in zip(names_agg, models_agg, param_grids):
    print("---!!!---", name)
    pipeline = Pipeline(pipe)
    if name == 'XGBoost':
        gs = GridSearchCV(estimator=pipeline, param_grid=params, refit=True, return_train_score=True, 
                          fit_params={'model__sample_weight': class_weights0510}, scoring='f1')
    else:
        gs = GridSearchCV(estimator=pipeline, param_grid=params, refit=True, return_train_score=True, 
                          scoring='f1')
    gs.fit(X_train, y_train)
    
    for mean, std, parametry, fittime in zip(
                            gs.cv_results_["mean_test_score"],
                            gs.cv_results_["std_test_score"],
                            gs.cv_results_["params"],
                            gs.cv_results_["mean_fit_time"]):
        print(f'mean = {(np.round(mean, 5))} || std = {(np.round(std, 5))} || time [s] = {np.round(fittime, 5)}')
        print(f'Best params: {parametry}')
    print()
    
    best_models_agg.append(gs.best_estimator_) # zapisujemy model z najlepszymi parametrami
    best_params_agg.append(gs.best_params_) # oraz najlepsze wyniki

---!!!--- LogisticRegression
mean = 0.09092 || std = 0.00872 || time [s] = 0.36314
Best params: {'model__C': 0.1, 'model__class_weight': {0: 0.5, 1: 5}, 'model__penalty': 'l1', 'model__tol': 0.1}
mean = 0.08957 || std = 0.00758 || time [s] = 1.34918
Best params: {'model__C': 0.1, 'model__class_weight': {0: 0.5, 1: 5}, 'model__penalty': 'l1', 'model__tol': 0.01}
mean = 0.0898 || std = 0.00763 || time [s] = 2.21783
Best params: {'model__C': 0.1, 'model__class_weight': {0: 0.5, 1: 5}, 'model__penalty': 'l1', 'model__tol': 0.001}
mean = 0.09237 || std = 0.0088 || time [s] = 0.42486
Best params: {'model__C': 0.1, 'model__class_weight': {0: 0.5, 1: 5}, 'model__penalty': 'l2', 'model__tol': 0.1}
mean = 0.09145 || std = 0.00748 || time [s] = 0.45765
Best params: {'model__C': 0.1, 'model__class_weight': {0: 0.5, 1: 5}, 'model__penalty': 'l2', 'model__tol': 0.01}
mean = 0.09145 || std = 0.00748 || time [s] = 0.57247
Best params: {'model__C': 0.1, 'model__class_weight': {0: 0.5, 1: 5}, 'model__pe

In [30]:
acc_agg = []
dop_agg = []
f1_agg = []
for name, best_model, best_param in zip(names_agg, best_models_agg, best_params_agg):
    acc_agg.append(accuracy_score(best_model.predict(X_test), y_test)*100)
    dop_agg.append(accuracy_score(best_model.predict(X_train), y_train)*100)
    f1_agg.append(f1_score(y_true=y_test, y_pred=best_model.predict(X_test))*100)

wyniki1 = pd.DataFrame({
    'Model': names_agg, 'Jakość predykcji': acc_agg, 'Dopasowanie': dop_agg, 
    'F1': f1_agg, 'Najlepsze parametry': best_params_agg
})
wyniki1

Unnamed: 0,Model,Jakość predykcji,Dopasowanie,F1,Najlepsze parametry
0,LogisticRegression,92.05,92.03,10.6,"{'model__C': 0.1, 'model__class_weight': {0: 0.5, 1: 10}, 'model__penalty': 'l2', 'model__tol': 0.1}"
1,XGBoost,95.95,98.59,7.84,"{'model__learning_rate': 0.01, 'model__max_depth': 50, 'model__n_estimators': 75}"
2,RandomForestClassifier,95.4,95.35,13.23,"{'model__class_weight': {0: 0.5, 1: 10}, 'model__max_depth': 100, 'model__min_samples_leaf': 400, 'model__n_estimators': 50}"


**Podobnie jak w jupyterze 'full' najlepszy wynik dał las losowy, jednak wynik wyszedł gorszy - przy czym tutaj zastosowano zarówno wypełnienie braków w danych średnimi (tam: zerami), ponadto usunięto zmienne var, które nie były dobrze wyjaśnione w opisie danych. Można zatem uznać, że rozwiązanie z poprzedniego jupytera jest lepsze.**