In [189]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
import random
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import *
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from math import floor
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, classification_report
from imblearn.over_sampling import SMOTE

In [190]:
train = pd.read_csv("C:/Users/subin/OneDrive - g.skku.edu/문서/P-SAT/2023 겨울방학세미나/train.csv/train.csv")
test = pd.read_csv("C:/Users/subin/OneDrive - g.skku.edu/문서/P-SAT/2023 겨울방학세미나/test.csv/test.csv")
submission = pd.read_csv("C:/Users/subin/OneDrive - g.skku.edu/문서/P-SAT/2023 겨울방학세미나/sample_submission.csv")

In [191]:
X = train.drop(columns=['ID_code', 'target'], axis=1)
y = train['target']

In [192]:
test_df = test.drop(columns=['ID_code'])

In [193]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)

**SMOTE**

In [194]:
smote = SMOTE(sampling_strategy='minority', random_state=0)
X_train_over, y_train_over = smote.fit_resample(X_train,y_train)
X_test_over, y_test_over = smote.fit_resample(X_test,y_test)

In [195]:
print('SMOTE 적용 전 학습용 피처/레이블 데이터 세트: ', X_train.shape, y_train.shape)
print('SMOTE 적용 후 학습용 피처/레이블 데이터 세트: ', X_train_over.shape, y_train_over.shape)
print('SMOTE 적용 전 값의 분포 :\n',pd.Series(y_train).value_counts())
print('SMOTE 적용 후 값의 분포 :\n',pd.Series(y_train_over).value_counts())

SMOTE 적용 전 학습용 피처/레이블 데이터 세트:  (112000, 200) (112000,)
SMOTE 적용 후 학습용 피처/레이블 데이터 세트:  (201616, 200) (201616,)
SMOTE 적용 전 값의 분포 :
 0    100808
1     11192
Name: target, dtype: int64
SMOTE 적용 후 값의 분포 :
 0    100808
1    100808
Name: target, dtype: int64


In [196]:
X_train = X_train_over.copy()
X_test = X_test_over.copy()
y_train = y_train_over.copy()
y_test = y_test_over.copy()

**Feature Scaling**

In [197]:
cols = X_train.columns

In [198]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_over)
X_test = scaler.transform(X_test)

In [199]:
X_train = pd.DataFrame(X_train, columns=[cols])

In [200]:
X_test = pd.DataFrame(X_test, columns=[cols])

Logistic Regression (Halving Grid Search CV)

In [34]:
solvers = ['saga']
penalty = ['l1', 'l2', 'none']
c_values = [100,10,1.0,0.1,0.01]

In [35]:
model = LogisticRegression()
grid = dict(solver=solvers, penalty=penalty, C=c_values)
cv = KFold(n_splits = 10)
grid_search = HalvingGridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1', verbose=10)
grid_result = grid_search.fit(X_train, y_train)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 25586
max_resources_: 230276
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 15
n_resources: 25586
Fitting 10 folds for each of 15 candidates, totalling 150 fits
----------
iter: 1
n_candidates: 5
n_resources: 76758
Fitting 10 folds for each of 5 candidates, totalling 50 fits
----------
iter: 2
n_candidates: 2
n_resources: 230274
Fitting 10 folds for each of 2 candidates, totalling 20 fits


In [36]:
print(grid_result.best_params_)

{'C': 1.0, 'penalty': 'none', 'solver': 'saga'}


In [118]:
log_model = LogisticRegression(C=1.0, penalty='none', solver='saga')
log_model.fit(X_train, y_train)

LogisticRegression(penalty='none', solver='saga')

In [119]:
y_pred = log_model.predict(X_test)

In [120]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.79      0.87     43262
           1       0.28      0.77      0.41      4738

    accuracy                           0.78     48000
   macro avg       0.63      0.78      0.64     48000
weighted avg       0.90      0.78      0.82     48000



In [121]:
print(f1_score(y_test, y_pred))

0.41402394054575364


Logistic Regression

In [201]:
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

LogisticRegression()

In [202]:
y_pred = log_model.predict(X_test)

In [203]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.79      0.79     43114
           1       0.79      0.80      0.80     43114

    accuracy                           0.80     86228
   macro avg       0.80      0.80      0.80     86228
weighted avg       0.80      0.80      0.80     86228



In [205]:
print(f1_score(y_test, y_pred))

0.7964366900912752


Logistic Regression with Lasso penalty

In [122]:
log_clf = LogisticRegression(penalty= 'l1', solver= 'saga', random_state=42)

In [123]:
log_clf.fit(X_train, y_train)

LogisticRegression(penalty='l1', random_state=42, solver='saga')

In [124]:
y_pred = log_clf.predict(X_test)

In [125]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.79      0.87     43262
           1       0.28      0.77      0.41      4738

    accuracy                           0.78     48000
   macro avg       0.63      0.78      0.64     48000
weighted avg       0.90      0.78      0.82     48000



In [126]:
print(f1_score(y_test, y_pred))

0.41397696976572695


In [60]:
predict_log = log.predict(test_df)

In [61]:
submission = pd.DataFrame({"ID_code":submission['ID_code'],
                         "target":predict_log})

In [66]:
submission.to_csv('submission_log.csv',index=False)