In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import OrdinalEncoder


import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

## Посмотрим на данные

### Подгрузим данные

In [None]:
# Single place to change if the competition data is mounted elsewhere.
DATA_DIR = '/kaggle/input/tabular-playground-series-feb-2022'

# The first CSV column is used as the row index of each frame.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), index_col=0)
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'), index_col=0)

In [None]:
# Labels come from the `target` column; features are every column except the last one.
target = train_df['target']
train_data = train_df.iloc[:, :-1]

### Посмотрим на распределение признаков

In [None]:
# Histogram + KDE of the fourth column, restricted to values below 0.0002.
column = train_df.iloc[:, 3]
sns.displot(data=column[column < 0.0002], kde=True)

### Разобьём данные

#### Обычный сплит

In [None]:
from sklearn.model_selection import train_test_split
# Shuffled hold-out split of the raw features: 67% train / 33% validation, fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    train_data,
    train_df['target'],
    test_size=0.33,
    shuffle=True,
    random_state=4,
)

#### Кроссвалидация

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score

# Stratified 5-fold splitter (a single repeat) with a fixed seed.
# NOTE(review): `cv` is not referenced by any later cell visible here — confirm it is still needed.
cv = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=1,
    random_state=4,
)

## Обучим простые модели

### Log reg

In [None]:
%%time
from sklearn.linear_model import LogisticRegression
# Logistic regression (liblinear solver) on the raw feature split.
log_reg = LogisticRegression(
    solver='liblinear',
    C=1,
    tol=1e-3,
    max_iter=100,
    random_state=0,
)
log_reg.fit(X_train, y_train)

# Hard labels and class probabilities for both splits.
train_pred, val_pred = log_reg.predict(X_train), log_reg.predict(X_val)
train_proba, val_proba = log_reg.predict_proba(X_train), log_reg.predict_proba(X_val)

# Accuracy on each split plus the number of solver iterations actually used.
train_acc, val_acc = accuracy_score(y_train, train_pred), accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={log_reg.n_iter_}')

### SGD LogReg

In [None]:
%%time
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD on the raw feature split.
# NOTE: loss="hinge" makes this a linear SVM, not logistic regression, despite the section title.
# random_state is pinned (same seed as the other models) so repeated runs give the same scores;
# without it SGD's per-epoch shuffling makes the reported accuracies vary from run to run.
clf = SGDClassifier(loss="hinge", penalty="elasticnet", max_iter=100, tol=1e-3, random_state=4)
clf.fit(X_train, y_train)
train_pred = clf.predict(X_train)
val_pred = clf.predict(X_val)

# Accuracy on each split plus the number of epochs actually run.
train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={clf.n_iter_}')

### Random Forest

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters and a fixed seed, on the raw feature split.
forest_clf = RandomForestClassifier(random_state=4)
forest_clf.fit(X_train, y_train)

# Predict both splits, then score them.
train_pred, val_pred = forest_clf.predict(X_train), forest_clf.predict(X_val)
train_acc, val_acc = accuracy_score(y_train, train_pred), accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

### LightGBM

In [None]:
%%time
from lightgbm import LGBMClassifier
# LightGBM classifier with default hyper-parameters, on the raw feature split.
model = LGBMClassifier()
model.fit(X_train, y_train)

# Predict both splits, then score them.
train_pred, val_pred = model.predict(X_train), model.predict(X_val)
train_acc, val_acc = accuracy_score(y_train, train_pred), accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

# Попробуем шкалирование

## Отшкалируем данные

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Standardise every feature column (all columns except the last one).
# NOTE(review): the scaler is fitted on all rows before the train/validation split that
# follows, so validation rows contribute to the scaling statistics.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(
    train_df.iloc[:, :-1],
)

## Обучим простые модели

#### Обычный сплит

In [None]:
from sklearn.model_selection import train_test_split
# Same 67/33 shuffled split (same seed) applied to the standardised features.
X_train_scaled, X_val_scaled, y_train_scaled, y_val_scaled = train_test_split(
    train_scaled,
    train_df['target'],
    test_size=0.33,
    shuffle=True,
    random_state=4,
)

### Log reg

In [None]:
%%time
from sklearn.linear_model import LogisticRegression
# Logistic regression (liblinear) on the standardised split; max_iter caps the solver at 7 iterations.
log_reg = LogisticRegression(
    solver='liblinear',
    C=1,
    tol=1e-3,
    max_iter=7,
    random_state=0,
)
log_reg.fit(X_train_scaled, y_train_scaled)

# Hard labels and class probabilities for both splits.
train_pred, val_pred = log_reg.predict(X_train_scaled), log_reg.predict(X_val_scaled)
train_proba, val_proba = log_reg.predict_proba(X_train_scaled), log_reg.predict_proba(X_val_scaled)

# Accuracy on each split plus the number of solver iterations actually used.
train_acc, val_acc = accuracy_score(y_train_scaled, train_pred), accuracy_score(y_val_scaled, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={log_reg.n_iter_}')

### SGD LogReg

In [None]:
%%time
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD on the standardised split.
# NOTE: loss="hinge" makes this a linear SVM, not logistic regression, despite the section title.
# random_state is pinned (same seed as the other models) so repeated runs give the same scores;
# without it SGD's per-epoch shuffling makes the reported accuracies vary from run to run.
clf = SGDClassifier(loss="hinge", penalty="elasticnet", max_iter=300, tol=1e-3, random_state=4)
clf.fit(X_train_scaled, y_train_scaled)
train_pred = clf.predict(X_train_scaled)
val_pred = clf.predict(X_val_scaled)

# Accuracy on each split plus the number of epochs actually run.
train_acc = accuracy_score(y_train_scaled, train_pred)
val_acc = accuracy_score(y_val_scaled, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={clf.n_iter_}')

### Random Forest

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters and a fixed seed, on the standardised split.
forest_clf = RandomForestClassifier(random_state=4)
forest_clf.fit(X_train_scaled, y_train_scaled)

# Predict both splits, then score them.
train_pred, val_pred = forest_clf.predict(X_train_scaled), forest_clf.predict(X_val_scaled)
train_acc, val_acc = accuracy_score(y_train_scaled, train_pred), accuracy_score(y_val_scaled, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

### LightGBM

In [None]:
%%time
from lightgbm import LGBMClassifier
# LightGBM classifier with default hyper-parameters, on the standardised split.
model = LGBMClassifier()
model.fit(X_train_scaled, y_train_scaled)

# Predict both splits, then score them.
train_pred, val_pred = model.predict(X_train_scaled), model.predict(X_val_scaled)
train_acc, val_acc = accuracy_score(y_train_scaled, train_pred), accuracy_score(y_val_scaled, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

### Сделаем PCA

In [None]:
# Full PCA (all components kept) of the standardised features.
pca = PCA()
components = pca.fit_transform(train_scaled)

# Area plot of the singular values against the 1-based component number.
px.area(
    x=range(1, len(pca.singular_values_) + 1),
    y=pca.singular_values_,
    labels={"x": "# of Component", "y": "Singular value"},
)

In [None]:
# Running total of the explained-variance ratio over the components.
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

# Area plot of that cumulative share against the 1-based component count.
px.area(
    x=range(1, len(exp_var_cumul) + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"},
)

In [None]:
# First two principal components, coloured by the target column.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=train_df['target'],
)
fig.show()

In [None]:
# Same projection with low marker opacity, so overlapping points remain distinguishable.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=train_df['target'],
    opacity=0.1,
)
fig.show()

In [None]:
# Axis labels: component number plus its explained-variance share in percent.
variance_pct = pca.explained_variance_ratio_ * 100
labels = {str(i): f"PC {i+1} ({var:.1f}%)" for i, var in enumerate(variance_pct)}

# Pairwise scatter plots of the first four components, coloured by the target column.
fig = px.scatter_matrix(
    components,
    dimensions=range(4),
    labels=labels,
    color=train_df['target'],
    opacity=0.1,
)
fig.update_traces(diagonal_visible=False)  # hide the diagonal panels
fig.show()

## Кластеризация

### Kmeans

#### Шкалированные данные

In [None]:
from sklearn.cluster import KMeans
# K-means with 10 clusters and a fixed seed on the standardised features.
kmeans_scaled = KMeans(n_clusters=10, random_state=4)
kmeans_scaled.fit(train_scaled)
kmeans_scaled.labels_

In [None]:
# First two principal components, coloured by the k-means cluster assignment.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=kmeans_scaled.labels_,
    opacity=0.1,
)
fig.show()

Очень грубая кластеризация

#### PCA

In [None]:
# Display the shape of the PCA-transformed matrix.
components.shape

In [None]:
# K-means with 10 clusters and a fixed seed on the first four principal components only.
kmeans_pca = KMeans(n_clusters=10, random_state=4)
kmeans_pca.fit(components[:, :4])
kmeans_pca.labels_

In [None]:
# First two principal components, coloured by the clusters found in PCA space.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=kmeans_pca.labels_,
    opacity=0.1,
)
fig.show()

Не особо лучше

## Обучим модельки

#### Обычный сплит

In [None]:
from sklearn.model_selection import train_test_split
# Same 67/33 shuffled split (same seed) applied to the full set of PCA components.
X_train_pca, X_val_pca, y_train_pca, y_val_pca = train_test_split(
    components,
    train_df['target'],
    test_size=0.33,
    shuffle=True,
    random_state=4,
)

### Log reg

In [None]:
%%time
from sklearn.linear_model import LogisticRegression
# Logistic regression (liblinear) on the PCA split; max_iter caps the solver at 7 iterations.
log_reg = LogisticRegression(
    solver='liblinear',
    C=1,
    tol=1e-3,
    max_iter=7,
    random_state=0,
)
log_reg.fit(X_train_pca, y_train_pca)

# Hard labels and class probabilities for both splits.
train_pred, val_pred = log_reg.predict(X_train_pca), log_reg.predict(X_val_pca)
train_proba, val_proba = log_reg.predict_proba(X_train_pca), log_reg.predict_proba(X_val_pca)

# Accuracy on each split plus the number of solver iterations actually used.
train_acc, val_acc = accuracy_score(y_train_pca, train_pred), accuracy_score(y_val_pca, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={log_reg.n_iter_}')

### SGD LogReg

In [None]:
%%time
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD on the PCA split.
# NOTE: loss="hinge" makes this a linear SVM, not logistic regression, despite the section title.
# random_state is pinned (same seed as the other models) so repeated runs give the same scores;
# without it SGD's per-epoch shuffling makes the reported accuracies vary from run to run.
clf = SGDClassifier(loss="hinge", penalty="elasticnet", max_iter=100, tol=1e-3, random_state=4)
clf.fit(X_train_pca, y_train_pca)
train_pred = clf.predict(X_train_pca)
val_pred = clf.predict(X_val_pca)

# Accuracy on each split plus the number of epochs actually run.
train_acc = accuracy_score(y_train_pca, train_pred)
val_acc = accuracy_score(y_val_pca, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={clf.n_iter_}')

### Random Forest

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters and a fixed seed, on the PCA split.
forest_clf = RandomForestClassifier(random_state=4)
forest_clf.fit(X_train_pca, y_train_pca)

# Predict both splits, then score them.
train_pred, val_pred = forest_clf.predict(X_train_pca), forest_clf.predict(X_val_pca)
train_acc, val_acc = accuracy_score(y_train_pca, train_pred), accuracy_score(y_val_pca, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

### LightGBM

In [None]:
%%time
from lightgbm import LGBMClassifier
# LightGBM classifier with default hyper-parameters, on the PCA split.
model = LGBMClassifier()
model.fit(X_train_pca, y_train_pca)

# Predict both splits, then score them.
train_pred, val_pred = model.predict(X_train_pca), model.predict(X_val_pca)
train_acc, val_acc = accuracy_score(y_train_pca, train_pred), accuracy_score(y_val_pca, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

## Попробуем уменьшить число признаков до 150

#### Обычный сплит

In [None]:
from sklearn.model_selection import train_test_split
# Re-split using only the first 150 principal components; this overwrites the earlier *_pca split.
X_train_pca, X_val_pca, y_train_pca, y_val_pca = train_test_split(
    components[:, :150],
    train_df['target'],
    test_size=0.33,
    shuffle=True,
    random_state=4,
)

### Log reg

In [None]:
%%time
from sklearn.linear_model import LogisticRegression
# Logistic regression (liblinear) on the 150-component PCA split; solver capped at 7 iterations.
log_reg = LogisticRegression(
    solver='liblinear',
    C=1,
    tol=1e-3,
    max_iter=7,
    random_state=0,
)
log_reg.fit(X_train_pca, y_train_pca)

# Hard labels and class probabilities for both splits.
train_pred, val_pred = log_reg.predict(X_train_pca), log_reg.predict(X_val_pca)
train_proba, val_proba = log_reg.predict_proba(X_train_pca), log_reg.predict_proba(X_val_pca)

# Accuracy on each split plus the number of solver iterations actually used.
train_acc, val_acc = accuracy_score(y_train_pca, train_pred), accuracy_score(y_val_pca, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={log_reg.n_iter_}')

### SGD LogReg

In [None]:
%%time
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD on the 150-component PCA split.
# NOTE: loss="hinge" makes this a linear SVM, not logistic regression, despite the section title.
# random_state is pinned (same seed as the other models) so repeated runs give the same scores;
# without it SGD's per-epoch shuffling makes the reported accuracies vary from run to run.
clf = SGDClassifier(loss="hinge", penalty="elasticnet", max_iter=100, tol=1e-3, random_state=4)
clf.fit(X_train_pca, y_train_pca)
train_pred = clf.predict(X_train_pca)
val_pred = clf.predict(X_val_pca)

# Accuracy on each split plus the number of epochs actually run.
train_acc = accuracy_score(y_train_pca, train_pred)
val_acc = accuracy_score(y_val_pca, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={clf.n_iter_}')

### Random Forest

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters and a fixed seed, on the 150-component PCA split.
forest_clf = RandomForestClassifier(random_state=4)
forest_clf.fit(X_train_pca, y_train_pca)

# Predict both splits, then score them.
train_pred, val_pred = forest_clf.predict(X_train_pca), forest_clf.predict(X_val_pca)
train_acc, val_acc = accuracy_score(y_train_pca, train_pred), accuracy_score(y_val_pca, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

### LightGBM

In [None]:
%%time
from lightgbm import LGBMClassifier
# LightGBM classifier with default hyper-parameters, on the 150-component PCA split.
model = LGBMClassifier()
model.fit(X_train_pca, y_train_pca)

# Predict both splits, then score them.
train_pred, val_pred = model.predict(X_train_pca), model.predict(X_val_pca)
train_acc, val_acc = accuracy_score(y_train_pca, train_pred), accuracy_score(y_val_pca, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

# Логарифмируем данные

In [None]:
# Element-wise log(1 + x) of the feature frame.
train_log = np.log1p(train_data)

In [None]:
# Histogram + KDE of the first log-transformed feature column.
sns.displot(data=train_log.iloc[:, 0], kde=True)

In [None]:
# Histogram + KDE of the log-transformed values in the first row, across all feature columns.
sns.displot(data=train_log.iloc[0, :], kde=True)

#### Обычный сплит

In [None]:
from sklearn.model_selection import train_test_split
# Same 67/33 shuffled split (same seed) applied to the log-transformed features.
X_train_log, X_val_log, y_train_log, y_val_log = train_test_split(
    train_log,
    train_df['target'],
    test_size=0.33,
    shuffle=True,
    random_state=4,
)

In [None]:
# Point the generic split names at the log-transformed split so the model cells below use it.
X_train = X_train_log
X_val = X_val_log
y_train = y_train_log
y_val = y_val_log

### Log reg

In [None]:
%%time
from sklearn.linear_model import LogisticRegression
# Logistic regression (liblinear) on the current X_train / X_val split.
log_reg = LogisticRegression(random_state=0, C=1, max_iter=100, solver='liblinear', tol=1e-3)
# Fit through the X_train alias, the same name used for prediction and scoring below.
# The original fitted on X_train_log directly; that only works while the alias still
# points at the log split, and silently trains on the wrong data once it is re-pointed.
log_reg.fit(X_train, y_train)
train_pred = log_reg.predict(X_train)
val_pred = log_reg.predict(X_val)

# Class probabilities for both splits.
train_proba = log_reg.predict_proba(X_train)
val_proba = log_reg.predict_proba(X_val)

# Accuracy on each split plus the number of solver iterations actually used.
train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={log_reg.n_iter_}')

### SGD LogReg

In [None]:
%%time
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD on the current X_train / X_val split.
# NOTE: loss="hinge" makes this a linear SVM, not logistic regression, despite the section title.
# random_state is pinned (same seed as the other models) so repeated runs give the same scores;
# without it SGD's per-epoch shuffling makes the reported accuracies vary from run to run.
clf = SGDClassifier(loss="hinge", penalty="elasticnet", max_iter=100, tol=1e-3, random_state=4)
clf.fit(X_train, y_train)
train_pred = clf.predict(X_train)
val_pred = clf.predict(X_val)

# Accuracy on each split plus the number of epochs actually run.
train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={clf.n_iter_}')

### Random Forest

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters and a fixed seed, on the current X_train / X_val split.
forest_clf = RandomForestClassifier(random_state=4)
forest_clf.fit(X_train, y_train)

# Predict both splits, then score them.
train_pred, val_pred = forest_clf.predict(X_train), forest_clf.predict(X_val)
train_acc, val_acc = accuracy_score(y_train, train_pred), accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

### LightGBM

In [None]:
%%time
from lightgbm import LGBMClassifier
# LightGBM classifier with default hyper-parameters, on the current X_train / X_val split.
model = LGBMClassifier()
model.fit(X_train, y_train)

# Predict both splits, then score them.
train_pred, val_pred = model.predict(X_train), model.predict(X_val)
train_acc, val_acc = accuracy_score(y_train, train_pred), accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

## Сделаем PCA логарифмированных данных

### Сделаем PCA

In [None]:
# Full PCA (all components kept) of the log-transformed features; replaces the earlier pca/components.
pca = PCA()
components = pca.fit_transform(train_log)

# Area plot of the singular values against the 1-based component number.
px.area(
    x=range(1, len(pca.singular_values_) + 1),
    y=pca.singular_values_,
    labels={"x": "# of Component", "y": "Singular value"},
)

In [None]:
# Running total of the explained-variance ratio over the components.
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

# Area plot of that cumulative share against the 1-based component count.
px.area(
    x=range(1, len(exp_var_cumul) + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"},
)

In [None]:
# First two principal components, coloured by the target column.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=train_df['target'],
)
fig.show()

In [None]:
# Same projection with low marker opacity, so overlapping points remain distinguishable.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=train_df['target'],
    opacity=0.1,
)
fig.show()

In [None]:
# Axis labels: component number plus its explained-variance share in percent.
variance_pct = pca.explained_variance_ratio_ * 100
labels = {str(i): f"PC {i+1} ({var:.1f}%)" for i, var in enumerate(variance_pct)}

# Pairwise scatter plots of the first four components, coloured by the target column.
fig = px.scatter_matrix(
    components,
    dimensions=range(4),
    labels=labels,
    color=train_df['target'],
    opacity=0.1,
)
fig.update_traces(diagonal_visible=False)  # hide the diagonal panels
fig.show()

## Кластеризация

### Kmeans

#### Логарифмированные данные

In [None]:
from sklearn.cluster import KMeans
# K-means with 10 clusters and a fixed seed on the log-transformed features.
# (The variable keeps the name `kmeans_scaled` although the input here is train_log.)
kmeans_scaled = KMeans(n_clusters=10, random_state=4)
kmeans_scaled.fit(train_log)
kmeans_scaled.labels_

In [None]:
# First two principal components, coloured by the k-means cluster assignment.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=kmeans_scaled.labels_,
    opacity=0.1,
)
fig.show()

#### PCA

In [None]:
# K-means with 10 clusters and a fixed seed on all principal components.
kmeans_pca = KMeans(n_clusters=10, random_state=4)
kmeans_pca.fit(components)
kmeans_pca.labels_

In [None]:
# First two principal components, coloured by the clusters found in PCA space.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=kmeans_pca.labels_,
    opacity=0.1,
)
fig.show()

Не особо лучше

## Обучим модельки

#### Обычный сплит

In [None]:
from sklearn.model_selection import train_test_split
# Same 67/33 shuffled split (same seed) applied to the PCA components of the log features.
X_train_log_pca, X_val_log_pca, y_train_log_pca, y_val_log_pca = train_test_split(
    components,
    train_df['target'],
    test_size=0.33,
    shuffle=True,
    random_state=4,
)

In [None]:
# Point the generic split names at the log+PCA split so the model cells below use it.
X_train = X_train_log_pca
X_val = X_val_log_pca
y_train = y_train_log_pca
y_val = y_val_log_pca

### Log reg

In [None]:
%%time
from sklearn.linear_model import LogisticRegression
# Logistic regression (liblinear) on the current X_train / X_val split; solver capped at 7 iterations.
log_reg = LogisticRegression(random_state=0, C=1, max_iter=7, solver='liblinear', tol=1e-3)
# Bug fix: fit on X_train, the same matrix that is predicted and scored below.
# The original fitted on X_train_log (log features from an earlier section) while
# evaluating on X_train, so the reported accuracy belonged to a model trained in a
# different feature space.
log_reg.fit(X_train, y_train)
train_pred = log_reg.predict(X_train)
val_pred = log_reg.predict(X_val)

# Class probabilities for both splits.
train_proba = log_reg.predict_proba(X_train)
val_proba = log_reg.predict_proba(X_val)

# Accuracy on each split plus the number of solver iterations actually used.
train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={log_reg.n_iter_}')

### SGD LogReg

In [None]:
%%time
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD on the current X_train / X_val split.
# NOTE: loss="hinge" makes this a linear SVM, not logistic regression, despite the section title.
# random_state is pinned (same seed as the other models) so repeated runs give the same scores;
# without it SGD's per-epoch shuffling makes the reported accuracies vary from run to run.
clf = SGDClassifier(loss="hinge", penalty="elasticnet", max_iter=100, tol=1e-3, random_state=4)
clf.fit(X_train, y_train)
train_pred = clf.predict(X_train)
val_pred = clf.predict(X_val)

# Accuracy on each split plus the number of epochs actually run.
train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={clf.n_iter_}')

### Random Forest

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters and a fixed seed, on the current X_train / X_val split.
forest_clf = RandomForestClassifier(random_state=4)
forest_clf.fit(X_train, y_train)

# Predict both splits, then score them.
train_pred, val_pred = forest_clf.predict(X_train), forest_clf.predict(X_val)
train_acc, val_acc = accuracy_score(y_train, train_pred), accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

### LightGBM

In [None]:
%%time
from lightgbm import LGBMClassifier
# LightGBM classifier with default hyper-parameters, on the current X_train / X_val split.
model = LGBMClassifier()
model.fit(X_train, y_train)

# Predict both splits, then score them.
train_pred, val_pred = model.predict(X_train), model.predict(X_val)
train_acc, val_acc = accuracy_score(y_train, train_pred), accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

## Отшкалируем логарифмированные данные

In [None]:
# Standardise the log-transformed features with a fresh scaler (replaces the earlier `scaler`).
# NOTE(review): the scaler is fitted on all rows before the train/validation split that
# follows, so validation rows contribute to the scaling statistics.
scaler = StandardScaler()
train_log_scaled = scaler.fit_transform(
    train_log,
)

#### Обычный сплит

In [None]:
from sklearn.model_selection import train_test_split
# Same 67/33 shuffled split (same seed) applied to the standardised log features.
# NOTE: the *_log_pca names are reused here although the input is train_log_scaled, not PCA output.
X_train_log_pca, X_val_log_pca, y_train_log_pca, y_val_log_pca = train_test_split(
    train_log_scaled,
    train_df['target'],
    test_size=0.33,
    shuffle=True,
    random_state=4,
)

In [None]:
# Point the generic split names at the most recent *_log_pca split so the model cells below use it.
X_train = X_train_log_pca
X_val = X_val_log_pca
y_train = y_train_log_pca
y_val = y_val_log_pca

### Log reg

In [None]:
%%time
from sklearn.linear_model import LogisticRegression
# Logistic regression (liblinear) on the current X_train / X_val split; solver capped at 7 iterations.
log_reg = LogisticRegression(random_state=0, C=1, max_iter=7, solver='liblinear', tol=1e-3)
# Bug fix: fit on X_train, the same matrix that is predicted and scored below.
# The original fitted on X_train_log (unscaled log features from an earlier section)
# while evaluating on X_train, so the reported accuracy belonged to a model trained
# on differently-scaled data.
log_reg.fit(X_train, y_train)
train_pred = log_reg.predict(X_train)
val_pred = log_reg.predict(X_val)

# Class probabilities for both splits.
train_proba = log_reg.predict_proba(X_train)
val_proba = log_reg.predict_proba(X_val)

# Accuracy on each split plus the number of solver iterations actually used.
train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={log_reg.n_iter_}')

### SGD LogReg

In [None]:
%%time
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD on the current X_train / X_val split.
# NOTE: loss="hinge" makes this a linear SVM, not logistic regression, despite the section title.
# random_state is pinned (same seed as the other models) so repeated runs give the same scores;
# without it SGD's per-epoch shuffling makes the reported accuracies vary from run to run.
clf = SGDClassifier(loss="hinge", penalty="elasticnet", max_iter=100, tol=1e-3, random_state=4)
clf.fit(X_train, y_train)
train_pred = clf.predict(X_train)
val_pred = clf.predict(X_val)

# Accuracy on each split plus the number of epochs actually run.
train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={clf.n_iter_}')

### Random Forest

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters and a fixed seed, on the current X_train / X_val split.
forest_clf = RandomForestClassifier(random_state=4)
forest_clf.fit(X_train, y_train)

# Predict both splits, then score them.
train_pred, val_pred = forest_clf.predict(X_train), forest_clf.predict(X_val)
train_acc, val_acc = accuracy_score(y_train, train_pred), accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

### LightGBM

In [None]:
%%time
from lightgbm import LGBMClassifier
# LightGBM classifier with default hyper-parameters, on the current X_train / X_val split.
model = LGBMClassifier()
model.fit(X_train, y_train)

# Predict both splits, then score them.
train_pred, val_pred = model.predict(X_train), model.predict(X_val)
train_acc, val_acc = accuracy_score(y_train, train_pred), accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

### Сделаем PCA

In [None]:
# Full PCA (all components kept) of the standardised log features; replaces the earlier pca/components.
pca = PCA()
components = pca.fit_transform(train_log_scaled)

# Area plot of the singular values against the 1-based component number.
px.area(
    x=range(1, len(pca.singular_values_) + 1),
    y=pca.singular_values_,
    labels={"x": "# of Component", "y": "Singular value"},
)

In [None]:
# Running total of the explained-variance ratio over the components.
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

# Area plot of that cumulative share against the 1-based component count.
px.area(
    x=range(1, len(exp_var_cumul) + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"},
)

In [None]:
# First two principal components, coloured by the target column.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=train_df['target'],
)
fig.show()

In [None]:
# Same projection with low marker opacity, so overlapping points remain distinguishable.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=train_df['target'],
    opacity=0.1,
)
fig.show()

In [None]:
# Axis labels: component number plus its explained-variance share in percent.
variance_pct = pca.explained_variance_ratio_ * 100
labels = {str(i): f"PC {i+1} ({var:.1f}%)" for i, var in enumerate(variance_pct)}

# Pairwise scatter plots of the first four components, coloured by the target column.
fig = px.scatter_matrix(
    components,
    dimensions=range(4),
    labels=labels,
    color=train_df['target'],
    opacity=0.1,
)
fig.update_traces(diagonal_visible=False)  # hide the diagonal panels
fig.show()

## Кластеризация

### Kmeans

#### Логарифмированные данные

In [None]:
from sklearn.cluster import KMeans
# K-means with 10 clusters and a fixed seed on the standardised log features, i.e. the
# same data the PCA projection used for plotting in this section was fitted on.
# The original call re-clustered the unscaled train_log, which only repeated the
# clustering already done in the previous (unscaled log) section.
# NOTE(review): confirm that clustering the scaled data was the intent here.
kmeans_scaled = KMeans(n_clusters=10, random_state=4).fit(train_log_scaled)
kmeans_scaled.labels_

In [None]:
# First two principal components, coloured by the k-means cluster assignment.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=kmeans_scaled.labels_,
    opacity=0.1,
)
fig.show()

#### PCA

In [None]:
# K-means with 10 clusters and a fixed seed on all principal components.
kmeans_pca = KMeans(n_clusters=10, random_state=4)
kmeans_pca.fit(components)
kmeans_pca.labels_

In [None]:
# First two principal components, coloured by the clusters found in PCA space.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=kmeans_pca.labels_,
    opacity=0.1,
)
fig.show()

Не особо лучше

## Обучим модельки

#### Обычный сплит

In [None]:
from sklearn.model_selection import train_test_split
# Same 67/33 shuffled split (same seed) applied to the PCA components of the standardised log features.
X_train_log_pca, X_val_log_pca, y_train_log_pca, y_val_log_pca = train_test_split(
    components,
    train_df['target'],
    test_size=0.33,
    shuffle=True,
    random_state=4,
)

In [None]:
# Point the generic split names at the most recent *_log_pca split so the model cells below use it.
X_train = X_train_log_pca
X_val = X_val_log_pca
y_train = y_train_log_pca
y_val = y_val_log_pca

### Log reg

In [None]:
%%time
from sklearn.linear_model import LogisticRegression
# Logistic regression (liblinear) on the current X_train / X_val split; solver capped at 7 iterations.
log_reg = LogisticRegression(random_state=0, C=1, max_iter=7, solver='liblinear', tol=1e-3)
# Bug fix: fit on X_train, the same matrix that is predicted and scored below.
# The original fitted on X_train_log (log features from an earlier section) while
# evaluating on X_train, so the reported accuracy belonged to a model trained in a
# different feature space.
log_reg.fit(X_train, y_train)
train_pred = log_reg.predict(X_train)
val_pred = log_reg.predict(X_val)

# Class probabilities for both splits.
train_proba = log_reg.predict_proba(X_train)
val_proba = log_reg.predict_proba(X_val)

# Accuracy on each split plus the number of solver iterations actually used.
train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={log_reg.n_iter_}')

### SGD LogReg

In [None]:
%%time
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD on the current X_train / X_val split.
# NOTE: loss="hinge" makes this a linear SVM, not logistic regression, despite the section title.
# random_state is pinned (same seed as the other models) so repeated runs give the same scores;
# without it SGD's per-epoch shuffling makes the reported accuracies vary from run to run.
clf = SGDClassifier(loss="hinge", penalty="elasticnet", max_iter=100, tol=1e-3, random_state=4)
clf.fit(X_train, y_train)
train_pred = clf.predict(X_train)
val_pred = clf.predict(X_val)

# Accuracy on each split plus the number of epochs actually run.
train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}, n_iter={clf.n_iter_}')

### Random Forest

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters and a fixed seed, on the current X_train / X_val split.
forest_clf = RandomForestClassifier(random_state=4)
forest_clf.fit(X_train, y_train)

# Predict both splits, then score them.
train_pred, val_pred = forest_clf.predict(X_train), forest_clf.predict(X_val)
train_acc, val_acc = accuracy_score(y_train, train_pred), accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')

### LightGBM

In [None]:
%%time
from lightgbm import LGBMClassifier
# LightGBM classifier with default hyper-parameters, on the current X_train / X_val split.
model = LGBMClassifier()
model.fit(X_train, y_train)

# Predict both splits, then score them.
train_pred, val_pred = model.predict(X_train), model.predict(X_val)
train_acc, val_acc = accuracy_score(y_train, train_pred), accuracy_score(y_val, val_pred)
print(f'train accuracy: {train_acc: .4f} \n val accuracy: {val_acc: .4f}')