In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Load processed dataset with engineered binary features
df_train = pd.read_csv('../data/prepared/train_fully_prepared.csv')
# Train/test split (или кросс-валидация)
X = df_train.drop(columns=['id', 'Personality'])
y = LabelEncoder().fit_transform(df_train['Personality'])

# Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X, y)

# Predict probabilities (on the same train - to select the threshold)
probs = model.predict_proba(X)[:, 1]  # probability of class 1 (Extrovert)


In [2]:
thresholds = np.arange(0.1, 0.9, 0.01)
best_acc = 0
best_threshold = 0.5

for t in thresholds:
    preds = (probs > t).astype(int)
    acc = accuracy_score(y, preds)
    if acc > best_acc:
        best_acc = acc
        best_threshold = t

print(f"✅ Best threshold: {best_threshold:.2f} → Accuracy: {best_acc:.4f}")


✅ Best threshold: 0.53 → Accuracy: 0.9690


In [4]:
import joblib
from sklearn.preprocessing import LabelEncoder

# Load processed test set
df_test = pd.read_csv('../data/prepared/test_fully_prepared.csv')
X_test = df_test.drop(columns=["id"])

# Load model
model = joblib.load("../model_logreg/model.pkl")

# Predict probabilities
probs = model.predict_proba(X_test)[:, 1]  # probability Extrovert

# Apply threshold
threshold = 0.53
preds = (probs > threshold).astype(int)

# Inverse transform back to labels
label_encoder = LabelEncoder()
label_encoder.fit(["Introvert", "Extrovert"])  
pred_labels = label_encoder.inverse_transform(preds)

# Save submission
submission = pd.DataFrame({
    "id": df_test["id"],
    "Personality": pred_labels
})

submission.to_csv("submission.csv", index=False)
print("✅ Submission with threshold 0.53 saved.")


✅ Submission with threshold 0.53 saved.
