In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Load the churn dataset from the Kaggle input directory.
DATA_PATH = '/kaggle/input/churnsq/Churn (1).csv'
df = pd.read_csv(DATA_PATH)

# Count missing values per column to decide whether imputation is needed.
missing_per_column = df.isnull().sum()
print(missing_per_column)

RowNumber          0
CustomerId         0
Surname            0
Score              0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
IsActiveMember     0
EstimatedSalary    1
Exited             0
dtype: int64


In [12]:
# Fill the missing 'EstimatedSalary' value with the column median.
# SimpleImputer expects 2-D input, hence the double-bracket column selection.
imputer = SimpleImputer(strategy='median')
salary_imputed = imputer.fit_transform(df[['EstimatedSalary']])
df['EstimatedSalary'] = salary_imputed

In [13]:
# Encode the categorical features as integer codes.
# Each column gets its own encoder so that both mappings stay available for
# inverse_transform; `label_encoder` still ends up fitted on 'Geography',
# exactly as before.
gender_encoder = LabelEncoder()
df['Gender'] = gender_encoder.fit_transform(df['Gender'])
label_encoder = LabelEncoder()
df['Geography'] = label_encoder.fit_transform(df['Geography'])

# Select the model features and the target variable.
feature_columns = [
    'Score', 'Geography', 'Gender', 'Age', 'Tenure',
    'Balance', 'NumOfProducts', 'IsActiveMember', 'EstimatedSalary',
]
X = df[feature_columns]
y = df['Exited']

# Split BEFORE scaling so that test-set statistics never reach the scaler.
# Same test_size / random_state as before, so the same rows land in each part.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features: fit on the training rows only, then apply the
# learned mean / standard deviation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for cells that may still reference the full scaled matrix; it is now
# scaled with the training-set statistics.
X_scaled = scaler.transform(X)


In [14]:
from sklearn.ensemble import RandomForestClassifier
# The metric helpers are imported here so this cell also runs on a fresh
# kernel; no other visible cell imports them.
from sklearn.metrics import accuracy_score, classification_report

# Train a random forest classifier (fixed seed for reproducibility).
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on the held-out set and report accuracy plus per-class metrics.
y_pred_rf = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Accuracy: 0.8625
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.74      0.46      0.57       393

    accuracy                           0.86      2000
   macro avg       0.81      0.71      0.74      2000
weighted avg       0.85      0.86      0.85      2000



In [15]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search for the random forest: exhaustive grid over the
# number of trees, the maximum tree depth and the minimum split size.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# 3-fold cross-validation scored by accuracy, parallelized with n_jobs=-1.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the best parameter combination and its cross-validation score.
best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
print("Лучшие параметры:", best_params)
print("Лучшая точность:", best_cv_score)


Лучшие параметры: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 300}
Лучшая точность: 0.8622495893939739
