<a href="https://colab.research.google.com/github/poltorashka-s-BMa/course-os-linux/blob/main/var2_Malysheva_Ekaterina_ipynb%22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the dataset from the working directory.
data = pd.read_csv('data.adult.csv')

# Preview the first rows.
print(data.head())
print("\nИнформация о данных:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
data.info()

   age  workclass  fnlwgt     education  education-num      marital-status  \
0   34  Local-gov  284843       HS-grad              9       Never-married   
1   40    Private  190290  Some-college             10            Divorced   
2   36  Local-gov  177858     Bachelors             13  Married-civ-spouse   
3   22    Private  184756  Some-college             10       Never-married   
4   47    Private  149700     Bachelors             13  Married-civ-spouse   

        occupation   relationship   race     sex  capital-gain  capital-loss  \
0  Farming-fishing  Not-in-family  Black    Male           594             0   
1            Sales  Not-in-family  White    Male             0             0   
2   Prof-specialty      Own-child  White    Male             0             0   
3            Sales      Own-child  White  Female             0             0   
4     Tech-support        Husband  White    Male         15024             0   

   hours-per-week >50K,<=50K  
0              60  

In [None]:
# Treat the "?" placeholders as missing values and drop incomplete rows.
data = data.replace('?', pd.NA).dropna()

# Hold out 40% of the rows for testing; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(data, test_size=0.4, random_state=42)
# Work on explicit copies so the column assignments below never target a
# slice of `data` (this is what triggers pandas' SettingWithCopyWarning).
train_data = train_data.copy()
test_data = test_data.copy()

# Binary target: 1 when the (whitespace-stripped) label is ">50K", otherwise 0.
train_data['income'] = train_data['>50K,<=50K'].str.strip().eq('>50K').astype(int)
test_data['income'] = test_data['>50K,<=50K'].str.strip().eq('>50K').astype(int)

# The raw label column is no longer needed once `income` exists.
train_data = train_data.drop('>50K,<=50K', axis=1)
test_data = test_data.drop('>50K,<=50K', axis=1)

# One-hot encode the categorical features.
categorical_cols = ['workclass', 'education', 'marital-status', 'occupation',
                   'relationship', 'race', 'sex']
train_data = pd.get_dummies(train_data, columns=categorical_cols)
test_data = pd.get_dummies(test_data, columns=categorical_cols)

# The two frames are encoded independently, so a category present in only one
# split would yield different column sets (or a different column order).
# Align the test frame to the training schema: categories unseen in test become
# all-zero columns, categories unseen in train are dropped.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

In [None]:
# Relative frequency of each income class in the training split (class-balance check).
print(
    "Распределение классов в обучающей выборке:",
    train_data['income'].value_counts(normalize=True),
    sep="\n",
)

Распределение классов в обучающей выборке:
income
0    0.741746
1    0.258254
Name: proportion, dtype: float64


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Target vector and feature matrix for the training split.
y_train = train_data['income']
X_train = train_data.drop(columns='income')

# 5-fold cross-validated search over tree depths 1..20.
param_grid = {'max_depth': range(1, 21)}
tree = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    tree,
    param_grid,
    cv=5,
    return_train_score=True,
).fit(X_train, y_train)

# Report the winning depth and its mean cross-validated score.
print(f"Лучшая глубина дерева: {grid_search.best_params_['max_depth']}")
print(f"Лучшая точность: {grid_search.best_score_:.3f}")

Лучшая глубина дерева: 7
Лучшая точность: 0.841


In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Stage 1: pick the ensemble size with 5-fold CV, all other settings at their defaults.
# NOTE: `grid_search` is rebound in this cell, so the decision-tree search that
# previously lived under that name is no longer reachable through it afterwards.
n_estimators = [10, 50, 100, 200, 300, 400, 500]
param_grid = dict(n_estimators=n_estimators)
forest = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(forest, param_grid, cv=5).fit(X_train, y_train)

best_n = grid_search.best_params_['n_estimators']
print(f"Оптимальное количество деревьев: {best_n}")

# Stage 2: with the ensemble size fixed at `best_n`, search depth, split
# criterion and the number of features considered per split.
param_grid = dict(
    max_depth=[None, 10, 20, 30],
    criterion=['gini', 'entropy'],
    max_features=['sqrt', 'log2', None],
)
forest = RandomForestClassifier(n_estimators=best_n, random_state=42, n_jobs=-1)
grid_search = GridSearchCV(forest, param_grid, cv=5).fit(X_train, y_train)

# Report the winning stage-2 combination.
print("Лучшие параметры для Random Forest:", grid_search.best_params_, sep="\n")

Оптимальное количество деревьев: 300
Лучшие параметры для Random Forest:
{'criterion': 'gini', 'max_depth': 10, 'max_features': None}


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Held-out features and labels.
X_test = test_data.drop('income', axis=1)
y_test = test_data['income']
# Train and test were one-hot encoded separately; make sure the test matrix has
# exactly the training columns, in the same order (missing dummies become 0).
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# `grid_search` was reassigned by the Random Forest searches, so at this point
# its best_params_['max_depth'] is the forest's depth, not the tree's. Using it
# for the decision tree would silently evaluate the wrong model, so the (cheap)
# depth search for the single tree is run here under its own name.
tree_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    {'max_depth': range(1, 21)},
    cv=5,
)
tree_search.fit(X_train, y_train)

# Final models built from the selected hyper-parameters.
best_tree = DecisionTreeClassifier(max_depth=tree_search.best_params_['max_depth'], random_state=42)
best_forest = RandomForestClassifier(n_estimators=best_n, **grid_search.best_params_, random_state=42, n_jobs=-1)

# Fit on the full training split, then predict the held-out rows.
best_tree.fit(X_train, y_train)
best_forest.fit(X_train, y_train)

tree_pred = best_tree.predict(X_test)
forest_pred = best_forest.predict(X_test)

# Per-class precision/recall/F1 plus overall accuracy for each model.
for title, predictions in (("Decision Tree:", tree_pred), ("\nRandom Forest:", forest_pred)):
    print(title)
    print(classification_report(y_test, predictions))
    print("Accuracy:", accuracy_score(y_test, predictions))


Decision Tree:
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      4577
           1       0.70      0.62      0.66      1562

    accuracy                           0.84      6139
   macro avg       0.79      0.77      0.78      6139
weighted avg       0.83      0.84      0.83      6139

Accuracy: 0.8359667698322202

Random Forest:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      4577
           1       0.79      0.58      0.67      1562

    accuracy                           0.85      6139
   macro avg       0.83      0.76      0.79      6139
weighted avg       0.85      0.85      0.85      6139

Accuracy: 0.8545365694738557


In [None]:
pip install ete3

Collecting ete3
  Downloading ete3-3.1.3.tar.gz (4.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.8/4.8 MB 20.2 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: ete3
  Building wheel for ete3 (setup.py) ... done
  Created wheel for ete3: filename=ete3-3.1.3-py3-none-any.whl size=2273786 sha256=ce3a818c0cf50613061beafb234841af8a2978e1e2f2dd9a4644e62ba52b6315
  Stored in directory: /root/.cache/pip/wheels/dd/a8/60/0a29caa9f8ceb7316704be63c1578ab13c36668abb646366ac
Successfully built ete3
Installing collected packages: ete3
Successfully installed ete3-3.1.3


In [None]:
import os

# Qt has no display to attach to on a headless host, so ask for the offscreen
# platform plugin. This must be set before ete3 loads Qt; an existing value
# chosen by the user is left untouched.
os.environ.setdefault("QT_QPA_PLATFORM", "offscreen")

try:
    from ete3 import Tree, TreeStyle
except ImportError as exc:
    # ete3 silently skips its drawing module when Qt bindings are missing, which
    # surfaces only as "cannot import name 'TreeStyle'". Say what to do instead.
    raise ImportError(
        "ete3 could not load its drawing module (TreeStyle). "
        "Install Qt bindings, e.g. `%pip install PyQt5`, restart the kernel and re-run."
    ) from exc

tree = Tree("/content/alig_genome.tree")  # Load the tree from a Newick file
ts = TreeStyle()
ts.show_leaf_name = True
ts.show_branch_support = True  # Draw branch support (bootstrap) values

tree.render("tree.png", w=800, tree_style=ts)  # Write the figure as PNG
# tree.show()  # Interactive viewer

ImportError: cannot import name 'TreeStyle' from 'ete3' (/usr/local/lib/python3.11/dist-packages/ete3/__init__.py)