In [None]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import yaml

In [None]:
# Load the notebook configuration from the YAML options file.
# The file is opened with an explicit UTF-8 encoding so that non-ASCII values
# are decoded the same way on every platform instead of depending on the
# locale's default encoding.
options_path = 'config/config.yaml'
with open(options_path, 'r', encoding='utf-8') as option_file:
    # `options` is indexed with 'train_path' / 'test_path' further down.
    options = yaml.safe_load(option_file)

In [None]:
# Read the train and test CSV files from the locations given in the config.
data_train = pd.read_csv(options['train_path'])
data_test = pd.read_csv(options['test_path'])

# Separate the training frame into the remaining columns and the 'target' column.
X_train = data_train.drop(columns=['target'])
y_train = data_train['target']

# Первичный обзор данных

In [None]:
# First look at the training data: leading rows, schema, summary statistics
# and the number of missing values per column.
print(data_train.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
data_train.info()
print(data_train.describe())
print(data_train.isnull().sum())

# Визуализация данных

In [None]:
def draw_hist_and_boxplot(column, data=None):
    """Show a histogram and a box plot of one column side by side.

    Parameters
    ----------
    column : str
        Name of the column to plot.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``data_train`` frame is used, which is what the function always
        did before this parameter existed.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    frame = data_train if data is None else data
    values = frame[column]

    _, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: distribution of the values in 50 bins.
    values.plot(kind='hist', bins=50, ax=hist_ax)
    hist_ax.set(title=f'Histogram of {column}', xlabel=column)

    # Right panel: box plot of the same values.
    values.plot(kind='box', ax=box_ax)
    box_ax.set(title=f'Box plot of {column}')

    plt.show()

In [None]:
# Plot every numeric column of the training data. The previous placeholder
# call used an empty column name (''), which raises a KeyError on a full
# top-to-bottom run unless the data really has a column named ''.
for numeric_column in data_train.select_dtypes(include='number').columns:
    draw_hist_and_boxplot(numeric_column)

# Работа с выбросами и аномалиями

In [None]:
def count_outliers(column_name, data=None, whisker=1.5):
    """Count the values of a column that fall outside the IQR fences.

    A value is counted when it lies below ``Q1 - whisker * IQR`` or above
    ``Q3 + whisker * IQR``, where ``Q1`` and ``Q3`` are the 0.25 and 0.75
    quantiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    column_name : str
        Name of the column to inspect.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``data_train`` frame is used, which is what the function always
        did before this parameter existed.
    whisker : float, optional
        Multiplier applied to the IQR when building the fences.
        Defaults to 1.5, the value that was previously hard-coded.

    Returns
    -------
    int
        Number of values strictly outside the two fences.
    """
    frame = data_train if data is None else data
    column = frame[column_name]

    # Interquartile range of the column.
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1

    # Fences beyond which a value is treated as an outlier.
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr

    # Summing the boolean mask counts the flagged rows without building
    # an intermediate filtered Series.
    is_outlier = (column < lower_bound) | (column > upper_bound)
    return int(is_outlier.sum())

In [None]:
# Count IQR outliers for every numeric column of the training data. The
# previous placeholder call used an empty column name (''), which raises a
# KeyError on a full top-to-bottom run unless the data really has a column
# named ''.
outlier_counts = pd.Series(
    {
        column: count_outliers(column)
        for column in data_train.select_dtypes(include='number').columns
    },
    name='outliers',
    dtype='int64',
)
outlier_counts.sort_values(ascending=False)

# Корреляционный анализ

In [None]:
# Pairwise correlations between the numeric (and boolean) columns only.
# Calling DataFrame.corr() on the whole frame raises with pandas >= 2.0 as soon
# as a string column is present, so non-numeric columns are filtered out first.
# NOTE(review): the notebook has a categorical-features section, so such
# columns are presumably present in the data — confirm.
correlation_matrix = data_train.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Работа с категориальными признаками

In [None]:
def plot_categorical_feature(feature_name, data=None):
    """Draw a count plot showing how often each value of a column occurs.

    Parameters
    ----------
    feature_name : str
        Name of the column to plot on the x axis.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``data_train`` frame is used, which is what the function always
        did before this parameter existed.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    frame = data_train if data is None else data

    # Label the Axes returned by seaborn directly.
    ax = sns.countplot(data=frame, x=feature_name)
    ax.set(
        xlabel=feature_name,
        ylabel='Count',
        title=f'Distribution of {feature_name}',
    )
    # Tilt the category labels by 45 degrees.
    plt.xticks(rotation=45)
    plt.show()

In [None]:
def analyze_categorical_feature(feature_name, data=None):
    """Print the distinct values of a column and how often each one occurs.

    Parameters
    ----------
    feature_name : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``data_train`` frame is used, which is what the function always
        did before this parameter existed.

    Returns
    -------
    None
        Both reports are written to stdout.
    """
    frame = data_train if data is None else data
    feature = frame[feature_name]

    print(f"Unique values of {feature_name}:")
    print(feature.unique())
    print(f"\nValue counts of {feature_name}:")
    # dropna=False keeps missing values in the tally. unique() above already
    # lists them, so dropping them here made the two reports disagree.
    print(feature.value_counts(dropna=False))


# Исследование целевой переменной

# Frequency of each target value (y_train is the 'target' column of data_train).
y_train.value_counts()

# Feature importance 
