# Обработка данных о потреблении электроэнергии в домашних условиях

## Подключение библиотек

In [15]:
import pandas as pd
import numpy as np

## Данные

### Считываем данные с файла

In [16]:
file_path = './household_power_consumption.txt'
df_original = pd.read_csv(file_path, sep=';', low_memory=False)

df = df_original.copy()
print(df)

               Date      Time  ... Sub_metering_2 Sub_metering_3
0        16/12/2006  17:24:00  ...          1.000           17.0
1        16/12/2006  17:25:00  ...          1.000           16.0
2        16/12/2006  17:26:00  ...          2.000           17.0
3        16/12/2006  17:27:00  ...          1.000           17.0
4        16/12/2006  17:28:00  ...          1.000           17.0
...             ...       ...  ...            ...            ...
2075254  26/11/2010  20:58:00  ...          0.000            0.0
2075255  26/11/2010  20:59:00  ...          0.000            0.0
2075256  26/11/2010  21:00:00  ...          0.000            0.0
2075257  26/11/2010  21:01:00  ...          0.000            0.0
2075258  26/11/2010  21:02:00  ...          0.000            0.0

[2075259 rows x 9 columns]


## Удаляем строки с "?" 

In [17]:
df.replace('?', pd.NA, inplace=True)
df.dropna(inplace=True)

### Анализируем кол-во пропущенных данных

In [18]:
missingPercent = df.isnull().mean() * 100
nonMisssingPercent = (1 - df.isnull().mean()) * 100

missingStats = pd.DataFrame({
    "Total": df.isnull().sum(), 
    "Percent Missing": missingPercent.round(2),  
    "Percent Non-Missing": nonMisssingPercent.round(2) 
})

print(missingStats)

                       Total  Percent Missing  Percent Non-Missing
Date                       0              0.0                100.0
Time                       0              0.0                100.0
Global_active_power        0              0.0                100.0
Global_reactive_power      0              0.0                100.0
Voltage                    0              0.0                100.0
Global_intensity           0              0.0                100.0
Sub_metering_1             0              0.0                100.0
Sub_metering_2             0              0.0                100.0
Sub_metering_3             0              0.0                100.0


### Подсчет уникальных значений для каждого признака, чтобы определить категории признаков

In [19]:
unique_counts = df.nunique()
unique_counts_table = pd.DataFrame({
    'Feature': unique_counts.index,
    'Unique Values': unique_counts.values
})

unique_counts_table['Percentage'] = (unique_counts_table['Unique Values'] / len(df)) * 100
print(unique_counts_table)

                 Feature  Unique Values  Percentage
0                   Date           1433    0.069927
1                   Time           1440    0.070269
2    Global_active_power           4186    0.204267
3  Global_reactive_power            532    0.025960
4                Voltage           2837    0.138439
5       Global_intensity            221    0.010784
6         Sub_metering_1             88    0.004294
7         Sub_metering_2             81    0.003953
8         Sub_metering_3             32    0.001562


In [20]:
df['datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])

df.set_index('datetime', inplace=True)
df.drop(columns=['Date', 'Time'], inplace=True)

df['hour'] = df.index.hour
df['day_of_week'] = df.index.dayofweek  # 0=Пн, 6=Вс
df['is_weekend'] = (df.index.weekday >= 5).astype(int)
df['month'] = df.index.month

df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

  df['datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])


In [21]:
df.to_csv('preprocessed_data.csv', index=False)