ИМПОРТ БИБЛИОТЕК

In [1]:
import pandas as pd
import os
import glob
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from dict import file_categories
from categories import category_map

ЗАГРУЗКА ФАЙЛОВ

In [2]:
path = "C:/Users/User/Downloads/archive (2)/files"
csv_files = glob.glob(os.path.join(path, "*.csv"))
print(f"всего файлов - {len(csv_files)}")

всего файлов - 30


ОБЪЕДИНЯЕМ ФАЙЛЫ В ОДИН ДАТАФРЕЙМ С РАЗМЕТКОЙ

1 читаем файл из /files

2 добавляем в дф категорию (разметку)

3 

In [4]:
def get_category_by_number(file_name):
    file_number = file_name.split(' - ')[0]   
    return category_map.get(file_number, 'Unknown')

In [5]:
all_data = []

for file in csv_files:
    file_name = os.path.basename(file)
    
    try:
        category = get_category_by_number(file_name)
        df = pd.read_csv(file)
        df['category'] = category
        all_data.append(df)

    except Exception as e:
        print(f"Ошибка загрузки {file_name}: {e}")

In [6]:
if all_data:
    combined_df = pd.concat(all_data, ignore_index=True)

    print("ОБЗОР ДАННЫХ:")
    print(f"Общий размер: {combined_df.shape}")
    print(f"Общее количество записей: {len(combined_df):,}")
    
    print("\nРАСПРЕДЕЛЕНИЕ ПО КАТЕГОРИЯМ:")
    category_counts = combined_df['category'].value_counts()
    for category, count in category_counts.items():
        percentage = (count / len(combined_df)) * 100
        print(f"  {category}: {count:,} записей ({percentage:.1f}%)")

    output_file = "dataset_complete.csv"
    combined_df.to_csv(output_file, index=False)
    print(f"\nданные сохранены в файл: {output_file}")

ОБЗОР ДАННЫХ:
Общий размер: (3532165, 5)
Общее количество записей: 3,532,165

РАСПРЕДЕЛЕНИЕ ПО КАТЕГОРИЯМ:
  Mechanical and Electrical fault: 507,682 записей (14.4%)
  Mechanical and Electrical fault with noise: 416,871 записей (11.8%)
  Electrical fault with noise: 319,095 записей (9.0%)
  Electrical fault: 313,321 записей (8.9%)
  Mechanical and Electrical fault with load and noise: 282,689 записей (8.0%)
  Mechanical fault (shaft misalignment): 243,313 записей (6.9%)
  Electrical fault with load and noise: 215,550 записей (6.1%)
  Electrical fault with load: 206,158 записей (5.8%)
  Mechanical fault with high noise: 155,047 записей (4.4%)
  Mechanical fault with noise: 153,466 записей (4.3%)
  Mechanical fault (shaft misalignment) with load: 151,123 записей (4.3%)
  Mechanical fault with load and noise: 151,016 записей (4.3%)
  Normal operation: 107,346 записей (3.0%)
  Normal operation with load and noise: 104,468 записей (3.0%)
  Normal operation with load: 102,909 записей (2.9%)


In [7]:
print(combined_df.head())

   Timestamp    AccX   AccY     AccZ          category
0    2141864  3624.0 -567.0  17452.0  Normal operation
1    2143688  -649.0 -688.0  16899.0  Normal operation
2    2145492   -21.0  -66.0  14555.0  Normal operation
3    2147308 -1207.0  923.0  12459.0  Normal operation
4    2149108   208.0  -16.0  14951.0  Normal operation


структура 

1 временная метка

2 показания акселерометра ось х

3 показания акселерометра ось y

4 показания акселерометра ось z

5 категория 

In [8]:
print(combined_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3532165 entries, 0 to 3532164
Data columns (total 5 columns):
 #   Column     Dtype  
---  ------     -----  
 0   Timestamp  int64  
 1   AccX       float64
 2   AccY       float64
 3   AccZ       float64
 4   category   object 
dtypes: float64(3), int64(1), object(1)
memory usage: 134.7+ MB
None


пропусков нет, типы данных корректны

In [9]:
print("Пропуски в данных:")
print(combined_df.isnull().sum())
print(f"Всего пропусков: {combined_df.isnull().sum().sum()}")

Пропуски в данных:
Timestamp    0
AccX         1
AccY         1
AccZ         2
category     0
dtype: int64
Всего пропусков: 4


In [10]:
combined_df = combined_df.dropna()

In [11]:
print("Пропуски в данных:")
print(combined_df.isnull().sum())
print(f"Всего пропусков: {combined_df.isnull().sum().sum()}")

Пропуски в данных:
Timestamp    0
AccX         0
AccY         0
AccZ         0
category     0
dtype: int64
Всего пропусков: 0


In [12]:
X = combined_df[['AccX', 'AccY', 'AccZ']] 
y = combined_df['category'] 

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: (3532163, 3)
y shape: (3532163,)


In [13]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"Уникальные классы: {len(np.unique(y_encoded))}")
print("Соответствие классов:")
for i, class_name in enumerate(label_encoder.classes_):
    print(f"  {i}: {class_name}")


Уникальные классы: 16
Соответствие классов:
  0: Electrical fault
  1: Electrical fault with load
  2: Electrical fault with load and noise
  3: Electrical fault with noise
  4: Mechanical and Electrical fault
  5: Mechanical and Electrical fault with load and noise
  6: Mechanical and Electrical fault with noise
  7: Mechanical fault (shaft misalignment)
  8: Mechanical fault (shaft misalignment) with load
  9: Mechanical fault with high noise
  10: Mechanical fault with load and noise
  11: Mechanical fault with noise
  12: Normal operation
  13: Normal operation with load
  14: Normal operation with load and noise
  15: Normal operation with noise


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, 
    test_size=0.3, 
    stratify=y_encoded, 
    random_state=42
)

print(f"Обучающая выборка: {X_train.shape}, {y_train.shape}")
print(f"Тестовая выборка: {X_test.shape}, {y_test.shape}")

Обучающая выборка: (2472514, 3), (2472514,)
Тестовая выборка: (1059649, 3), (1059649,)


In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
input_dim = X_train_scaled.shape[1]
num_classes = len(np.unique(y_encoded))


In [None]:
model = Sequential()
model.add(Dense(64, input_dim=input_dim, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax')) 

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam', 
    metrics=['accuracy']
)

print(model.summary())

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None


In [20]:
history = model.fit(
    x=X_train_scaled, 
    y=y_train, 
    batch_size=64, 
    epochs=15,
    validation_data=(X_test_scaled, y_test), 
    verbose=1
)


Epoch 1/15
[1m38634/38634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 2ms/step - accuracy: 0.1815 - loss: 2.3931 - val_accuracy: 0.1839 - val_loss: 2.3843
Epoch 2/15
[1m38634/38634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 2ms/step - accuracy: 0.1836 - loss: 2.3849 - val_accuracy: 0.1823 - val_loss: 2.3859
Epoch 3/15
[1m38634/38634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 2ms/step - accuracy: 0.1840 - loss: 2.3841 - val_accuracy: 0.1846 - val_loss: 2.3840
Epoch 4/15
[1m38634/38634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 2ms/step - accuracy: 0.1842 - loss: 2.3837 - val_accuracy: 0.1845 - val_loss: 2.3833
Epoch 5/15
[1m38634/38634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 2ms/step - accuracy: 0.1842 - loss: 2.3834 - val_accuracy: 0.1837 - val_loss: 2.3838
Epoch 6/15
[1m38634/38634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 2ms/step - accuracy: 0.1842 - loss: 2.3831 - val_accuracy: 0.1840 - val_loss: 2.383

In [21]:
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f"\nТочность на тестовой выборке: {test_accuracy:.4f}")


Точность на тестовой выборке: 0.1840
