In [1]:
import pandas as pd

df = pd.read_csv('SteelPlatesFaults.csv')
print(df.shape)
print(df.head())

(1941, 34)
   X_Minimum  X_Maximum  Y_Minimum  Y_Maximum  Pixels_Areas  X_Perimeter  \
0         42         50     270900     270944           267           17   
1        645        651    2538079    2538108           108           10   
2        829        835    1553913    1553931            71            8   
3        853        860     369370     369415           176           13   
4       1289       1306     498078     498335          2409           60   

   Y_Perimeter  Sum_of_Luminosity  Minimum_of_Luminosity  \
0           44              24220                     76   
1           30              11397                     84   
2           19               7972                     99   
3           45              18996                     99   
4          260             246930                     37   

   Maximum_of_Luminosity  ...  Orientation_Index  Luminosity_Index  \
0                    108  ...             0.8182           -0.2913   
1                    123  ...  

In [2]:
print(df.isnull().sum())

X_Minimum                0
X_Maximum                0
Y_Minimum                0
Y_Maximum                0
Pixels_Areas             0
X_Perimeter              0
Y_Perimeter              0
Sum_of_Luminosity        0
Minimum_of_Luminosity    0
Maximum_of_Luminosity    0
Length_of_Conveyer       0
TypeOfSteel_A300         0
TypeOfSteel_A400         0
Steel_Plate_Thickness    0
Edges_Index              0
Empty_Index              0
Square_Index             0
Outside_X_Index          0
Edges_X_Index            0
Edges_Y_Index            0
Outside_Global_Index     0
LogOfAreas               0
Log_X_Index              0
Log_Y_Index              0
Orientation_Index        0
Luminosity_Index         0
SigmoidOfAreas           0
Pastry                   0
Z_Scratch                0
K_Scatch                 0
Stains                   0
Dirtiness                0
Bumps                    0
Other_Faults             0
dtype: int64


In [3]:
print(df.dtypes)

X_Minimum                  int64
X_Maximum                  int64
Y_Minimum                  int64
Y_Maximum                  int64
Pixels_Areas               int64
X_Perimeter                int64
Y_Perimeter                int64
Sum_of_Luminosity          int64
Minimum_of_Luminosity      int64
Maximum_of_Luminosity      int64
Length_of_Conveyer         int64
TypeOfSteel_A300           int64
TypeOfSteel_A400           int64
Steel_Plate_Thickness      int64
Edges_Index              float64
Empty_Index              float64
Square_Index             float64
Outside_X_Index          float64
Edges_X_Index            float64
Edges_Y_Index            float64
Outside_Global_Index     float64
LogOfAreas               float64
Log_X_Index              float64
Log_Y_Index              float64
Orientation_Index        float64
Luminosity_Index         float64
SigmoidOfAreas           float64
Pastry                     int64
Z_Scratch                  int64
K_Scatch                   int64
Stains    

In [4]:
target_cols = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
counts = df[target_cols].sum().sort_values(ascending=False)
print(counts)

Other_Faults    673
Bumps           402
K_Scatch        391
Z_Scratch       190
Pastry          158
Stains           72
Dirtiness        55
dtype: int64


In [5]:
a = 673/1940
b = 55/1940
print('Разница в: ' + str(round((a/b), 2)) + ' раз')
print('Самый частый:')
print(str(round((a * 100), 2)) + '%')
print('Самый редкий:')
print(str(round((b * 100), 2)) + '%')

Разница в: 12.24 раз
Самый частый:
34.69%
Самый редкий:
2.84%


In [6]:
y = df[target_cols].idxmax(axis=1)
print(y.head())
print(y.value_counts())

0    Pastry
1    Pastry
2    Pastry
3    Pastry
4    Pastry
dtype: str
Other_Faults    673
Bumps           402
K_Scatch        391
Z_Scratch       190
Pastry          158
Stains           72
Dirtiness        55
Name: count, dtype: int64


In [7]:
X = df.drop(columns=target_cols)
print(X.describe())
print(X.info())

         X_Minimum    X_Maximum     Y_Minimum     Y_Maximum   Pixels_Areas  \
count  1941.000000  1941.000000  1.941000e+03  1.941000e+03    1941.000000   
mean    571.136012   617.964451  1.650685e+06  1.650739e+06    1893.878413   
std     520.690671   497.627410  1.774578e+06  1.774590e+06    5168.459560   
min       0.000000     4.000000  6.712000e+03  6.724000e+03       2.000000   
25%      51.000000   192.000000  4.712530e+05  4.712810e+05      84.000000   
50%     435.000000   467.000000  1.204128e+06  1.204136e+06     174.000000   
75%    1053.000000  1072.000000  2.183073e+06  2.183084e+06     822.000000   
max    1705.000000  1713.000000  1.298766e+07  1.298769e+07  152655.000000   

        X_Perimeter   Y_Perimeter  Sum_of_Luminosity  Minimum_of_Luminosity  \
count   1941.000000   1941.000000       1.941000e+03            1941.000000   
mean     111.855229     82.965997       2.063121e+05              84.548686   
std      301.209187    426.482879       5.122936e+05        

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [11]:
from sklearn.metrics import accuracy_score, f1_score

acc = accuracy_score(y_test, y_pred_lr)
print(acc)
f1 = f1_score(y_test, y_pred_lr, average='macro')
print(f1)

0.7275064267352185
0.7417891382899583


In [12]:
from sklearn.metrics import accuracy_score, f1_score

acc = accuracy_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf, average = 'macro')
print(acc)
print(f1)

0.794344473007712
0.8025116826821165


In [13]:
from sklearn.ensemble import RandomForestClassifier
rf_bal = RandomForestClassifier(class_weight = 'balanced', random_state=42)
rf_bal.fit(X_train, y_train)
y_pred_bal = rf_bal.predict(X_test)
f1_bal = f1_score(y_test, y_pred_bal, average='macro')
print(f1_bal)

0.8074112231078175


In [14]:
corr_matrix = X.corr()
abs_corr = corr_matrix.abs()

corr_pairs = abs_corr.unstack()
unique_pairs = corr_pairs[corr_pairs.index.get_level_values(0) < corr_pairs.index.get_level_values(1)]
high_corr = unique_pairs[unique_pairs > 0.95]
print(high_corr)

X_Maximum         X_Minimum            0.988314
Y_Maximum         Y_Minimum            1.000000
Pixels_Areas      X_Perimeter          0.966644
                  Sum_of_Luminosity    0.978952
TypeOfSteel_A300  TypeOfSteel_A400     1.000000
dtype: float64


In [15]:
cols_to_drop = [
    'X_Maximum',
    'Y_Maximum',
    'X_Perimeter',
    'Sum_of_Luminosity',
    'TypeOfSteel_A400'
]
X_reduced = X.drop(columns=cols_to_drop)
print(X.shape[1])
print(X_reduced.shape[1])
print(cols_to_drop)

27
22
['X_Maximum', 'Y_Maximum', 'X_Perimeter', 'Sum_of_Luminosity', 'TypeOfSteel_A400']


In [16]:
X_train_red, X_test_red, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.2, stratify=y, random_state=42
)
rf_red = RandomForestClassifier(random_state=42)
rf_red.fit(X_train_red, y_train)
y_pred_red = rf_red.predict(X_test_red)
f1_red = f1_score(y_test, y_pred_red, average='macro')

print(f1_red)

0.789520277771148


In [17]:
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(f_classif, k=10)
X_train_sel = selector.fit_transform(X_train, y_train)
X_test_sel = selector.transform(X_test)
rf_selected = RandomForestClassifier(random_state=42)
rf_selected = rf_selected.fit(X_train_sel, y_train)
y_pred_sel = rf_selected.predict(X_test_sel)
f1_sel = f1_score(y_test, y_pred_sel, average='macro')

print(f1_sel)

0.576512515688186


In [18]:
from sklearn.model_selection import cross_val_score

rf_baseline = RandomForestClassifier(random_state=42)
cv_scores = cross_val_score(rf_baseline, X, y, cv=5, scoring='f1_macro')

print("F1-macro:", round(cv_scores.mean(), 4), "±", round(cv_scores.std(), 4))

F1-macro: 0.6374 ± 0.0688


In [20]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(round(grid_search.best_score_, 4))

y_pred_tuned = grid_search.predict(X_test)
f1_tuned = f1_score(y_test, y_pred_tuned, average='macro')
print(round(f1_tuned, 4))

{'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
0.7916
0.8073
