# imports

In [17]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from cuml.svm import SVC
from sklearn.metrics import accuracy_score

# collect data

In [2]:
# Download the CIFAR_10 dataset from OpenML as a (features, labels) pair.
x_DB, y_DB = fetch_openml(
    name='CIFAR_10',
    return_X_y=True,
)

# data cleaning

In [3]:
# Summarise the raw feature frame: row/column counts, dtypes and memory usage.
x_DB.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 3072 entries, a0 to a3071
dtypes: int64(3072)
memory usage: 1.4 GB


In [4]:
# Preview the first rows of the raw feature frame.
x_DB.head()

Unnamed: 0,a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,...,a3062,a3063,a3064,a3065,a3066,a3067,a3068,a3069,a3070,a3071
0,26,17,13,13,13,14,14,15,14,10,...,250,237,144,33,29,46,28,27,26,27
1,94,101,95,94,94,97,111,142,166,154,...,149,145,147,150,152,163,174,182,184,155
2,183,158,166,167,169,171,163,163,160,161,...,186,174,177,227,250,250,250,250,250,250
3,255,254,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
4,182,156,156,144,118,132,148,118,136,85,...,87,83,83,75,79,72,68,73,67,75


In [5]:
# Find the global value range of the features to decide on a compact dtype.
# The array is materialised once and released afterwards so it does not
# stay alive in the kernel namespace.
pixel_values = x_DB.to_numpy()
min_all, max_all = pixel_values.min(), pixel_values.max()
del pixel_values

print(min_all)
print(max_all)

0
255


In [6]:
# Downcast the features to unsigned 8-bit integers, then re-check the
# frame summary to see the effect on memory usage.
x = x_DB.astype(np.uint8)
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 3072 entries, a0 to a3071
dtypes: uint8(3072)
memory usage: 175.8 MB


In [7]:
# Inspect the raw label series: dtype/memory summary, then a preview of the first rows.
y_DB.info()
y_DB.head()

<class 'pandas.core.series.Series'>
RangeIndex: 60000 entries, 0 to 59999
Series name: class
Non-Null Count  Dtype   
--------------  -----   
60000 non-null  category
dtypes: category(1)
memory usage: 59.1 KB


Unnamed: 0,class
0,8
1,5
2,0
3,6
4,9


In [8]:
# Count how many samples each distinct label has, to check class balance.
unique_labels, counts = np.unique(y_DB, return_counts=True)
print(unique_labels)
print(counts)

['0' '1' '2' '3' '4' '5' '6' '7' '8' '9']
[6000 6000 6000 6000 6000 6000 6000 6000 6000 6000]


In [9]:
# Convert the labels to integers first, then downcast them to uint8.
y = y_DB.astype(int).astype('uint8')

# Confirm the resulting dtype and memory usage.
y.info()


<class 'pandas.core.series.Series'>
RangeIndex: 60000 entries, 0 to 59999
Series name: class
Non-Null Count  Dtype
--------------  -----
60000 non-null  uint8
dtypes: uint8(1)
memory usage: 58.7 KB


# machine learning

## test_split

In [10]:
# Hold out 20% of the samples for testing; the fixed random_state makes the split repeatable.
# NOTE(review): no `stratify=` is passed, so per-class counts in the two splits
# are not forced to be equal — consider `stratify=y`.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [11]:
# Print the summary of every split, each followed by a separator line.
for split in (x_train, x_test, y_train, y_test):
    split.info()
    print("_______________________")

<class 'pandas.core.frame.DataFrame'>
Index: 48000 entries, 48572 to 56422
Columns: 3072 entries, a0 to a3071
dtypes: uint8(3072)
memory usage: 141.0 MB
_______________________
<class 'pandas.core.frame.DataFrame'>
Index: 12000 entries, 12628 to 36568
Columns: 3072 entries, a0 to a3071
dtypes: uint8(3072)
memory usage: 35.2 MB
_______________________
<class 'pandas.core.series.Series'>
Index: 48000 entries, 48572 to 56422
Series name: class
Non-Null Count  Dtype
--------------  -----
48000 non-null  uint8
dtypes: uint8(1)
memory usage: 421.9 KB
_______________________
<class 'pandas.core.series.Series'>
Index: 12000 entries, 12628 to 36568
Series name: class
Non-Null Count  Dtype
--------------  -----
12000 non-null  uint8
dtypes: uint8(1)
memory usage: 105.5 KB
_______________________


In [12]:
# Display the (rows, columns) shape of the training features.
x_train.shape

(48000, 3072)

## sample_weight

In [13]:
def calculate_sample_weight(y_true, y_pred, sample_weight=None):
    """Perform one AdaBoost-style sample re-weighting step.

    The weighted error rate ``epsilon`` of the predictions is computed,
    turned into a stage weight ``alpha = 0.5 * log((1 - epsilon) / epsilon)``,
    and every misclassified sample has its weight multiplied by
    ``exp(alpha)``. The returned weights always sum to 1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_true``.
    sample_weight : array-like or None, default=None
        Current non-negative sample weights, same shape as ``y_true``.
        They do not have to be normalised; they are rescaled to sum to 1
        before the error rate is computed. ``None`` means uniform weights.

    Returns
    -------
    sample_weight : numpy.ndarray
        Updated weights, normalised to sum to 1.
    alpha : float
        Stage weight derived from the weighted error rate.

    Raises
    ------
    ValueError
        If the shapes of the inputs disagree, or if the total weight is
        not a positive finite number (which includes empty input).

    Notes
    -----
    NOTE(review): ``alpha`` uses the two-class formula, so it is negative
    whenever the weighted error exceeds 0.5; misclassified samples are then
    down-weighted instead of up-weighted. For problems with more than two
    classes this threshold is easy to cross.
    """
    # Coerce to arrays so that `!=` is always element-wise (two plain lists
    # would otherwise compare to a single bool) and pandas index alignment
    # cannot interfere.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    if sample_weight is None:
        sample_weight = np.ones(y_true.shape, dtype=np.float64)
    else:
        sample_weight = np.asarray(sample_weight, dtype=np.float64)
        if sample_weight.shape != y_true.shape:
            raise ValueError(
                f"sample_weight must have shape {y_true.shape}, got {sample_weight.shape}"
            )

    # Rescale so that epsilon below is a genuine weighted error *rate* in
    # [0, 1], regardless of the scale of the incoming weights.
    total_weight = np.sum(sample_weight)
    if not np.isfinite(total_weight) or total_weight <= 0:
        raise ValueError("sample_weight must have a positive, finite sum")
    sample_weight = sample_weight / total_weight

    errors = (y_true != y_pred).astype(np.float64)
    epsilon = np.sum(sample_weight * errors)

    # Avoid division by zero / log of zero at the extremes.
    if epsilon >= 1.0:
        epsilon = 0.999
    if epsilon <= 0:
        epsilon = 1e-10

    alpha = 0.5 * np.log((1 - epsilon) / epsilon)

    # Update the weights: only misclassified samples are rescaled.
    sample_weight = sample_weight * np.exp(alpha * errors)

    # Normalise the weights so they sum to 1 again.
    sample_weight = sample_weight / np.sum(sample_weight)
    return sample_weight, alpha

## Model

In [14]:
from sklearn.base import BaseEstimator, ClassifierMixin


class MyCustomBoostingClassifier(BaseEstimator, ClassifierMixin):
    """Three-stage sequentially re-weighted classifier.

    Stage 1 is a ``LogisticRegression``, stages 2 and 3 are ``SVC`` models.
    Each stage is trained with the sample weights produced by
    ``calculate_sample_weight`` from the previous stage's training
    predictions.

    Parameters
    ----------
    reg_max_iter : int, default=100
        ``max_iter`` of the logistic-regression stage.
    svm_c_1 : float, default=1
        ``C`` of the first SVC stage.
    svm_c_2 : float, default=1
        ``C`` of the second SVC stage.

    Attributes
    ----------
    clf_1, clf_2, clf_3 : estimators
        The fitted stages; created by ``fit``.
    alphas : list of float
        Stage weights returned by ``calculate_sample_weight``, one per
        stage, for the most recent ``fit`` call.
    classes_ : numpy.ndarray
        Sorted unique labels seen in ``fit``.

    Notes
    -----
    ``predict`` returns the output of the third stage only; ``alphas`` are
    recorded but are not used to combine the stages.
    """

    def __init__(self, reg_max_iter=100, svm_c_1=1, svm_c_2=1):
        # Only store hyper-parameters here. The sub-estimators are built in
        # `fit`, because `set_params` (used by GridSearchCV on every cloned
        # candidate) merely reassigns these attributes and would never reach
        # estimators constructed in `__init__`.
        self.reg_max_iter = reg_max_iter
        self.svm_c_1 = svm_c_1
        self.svm_c_2 = svm_c_2
        self.alphas = []

    def fit(self, X, y, sample_weight=None):
        """Fit the three stages in sequence.

        Parameters
        ----------
        X : array-like
            Training features.
        y : array-like
            Training labels.
        sample_weight : array-like or None, default=None
            Initial sample weights passed to the first stage.

        Returns
        -------
        self : MyCustomBoostingClassifier
            The fitted estimator.
        """
        # Build the stages from the *current* hyper-parameter values.
        self.clf_1 = LogisticRegression(max_iter=self.reg_max_iter)
        self.clf_2 = SVC(C=self.svm_c_1)
        self.clf_3 = SVC(C=self.svm_c_2)

        # Start from a clean slate so repeated fits do not accumulate alphas.
        self.alphas = []
        self.classes_ = np.unique(y)

        stage_weight = sample_weight
        for clf in (self.clf_1, self.clf_2, self.clf_3):
            clf.fit(X, y, sample_weight=stage_weight)
            # Re-weight the training samples from this stage's own mistakes.
            stage_weight, alpha = calculate_sample_weight(y, clf.predict(X), stage_weight)
            self.alphas.append(alpha)

        return self

    def predict(self, X):
        """Predict labels for ``X`` using the third (last) stage only."""
        y_pred = self.clf_3.predict(X)
        return y_pred



## GridSearchCV

In [18]:
from sklearn.model_selection import GridSearchCV


# Tune hyper-parameters on the training split only, so the held-out test
# split is never seen during model selection.
# Plain NumPy arrays are used instead of the pandas objects.
x_train_array = x_train.values
y_train_array = y_train.values

model = MyCustomBoostingClassifier()
params = {"reg_max_iter": [10, 50, 100, 200], "svm_c_1": [0.1, 0.5, 1, 5], "svm_c_2": [0.1, 0.5, 1, 5]}

# NOTE(review): 64 candidates x 2 folds, each fitting three models on the
# full feature width — expect a long run. Whether n_jobs=-1 is appropriate
# for the SVC backend in use should be confirmed.
grid_model = GridSearchCV(model, param_grid=params, cv=2, scoring="accuracy", verbose=3, n_jobs=-1)
grid_model.fit(x_train_array, y_train_array)

best_model = grid_model.best_estimator_
print("Best parameters:", grid_model.best_params_)
print("Best score:", grid_model.best_score_)

Fitting 2 folds for each of 64 candidates, totalling 128 fits


KeyboardInterrupt: 

In [20]:
# تبدیل به numpy array برای جلوگیری از مشکل index
x_train_array = x_train.values
y_train_array = y_train.values
x_test_array = x_test.values
y_test_array = y_test.values

# ایجاد مدل
model = MyCustomBoostingClassifier(reg_max_iter=100, svm_c_1=1, svm_c_2=1)

# آموزش مدل
print("شروع آموزش مدل...")
model.fit(x_train_array, y_train_array)
print("آموزش مدل تمام شد!")

# پیش‌بینی
print("شروع پیش‌بینی...")
y_pred = model.predict(x_test_array)

# نمایش دقت
accuracy = accuracy_score(y_test_array, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# نمایش اطلاعات بیشتر
print(f"تعداد نمونه‌های تست: {len(y_test_array)}")
print(f"تعداد پیش‌بینی‌های درست: {sum(y_test_array == y_pred)}")
print(f"تعداد پیش‌بینی‌های غلط: {sum(y_test_array != y_pred)}")

# نمایش توزیع کلاس‌ها
print("\nتوزیع کلاس‌ها در داده‌های تست:")
unique_test, counts_test = np.unique(y_test_array, return_counts=True)
for label, count in zip(unique_test, counts_test):
    print(f"کلاس {label}: {count} نمونه")

شروع آموزش مدل...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


آموزش مدل تمام شد!
شروع پیش‌بینی...
Accuracy: 0.5380
تعداد نمونه‌های تست: 12000
تعداد پیش‌بینی‌های درست: 6456
تعداد پیش‌بینی‌های غلط: 5544

توزیع کلاس‌ها در داده‌های تست:
کلاس 0: 1193 نمونه
کلاس 1: 1211 نمونه
کلاس 2: 1218 نمونه
کلاس 3: 1208 نمونه
کلاس 4: 1168 نمونه
کلاس 5: 1203 نمونه
کلاس 6: 1185 نمونه
کلاس 7: 1241 نمونه
کلاس 8: 1183 نمونه
کلاس 9: 1190 نمونه
