In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# –ü–µ—Ä–≤—ã–π —ç—Ç–∞–ø. –†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö

In [4]:
df = pd.read_csv("data.csv")

In [5]:
df.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [6]:
features, labels = df.iloc[:, :-1], df.iloc[:, -1]

In [10]:
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, stratify=labels, test_size=0.2)

# –í—Ç–æ—Ä–æ–π —ç—Ç–∞–ø. –ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö

In [11]:
columns_to_drop = [] # –ø–µ—Ä–µ—á–∏—Å–ª–∏—Ç—å –∫–æ–ª–æ–Ω–∫–∏ –∫ –¥—Ä–æ–ø—É, –µ—Å–ª–∏ –µ—Å—Ç—å

In [12]:
X = features_train.copy()  # —á—Ç–æ–±—ã –Ω–µ –º–æ–¥–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞—Ç—å —Å–ª—É—á–∞–π–Ω–æ

## –û–ø—Ä–µ–¥–µ–ª—è–µ–º —á–∏—Å–ª–æ–≤—ã–µ –∏ –∫–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã–µ –∫–æ–ª–æ–Ω–∫–∏

In [14]:
numeric_columns = X.select_dtypes(include=["int64", "float64"]).drop(columns=columns_to_drop).columns.tolist()

In [16]:
cat_columns = X.select_dtypes(include=["object", "category"]).drop(columns=columns_to_drop).columns.tolist()

## –ö–ª–∞—Å—Å –¥–ª—è –¥—Ä–æ–ø–∞ –∫–æ–ª–æ–Ω–æ–∫

In [18]:
class ColumnDrop:
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.drop(columns=self.columns)

## –ü—Ä–∏–≤–µ–¥–µ–Ω–∏–µ —Ç–∏–ø–æ–≤

In [19]:
def to_numeric(df):
    df = df.copy()
    for col in df.columns:
        df[col] = pd.to_numeric(df[col])
    return df

In [22]:
type_converter = FunctionTransformer(to_numeric)

In [24]:
numeric_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])

In [25]:
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [29]:
column_transformer = ColumnTransformer([
    ('num', numeric_pipe, numeric_columns),
    ('cat', categorical_pipeline, cat_columns)
])

In [31]:
preprocessing_pipeline = Pipeline([
    ('dropper', ColumnDrop(columns_to_drop)),
    ('typefix', type_converter),
    ('transformer', column_transformer)
])

In [33]:
preprocessing_pipeline.fit(features_train)

In [34]:
X_train_prepared = preprocessing_pipeline.transform(features_train)
X_test_prepared = preprocessing_pipeline.transform(features_test)

In [36]:
joblib.dump(preprocessing_pipeline, 'preprocessing.pkl')

['preprocessing.pkl']

In [38]:
models = {
    'logreg': (LogisticRegression(max_iter=1000), {
        'C': [0.1, 1.0, 10.0]
    }),
    'rf': (RandomForestClassifier(), {
        'n_estimators': [50, 100],
        'max_depth': [5, 10, None]
    }),
    'gb': (GradientBoostingClassifier(), {
        'n_estimators': [50, 100],
        'learning_rate': [0.05, 0.1]
    }),
    'svc': (SVC(), {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear']
    })
}

In [40]:
best_model = None
best_score = -1
best_name = None

for name, (model, param_grid) in models.items():
    print(f"\nüîç –û–±—É—á–∞–µ–º {name}...")
    grid = GridSearchCV(model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train_prepared, labels_train)

    y_pred = grid.predict(X_test_prepared)
    acc = accuracy_score(labels_test, y_pred)
    print(f"‚úÖ Accuracy: {acc:.4f} (–ª—É—á—à–∞—è –º–æ–¥–µ–ª—å: {grid.best_params_})")

    if acc > best_score:
        best_score = acc
        best_model = grid.best_estimator_
        best_name = name


üîç –û–±—É—á–∞–µ–º logreg...
‚úÖ Accuracy: 0.9909 (–ª—É—á—à–∞—è –º–æ–¥–µ–ª—å: {'C': 10.0})

üîç –û–±—É—á–∞–µ–º rf...
‚úÖ Accuracy: 0.9977 (–ª—É—á—à–∞—è –º–æ–¥–µ–ª—å: {'max_depth': None, 'n_estimators': 50})

üîç –û–±—É—á–∞–µ–º gb...
‚úÖ Accuracy: 0.9977 (–ª—É—á—à–∞—è –º–æ–¥–µ–ª—å: {'learning_rate': 0.1, 'n_estimators': 50})

üîç –û–±—É—á–∞–µ–º svc...
‚úÖ Accuracy: 0.9886 (–ª—É—á—à–∞—è –º–æ–¥–µ–ª—å: {'C': 10, 'kernel': 'linear'})


In [41]:
joblib.dump(best_model, f'model_{best_name}.pkl')
print(f"\nüèÜ –õ—É—á—à–∞—è –º–æ–¥–µ–ª—å: {best_name.upper()} ‚Äî accuracy: {best_score:.4f}")


üèÜ –õ—É—á—à–∞—è –º–æ–¥–µ–ª—å: RF ‚Äî accuracy: 0.9977


In [42]:
full_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),  # —Ç–≤–æ–π ColumnTransformer –≤–Ω—É—Ç—Ä–∏
    ('model', best_model)                       # –Ω–∞–ø—Ä–∏–º–µ—Ä, RandomForestClassifier
])

In [43]:
full_pipeline.fit(features_train, labels_train)

In [44]:
joblib.dump(full_pipeline, 'final_pipeline.pkl')

['final_pipeline.pkl']