In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from hyperopt import fmin,tpe,hp
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import pkg_resources


In [3]:
#Reading the data from CSV
def read_csv(file_path):
    return pd.read_csv(file_path)

#Creating features
def create_features(data):
    #No feature creation for this
    return data

#Training a classifier model
def train_classifier(data):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 40)
    model = RandomForestClassifier()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test,Y_pred)
    return model, accuracy

#Hyperparameter tuning with Hyperopt
def objective(params):
    model = RandomForestClassifier(**params)
    score = cross_val_score(model, X, Y, cv=5).mean()
    return -score #Minimize negative accuracy

#Evaluating the model on the test data
def evaluate_model(model, X_test, Y_test):
    Y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    return accuracy
    

In [4]:
import os 
os.getcwd()

'C:\\Users\\Abcom\\test'

In [5]:
file_path = "Iris.csv"
data = read_csv(file_path)

In [6]:
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [7]:
# if __name__ == "__main__":
#Load data 
file_path = "Iris.csv"
data = read_csv(file_path)

#Create Feature
data = create_features(data)

#Split data into features and target
X = data.drop('Species',axis = 1)
Y = data['Species']

#Split data into training and test sets 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 40)

#Define the Pipeline
pipeline = Pipeline([
    ('preprocessor' , ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), X.columns)
        ],
        remainder = 'passthrough'
    )),
    ('classifier' , RandomForestClassifier())
])

#Train the model
pipeline.fit(X_train, Y_train)

#Evaluate the model
Y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Model accuracy on test set: {accuracy}")

#Hyperparameter tuning using Tree of Parzen Estimators (TPE)
space = {
    'n_estimators' : hp.choice('n_estimators', range(10, 101)),
    'max_depth' : hp.choice('max_depth', range(1, 21))
}

best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals= 100)
                               


Model accuracy on test set: 1.0
100%|█████████████████████████████████████████████████████████████████████████| 100/100 [00:46<00:00,  2.15trial/s, best loss: -1.0]


In [8]:
best_params

{'max_depth': np.int64(9), 'n_estimators': np.int64(64)}