In [1]:
import pandas as pd
from sklearn.preprocessing import PowerTransformer,MinMaxScaler,FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
dataframe_date_fruits=pd.read_excel(r"D:\datasets2\Date_Fruit_Datasets\Date_Fruit_Datasets.xlsx")

In [3]:
Target_feature=dataframe_date_fruits['Class']
dataframe_date_fruits_independent_features=dataframe_date_fruits.drop('Class',axis=1)

In [4]:
Target_feature

0      BERHI
1      BERHI
2      BERHI
3      BERHI
4      BERHI
       ...  
893    SOGAY
894    SOGAY
895    SOGAY
896    SOGAY
897    SOGAY
Name: Class, Length: 898, dtype: object

In [5]:
Target_feature.value_counts()

Class
DOKOL     204
SAFAVI    199
ROTANA    166
DEGLET     98
SOGAY      94
IRAQI      72
BERHI      65
Name: count, dtype: int64

In [6]:
encoder=LabelEncoder()
Target_feature=encoder.fit_transform(Target_feature)

In [7]:
label_map = dict(zip(range(len(encoder.classes_)), encoder.classes_))
label_map

{0: 'BERHI',
 1: 'DEGLET',
 2: 'DOKOL',
 3: 'IRAQI',
 4: 'ROTANA',
 5: 'SAFAVI',
 6: 'SOGAY'}

In [8]:
from collections import Counter
print(Counter(Target_feature))

Counter({2: 204, 5: 199, 4: 166, 1: 98, 6: 94, 3: 72, 0: 65})


In [9]:
def Multicollinearity_removing(dataset,threshold):
    col_corr=set()
    corr_matrix=dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (corr_matrix.iloc[i,j]) >threshold:
                colname=corr_matrix.columns[i]
                col_corr.add(colname)
    return col_corr

In [10]:
corr_features=Multicollinearity_removing(dataframe_date_fruits_independent_features,0.7)
dataframe_date_fruits_after_removing_Multicollinearity=dataframe_date_fruits_independent_features.drop(corr_features,axis=1)
dataframe_date_fruits_after_removing_Multicollinearity

Unnamed: 0,AREA,ECCENTRICITY,SOLIDITY,EXTENT,ASPECT_RATIO,ROUNDNESS,SHAPEFACTOR_2,MeanRR,SkewRR,EntropyRR
0,422163,0.6373,0.9947,0.7831,1.2976,0.9374,0.0015,117.4466,-0.5661,-59191263232
1,338136,0.5690,0.9974,0.7795,1.2161,0.9773,0.0018,100.0578,-0.2328,-34233065472
2,526843,0.6494,0.9962,0.7657,1.3150,0.9446,0.0014,130.9558,-0.7152,-93948354560
3,416063,0.6266,0.9948,0.7759,1.2831,0.9458,0.0016,86.7798,0.4584,-32074307584
4,347562,0.6465,0.9908,0.7569,1.3108,0.9358,0.0017,105.5484,-0.3552,-39980974080
...,...,...,...,...,...,...,...,...,...,...
893,255403,0.7241,0.9785,0.7269,1.4499,0.8658,0.0019,98.1696,-0.2737,-25296416768
894,365924,0.7644,0.9466,0.6695,1.5510,0.6475,0.0015,93.6238,-0.3319,-31605219328
895,254330,0.8126,0.9925,0.7240,1.7159,0.8609,0.0017,88.2526,0.1838,-22242772992
896,238955,0.7873,0.9604,0.6954,1.6220,0.8263,0.0018,102.4622,0.3400,-26048595968


In [11]:
select=SelectFromModel(RandomForestClassifier(n_estimators=100,random_state=0))
select.fit(dataframe_date_fruits_after_removing_Multicollinearity,Target_feature)

support=select.get_support()

dataframe_date_fruits_after_features_engineering=dataframe_date_fruits_after_removing_Multicollinearity.loc[:,support].columns.tolist()

print(dataframe_date_fruits_after_features_engineering)

print(RandomForestClassifier(n_estimators=100,random_state=0).fit(dataframe_date_fruits_after_removing_Multicollinearity,Target_feature).feature_importances_)

['AREA', 'SHAPEFACTOR_2', 'MeanRR', 'EntropyRR']
[0.19766873 0.05549838 0.05501876 0.0321474  0.05939938 0.08801203
 0.12904975 0.15297646 0.07686728 0.15336183]


In [12]:
dataframe_date_fruits_after_features_engineering=dataframe_date_fruits_after_removing_Multicollinearity.loc[:,support]
dataframe_date_fruits_after_features_engineering

Unnamed: 0,AREA,SHAPEFACTOR_2,MeanRR,EntropyRR
0,422163,0.0015,117.4466,-59191263232
1,338136,0.0018,100.0578,-34233065472
2,526843,0.0014,130.9558,-93948354560
3,416063,0.0016,86.7798,-32074307584
4,347562,0.0017,105.5484,-39980974080
...,...,...,...,...
893,255403,0.0019,98.1696,-25296416768
894,365924,0.0015,93.6238,-31605219328
895,254330,0.0017,88.2526,-22242772992
896,238955,0.0018,102.4622,-26048595968


In [13]:
normalizer = PowerTransformer(method='box-cox')
scaler = MinMaxScaler(feature_range=(1, 2))
#outlier_removal = FunctionTransformer(lambda x : x[(np.abs(stats.zscore(x)) < 3)])
smote = SMOTE(random_state=42)
dataframe_date_fruits_after_features_engineering, Target_feature = smote.fit_resample(dataframe_date_fruits_after_features_engineering, Target_feature)
prepr_pipe = Pipeline([
                 ('scaler', scaler), 
                 ('normalizer', normalizer),
                ])

In [14]:
print(dataframe_date_fruits_after_features_engineering.shape)
print(Counter(Target_feature))

(1428, 4)
Counter({0: 204, 1: 204, 2: 204, 3: 204, 4: 204, 5: 204, 6: 204})


In [15]:
prepr_pipe.fit_transform(dataframe_date_fruits_after_features_engineering)

array([[ 1.02625177, -0.7658461 ,  0.53322739, -1.3166026 ],
       [ 0.11635   ,  0.51485764, -0.14397902, -0.12197684],
       [ 2.32349766, -1.40130096,  1.11694194, -2.39173035],
       ...,
       [-0.53482266,  0.0593582 ,  1.00681257, -0.78730026],
       [-0.40579438,  0.16161616, -0.22227718,  0.24183819],
       [-0.83629096,  1.03452686, -0.42437408,  0.67162354]])

In [16]:
with open('pipeline.pkl', 'wb') as file:
    pickle.dump(prepr_pipe, file)

In [17]:
X_train,X_test,y_train,y_test =train_test_split(dataframe_date_fruits_after_features_engineering,Target_feature,random_state=42,test_size=0.3,stratify=Target_feature)

In [18]:
def train_cv(model, X_train, y_train, params, n_splits=10):
    kf = KFold(n_splits=n_splits, random_state=0, shuffle=True)

    cv = RandomizedSearchCV(model,
                            params,
                            cv=kf,
                            scoring='roc_auc',
                            return_train_score=True,
                            n_jobs=-1,
                            verbose=True,
                            random_state=1
                            )
    cv.fit(X_train, y_train)

    print('Best params', cv.best_params_)
    return cv

In [19]:
kf = KFold(n_splits=10, random_state=0, shuffle=True)
rs_parameters = {
    'n_estimators': [10,20,30,40,50,60,70,80,90,100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],
    'min_samples_split': (2,3,4,5),
    'min_samples_leaf': [1, 2, 3]
}

# Corrected the model instantiation
rf = RandomForestClassifier(random_state=0, n_jobs=-1)

model_cv_rf = train_cv(rf, X_train, y_train, rs_parameters)
best_estimator_rf = model_cv_rf.best_estimator_

Fitting 10 folds for each of 10 candidates, totalling 100 fits




Best params {'n_estimators': 60, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_depth': 3, 'criterion': 'gini'}


In [20]:
ypred = best_estimator_rf.predict(X_test)
ypred_proba = best_estimator_rf.predict_proba(X_test)

In [21]:
with open('model_random_forest.pkl', 'wb') as file:
    pickle.dump(best_estimator_rf, file)

In [22]:
print(roc_auc_score(y_test, ypred_proba, multi_class='ovr'))
print(classification_report(y_test, ypred))

0.9449136091590136
              precision    recall  f1-score   support

           0       0.68      0.28      0.40        61
           1       0.57      0.77      0.66        61
           2       0.89      0.93      0.91        61
           3       0.67      0.97      0.79        62
           4       0.75      0.74      0.74        61
           5       0.98      0.92      0.95        61
           6       0.65      0.55      0.60        62

    accuracy                           0.74       429
   macro avg       0.74      0.74      0.72       429
weighted avg       0.74      0.74      0.72       429



In [None]:
git init
git add README.md
git commit -m "first commit"
git branch -M main
git remote add origin https://github.com/sahilgarg2814/fastapi_assignment.git
git push -u origin main