In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('bmh')
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MaxAbsScaler,PowerTransformer,MinMaxScaler,RobustScaler, StandardScaler, Normalizer, QuantileTransformer

from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV
from sklearn.decomposition import PCA

# models
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier,ExtraTreesClassifier,VotingClassifier,\
GradientBoostingClassifier,StackingClassifier,VotingClassifier,HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression,Perceptron,RidgeClassifier,RidgeClassifierCV,SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score
from scipy import stats
from imblearn.over_sampling import SMOTE

import time


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [86]:
import os

# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Read the competition CSVs, using the 'ID' column as the row index.
train = pd.read_csv(
    '/kaggle/input/dry-beans-classification-iti-ai-pro-intake01/train.csv',
    index_col='ID',
)
test = pd.read_csv(
    '/kaggle/input/dry-beans-classification-iti-ai-pro-intake01/test.csv',
    index_col='ID',
)

# Keep the test IDs aside; they are needed again when the submission frame is built.
test_ID = test.index

In [87]:
# Preview the first rows of the training frame.
train.head()

In [88]:
# Preview the first rows of the test frame.
test.head()

In [89]:
# Summary statistics of the raw training columns.
train.describe()

In [90]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [91]:
# Distinct values of the target column `y` as loaded from the CSV.
train.y.unique()

In [92]:
# Number of fully duplicated rows, first for train and then for test.
for frame in (train, test):
    print(frame.duplicated().sum())

In [93]:
# Column names of the training frame.
train.columns

In [94]:
# sns.pairplot(train,hue='y')

In [95]:
# Per-column skewness of the features.
# The target `y` still holds string labels at this point (it is mapped to
# integers in a later cell), so restrict the reduction to numeric columns
# explicitly: recent pandas versions raise a TypeError instead of silently
# dropping non-numeric columns.
train.skew(axis=0, numeric_only=True)

In [96]:
# Correlation heatmap of the raw numeric features.
# `y` is still a string column here, so select numeric columns before calling
# `.corr()`; otherwise recent pandas versions fail trying to convert the
# labels to float. `select_dtypes` is used so this also runs on older pandas.
plt.figure(figsize=(10,7))
sns.heatmap(train.select_dtypes(include='number').corr(),annot=True)

In [97]:
# train['y'] = pd.factorize(train['y'])[0].reshape(-1, 1)
# Encode the bean class names as integer codes 0..6.
# Note: `.map` turns any value that is not a key of the dict into NaN, so
# running this cell a second time (on already-encoded values) blanks the column.
label_to_code = {
    'HOROZ': 0,
    'SEKER': 1,
    'DERMASON': 2,
    'SIRA': 3,
    'BARBUNYA': 4,
    'CALI': 5,
    'BOMBAY': 6,
}
train['y'] = train['y'].map(label_to_code)

In [98]:
# Re-inspect the distinct values of `y` after the label-to-integer mapping.
train.y.unique()

In [99]:
# (rows, columns) of train, then of test, one per line.
print(train.shape, test.shape, sep='\n')

# **Detecting and dealing with Outliers**

In [100]:
# One histogram (10 bins, with KDE overlay) per column of `train`,
# laid out on a 7x4 grid of subplots inside a single large figure.
plt.figure(figsize=(25, 25))
for slot, feature in enumerate(train.columns, start=1):
    plt.subplot(7, 4, slot)
    sns.histplot(train[feature], kde=True, bins=10)

In [101]:
# Box plot of every column, restricted to the rows whose target code is 0
# (the code assigned to 'HOROZ' in the label mapping), on a 7x4 subplot grid.
plt.figure(figsize=(25, 25))

# The row filter does not depend on the loop variable, so compute it once.
class_zero = train[train['y'] == 0]

for slot, feature in enumerate(train.columns, start=1):
    plt.subplot(7, 4, slot)
    # Pass the column as `x=` explicitly: seaborn deprecated positional data
    # vectors, and the meaning of the first positional parameter differs
    # between releases, so the keyword keeps the horizontal layout stable.
    sns.boxplot(x=class_zero[feature])

In [102]:
# #remove all outliers
# from scipy import stats
# z_scores = stats.zscore(train)
# abs_z_scores = np.abs(z_scores)
# filtered_entries = (abs_z_scores < 2).all(axis=1)
# train = train[filtered_entries]

In [103]:
# Detach the target: `pop` returns the 'y' column and removes it from `train`
# in place, leaving only the feature columns behind.
Y = train.pop('y')

In [104]:
# #boxcox outliers handling
# for col in train.columns:
#     train[col],fitted_lambda= stats.boxcox(train[col] ,lmbda=None)
#     test[col],fitted_lambda= stats.boxcox(test[col] ,lmbda=None)

In [105]:
# #imputing outliers using median
# for col in train.columns:
#     q1 = train[col].quantile(0.25)
#     q3 = train[col].quantile(0.75)
#     iqr = q3-q1
#     Lower_tail = q1 - 1.5 * iqr
#     Upper_tail = q3 + 1.5 * iqr
#     m = np.median(train[col])
#     for i in train[col]:
#         if i > Upper_tail or i < Lower_tail:
#                 train[col] = train[col].replace(i, m)
    
#     q1 = test[col].quantile(0.25)
#     q3 = test[col].quantile(0.75)
#     iqr = q3-q1
#     Lower_tail = q1 - 1.5 * iqr
#     Upper_tail = q3 + 1.5 * iqr
#     m = np.median(test[col])
#     for i in test[col]:
#         if i > Upper_tail or i < Lower_tail:
#                 test[col] = test[col].replace(i, m)

# **Balancing Data**

In [106]:
# Balance the dataset by oversampling selected classes with SMOTE.
# Shape and class counts are printed before and after for comparison.
print(train.shape)
print(Y.value_counts())

# Target sample counts per class code after resampling; classes not listed
# are left untouched. Codes follow the label mapping applied to `y` earlier
# (1=SEKER, 2=DERMASON, 4=BARBUNYA, 6=BOMBAY).
# `random_state` is fixed so the synthetic samples, and therefore every score
# computed downstream, are reproducible across runs.
# NOTE(review): resampling happens before the train/validation split and
# before the cross-validation folds are formed, so synthetic points can end up
# in validation folds next to the samples they were interpolated from.
# Reported CV scores are likely optimistic; confirm against a hold-out set
# that was never resampled.
oversample = SMOTE(sampling_strategy={2:3000,6:1300,4:1300,1:2000}, random_state=42)
train, Y = oversample.fit_resample(train, Y)

print(train.shape)
print(Y.value_counts())

In [107]:
# Fit a PowerTransformer (default settings) on the training features and apply
# the same fitted transform to the test features.
scaler = PowerTransformer()
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

# Re-wrap the transformed values as DataFrames with the original column names.
# No index is passed, so both frames get a fresh default index; the original
# test IDs were saved separately in `test_ID`.
train = pd.DataFrame(train_scaled, columns=train.columns)
test = pd.DataFrame(test_scaled, columns=test.columns)

In [108]:
# Summary statistics of the features after the power transform.
train.describe()

In [109]:
# plt.figure(figsize=(25, 25))
# for i, col in enumerate(list(train.columns)):
#     plt.subplot(7, 4, i+1)
#     sns.histplot(train[col], kde=True, bins=10)

In [110]:
# plt.figure(figsize=(25, 25))
# for i, col in enumerate(list(train.columns)):
#     plt.subplot(7, 4, i+1)
#     sns.boxplot(train[col])
# print(train.shape)

In [111]:
# Annotated correlation heatmap of the transformed feature columns.
plt.figure(figsize=(10, 7))
feature_corr = train.corr()
sns.heatmap(feature_corr, annot=True)

In [112]:
# Feature column names currently in `train` (the target was split off into `Y`).
train.columns

In [113]:
# train=train[['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
#        'Eccentricity', 'EquivDiameter', 'Extent',
#        'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2',
#        'ShapeFactor3', 'ShapeFactor4']]
# test=test[['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
#        'Eccentricity', 'EquivDiameter', 'Extent',
#        'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2',
#        'ShapeFactor3', 'ShapeFactor4']]

In [114]:
# Stratified 80/20 hold-out split of the features and target, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    train,
    Y,
    train_size=0.8,
    random_state=42,
    stratify=Y,
)

In [115]:
# rf=RandomForestClassifier()
# ada=AdaBoostClassifier()
# et=ExtraTreesClassifier()
# gbc=GradientBoostingClassifier()  #excluded due to very long training time and not getting best results
# hgbc=HistGradientBoostingClassifier()
# per=Perceptron()
# rc=RidgeClassifier()
# rcv=RidgeClassifierCV()
# sgd=SGDClassifier()
# dt=DecisionTreeClassifier()
# svm=SVC()
# xgb=XGBClassifier()
# catb=CatBoostClassifier(verbose=None)
# knn=KNeighborsClassifier(7)
# mlp=MLPClassifier()


# models=[rf,ada,et,hgbc,per,rc,rcv,sgd,dt,svm,xgb,knn,mlp] 
# # models=[svm,ada]

# for model in models:
#     start=time.time()
#     grid=GridSearchCV(estimator=model,param_grid={},scoring='accuracy',cv=10,verbose=1)
#     grid.fit(train,Y)
#     end = time.time()
#     print(model, '\n', grid.best_score_,'\n', round(end-start))

In [116]:
# voting_est=[('SVC',SVC()),('MultiLayer Perceptron',MLPClassifier()),('XGB',XGBClassifier())]
# vc=VotingClassifier(estimators=voting_est)


# models=[vc] 
# # models=[svm,ada]

# for model in models:
#     start=time.time()
#     grid=GridSearchCV(estimator=model,param_grid={},scoring='accuracy',cv=10,verbose=1)
#     grid.fit(train,Y)
#     end = time.time()
#     print(model, '\n', grid.best_score_,'\n', round(end-start))

In [117]:
# params={'kernel':['rbf'],
#        'C':[2,2.1,2.15,2.2,2.3],
#        'gamma':[0.25],
# #        'class_weight':[None,'balanced'],
# #        'max_iter':[-1,20,50,100,300],
#        'decision_function_shape':['ovr'],
#        'break_ties':[True]
#        }
# model = GridSearchCV(estimator=SVC(), param_grid=params, scoring='accuracy', cv=15,verbose=1)
# model.fit(train,Y)
# print(model.best_params_)
# print(model.best_estimator_)
# print(model.best_score_)
# best_model=model.best_estimator_

In [118]:
from sklearn.metrics import f1_score, make_scorer


def scoring(y_true, y_pred):
    """Return the unweighted mean of the per-class F1 scores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    float
        Arithmetic mean of the F1 score computed separately for each class.
    """
    # average=None makes f1_score return one value per class.
    per_class_f1 = f1_score(y_true, y_pred, average=None)
    return np.mean(per_class_f1)


# Wrap the metric so it can be passed as `scoring=` to model-selection tools.
Myscorer = make_scorer(scoring)

In [119]:
# Grid: a single hyper-parameter, `hidden_layer_sizes`, tried at every integer
# from 50 to 60 inclusive.
params = {'hidden_layer_sizes': list(range(50, 61))}

# 10-fold cross-validated search over a seeded MLP, scored with the custom
# mean-per-class-F1 scorer. The search is fitted on the full `train` / `Y`.
mlp = MLPClassifier(random_state=158)
model = GridSearchCV(
    estimator=mlp,
    param_grid=params,
    scoring=Myscorer,
    cv=10,
    verbose=3,
)
model.fit(train, Y)

# Report the winning parameters, the refitted estimator and its mean CV score.
for summary in (model.best_params_, model.best_estimator_, model.best_score_):
    print(summary)

# Keep the refitted best estimator for predicting on the test set.
best_model = model.best_estimator_

In [120]:
# from sklearn.inspection import permutation_importance
# perm_importance = permutation_importance(best_model, X_valid, y_valid)
# feature_names=train.columns
# sorted_idx = perm_importance.importances_mean.argsort()
# plt.barh(feature_names[sorted_idx], perm_importance.importances_mean[sorted_idx])
# plt.xlabel("Permutation Importance")

In [121]:
# Predict integer class codes for the transformed test features.
pred = best_model.predict(test)

# Pair each prediction with the test ID that was saved when the data was loaded.
submission_columns = {'ID': test_ID, 'y': pred}
predictions = pd.DataFrame(submission_columns)

In [122]:
# Translate the integer class codes back to bean class names. This dict is the
# inverse of the label-to-code mapping applied to the training target.
code_to_label = {
    0: 'HOROZ',
    1: 'SEKER',
    2: 'DERMASON',
    3: 'SIRA',
    4: 'BARBUNYA',
    5: 'CALI',
    6: 'BOMBAY',
}
predictions['y'] = predictions['y'].map(code_to_label)

In [123]:
# Display the submission frame (ID plus predicted class name).
predictions

In [124]:
# Write the submission to the working directory without the DataFrame index,
# then print a confirmation message.
submission_path = 'submission.csv'
predictions.to_csv(submission_path, index=False)
print("Your submission was successfully saved!")