In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings(action='ignore')

### Read in the CSV file

In [None]:
# Load the rice classification dataset and take a first look at it:
# leading rows, per-column missing-value counts, dtypes, and the class labels.
data=pd.read_csv(r'../input/rice-type-classification/riceClassification.csv')
print (data.head())
print (data.isna().sum())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()
print (data['Class'].unique())

### There are no missing values to deal with, so we can go on to create a function that processes the data frame

### let's look at the dataset balance

In [None]:
# Tally the rows per class label to see how balanced the dataset is.
class_counts = data['Class'].value_counts()
print(class_counts)

### The dataset is not balanced. Define a function that finds the minimum number of samples in any class
### and then downsamples every class to that number of samples

In [None]:
def trim(df, column, random_state=123):
    """Balance a data frame by downsampling every class to the rarest class's size.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    column : str
        Name of the column holding the class labels.
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` when a class has to be
        downsampled. Defaults to 123, the value that used to be hard-coded,
        so existing two-argument calls produce the same rows as before.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` grouped class by class (classes in order of first
        appearance in ``column``), each class holding exactly as many rows as
        the smallest class, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``column`` contains no labelled rows, so no minimum class size
        can be determined.
    """
    class_counts = df[column].value_counts()
    if class_counts.empty:
        raise ValueError(
            "cannot balance: column {!r} has no labelled rows".format(column)
        )
    min_samples = class_counts.min()  # least samples in any class
    print ('the minimum number of samples in any class is ', min_samples)

    groups = df.groupby(column)
    sample_list = []
    for label in df[column].unique():
        group = groups.get_group(label)
        # Only classes larger than the minimum need sampling; a class that is
        # already at the minimum size is kept whole.
        if len(group) > min_samples:
            group = group.sample(n=min_samples, replace=False, random_state=random_state)
        sample_list.append(group)

    # A single reset at the end gives the combined frame a clean 0..n-1 index.
    return pd.concat(sample_list, axis=0).reset_index(drop=True)

In [None]:
def preprocess(df):
    """Turn the raw data frame into scaled, class-balanced train/test splits.

    Steps: drop the ``id`` column, downsample every class to the size of the
    smallest one with ``trim``, separate the ``Class`` target from the
    features, split 80/20, and standardise the features with a scaler fitted
    on the training portion only.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing an ``id`` column, a ``Class`` target column and
        the feature columns. It is not modified.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the ``X`` parts are
        DataFrames of standardised features that keep their original index
        and column names, the ``y`` parts are the matching ``Class`` Series.
    """
    # drop() returns a new frame, so the caller's frame is left untouched
    df = df.drop(['id'], axis=1)
    # balance the data set by having samples in each class equal to smallest samples for any class
    df = trim(df, 'Class')
    print (df['Class'].value_counts())
    # partition into target y and data X
    y = df['Class']
    X = df.drop(['Class'], axis=1)
    # Split into train and test sets. Stratifying on y keeps the class balance
    # established above in both partitions; a plain random split would let
    # the class proportions drift apart again.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, shuffle=True, random_state=1, stratify=y
    )
    # scale the X data
    scaler = StandardScaler()
    scaler.fit(X_train)  # fit only on train data so no test statistics leak in
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test
    


In [None]:
# Run the whole preprocessing pipeline on the loaded data, then report how
# many rows ended up in each partition.
(X_train, X_test, y_train, y_test) = preprocess(data)
train_rows, test_rows = len(X_train), len(X_test)
print('X_train length: ', train_rows, '  X_test length: ', test_rows)

In [None]:
# Show the scaled training features as this cell's output.
X_train

In [None]:
# Class distribution of the training target after balancing and splitting.
train_class_counts = y_train.value_counts()
print(train_class_counts)

### Training

In [None]:
# One seed shared by every estimator that accepts random_state here, so that
# re-running the notebook reproduces the same fitted models and scores.
MODEL_SEED = 1

# Candidate classifiers, keyed by the (space-padded) label used when printing.
# KNeighborsClassifier takes no random_state; LogisticRegression is left at
# its defaults (NOTE(review): its default solver is presumably deterministic -
# confirm against the installed scikit-learn version).
models = {
    "Logistic Regression": LogisticRegression(),
    "      Decision Tree": DecisionTreeClassifier(random_state=MODEL_SEED),
    "     Neural Network": MLPClassifier(random_state=MODEL_SEED),
    "      Random Forest": RandomForestClassifier(random_state=MODEL_SEED),
    "  Gradient Boosting": GradientBoostingClassifier(random_state=MODEL_SEED),
    " AdaBoostClassifier": AdaBoostClassifier(random_state=MODEL_SEED),
    "KNeighborsClassifier": KNeighborsClassifier()
}

# Fit every candidate on the same scaled training split.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

### Test-set results by model

In [None]:
# Score each fitted model on the held-out test split and report its accuracy
# as a percentage.
for name in models:
    model = models[name]
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(name + " Accuracy: {:.2f}%".format(100 * acc))

In [None]:
# Report each fitted model's F1 score on the held-out test split, treating
# the class labelled 1 as the positive class.
for name in models:
    model = models[name]
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, pos_label=1)
    print(name + " F1-Score: {:.5f}".format(f1))