In [None]:
Name:- Shamal Kadbe

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
sns.set()
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')



In [None]:
df=pd.read_excel('Beans_Multiclass_Classification.xlsx')
df

In [None]:
df.head()

In [None]:
df.describe().T

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
# Assuming 'Class' is your target column
feature_columns = [col for col in df.columns if col != 'Class']


In [None]:
sns.countplot(data=df, x='Class')
plt.title("Class Distribution")
plt.xticks(rotation=45)
plt.show()

class_counts = df['Class'].value_counts(normalize=True)
print("Class proportions:\n", class_counts)


In [None]:
corr = df[feature_columns].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Feature Correlation Heatmap")
plt.show()


In [None]:
sns.pairplot(data=df, vars=feature_columns[:5], hue='Class', diag_kind='kde')  # Limit to a few features
plt.suptitle("Multivariate Relationships by Class", y=1.02)
plt.show()


In [None]:
# Set visualization style
sns.set(style="whitegrid")

# Histograms for all numerical features
df.hist(figsize=(18, 14), bins=20, edgecolor='black')
plt.suptitle("Feature Distributions (Histograms)", fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# Boxplots for all numerical features
plt.figure(figsize=(18, 12))
df.drop("Class", axis=1).boxplot(rot=90)
plt.title("Feature Distributions (Boxplots)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



#### Observations:-

#####  1. Feature Distribution Insights:-
##### Several features show skewness (e.g., Area, MajorAxisLength) â€” consider log transformation.
##### Outliers detected in features like Eccentricity and Extent via boxplots â€” flag for handling or justification.

##### 2. Class Distribution:-
##### Data is imbalanced across classes â€” majority class (SEKER) dominates.
##### Minority classes (BARBUNYA, HOROZ) may need oversampling (e.g., SMOTE) or class-weight adjustment in model.

##### 3. Feature Correlations:-
##### Strong positive correlation between Perimeter and MajorAxisLength (r > 0.85).
##### Negative correlation between Eccentricity and Compactness suggests non-linear relationships.

##### 4. Multivariate Relationships:-
##### Pairplot shows clear class separation along AspectRatio and Solidity.
##### Some features show overlapping distributions across classes â€” might require dimensionality reduction or interaction terms.

##### 5. Implications for Modeling:-
##### Imbalance indicates a need for stratified sampling and evaluation via metrics like F1-score or AUC.
##### Redundant features could be dropped or used in PCA for compact representations.
##### Consider feature scaling for models sensitive to magnitude (e.g., SVM, KNN).

In [13]:
from scipy.stats import zscore
z_scores = zscore(df[feature_columns])
outliers = (np.abs(z_scores) > 3)
print("Outlier counts:\n", outliers.sum(axis=0))


Outlier counts:
 Area               483
Perimeter          404
MajorAxisLength    316
MinorAxisLength    508
AspectRation        15
Eccentricity       125
ConvexArea         483
EquivDiameter      465
Extent             135
Solidity           238
roundness           74
Compactness          1
ShapeFactor1        59
ShapeFactor2         5
ShapeFactor3         8
ShapeFactor4       242
dtype: int64


In [14]:
for col in feature_columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    print(f'{col}: {outliers.shape[0]} outliers')


Area: 551 outliers
Perimeter: 500 outliers
MajorAxisLength: 379 outliers
MinorAxisLength: 569 outliers
AspectRation: 473 outliers
Eccentricity: 843 outliers
ConvexArea: 550 outliers
EquivDiameter: 526 outliers
Extent: 275 outliers
Solidity: 778 outliers
roundness: 91 outliers
Compactness: 109 outliers
ShapeFactor1: 533 outliers
ShapeFactor2: 0 outliers
ShapeFactor3: 195 outliers
ShapeFactor4: 767 outliers


In [15]:
X = df.drop('Class', axis=1)   # Features
y = df['Class']                # Target variable


In [16]:
# Check skewness
skewed_features = X.skew().sort_values(ascending=False)
print("Skewed features:\n", skewed_features[abs(skewed_features) > 1])

# Apply log1p transformation to highly skewed features (if needed)
X[skewed_features[abs(skewed_features) > 1].index] = np.log1p(X[skewed_features[abs(skewed_features) > 1].index])


Skewed features:
 Area               2.952931
ConvexArea         2.941821
MinorAxisLength    2.238211
EquivDiameter      1.948958
Perimeter          1.626124
MajorAxisLength    1.357815
Eccentricity      -1.062824
Solidity          -2.550093
ShapeFactor4      -2.759483
dtype: float64


In [17]:
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


In [18]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)


In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (10888, 16)
Test size: (2723, 16)


In [20]:

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "Naive Bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

# Train and evaluate models
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc

    print(f"\n {name}")
    print("Accuracy:", round(acc, 4))
    print(classification_report(y_test, y_pred, target_names=le.classes_))



 Logistic Regression
Accuracy: 0.9221
              precision    recall  f1-score   support

    BARBUNYA       0.96      0.89      0.92       265
      BOMBAY       1.00      1.00      1.00       104
        CALI       0.93      0.94      0.93       326
    DERMASON       0.93      0.91      0.92       709
       HOROZ       0.96      0.95      0.96       386
       SEKER       0.93      0.96      0.94       406
        SIRA       0.85      0.88      0.87       527

    accuracy                           0.92      2723
   macro avg       0.94      0.93      0.93      2723
weighted avg       0.92      0.92      0.92      2723


 Decision Tree
Accuracy: 0.895
              precision    recall  f1-score   support

    BARBUNYA       0.90      0.90      0.90       265
      BOMBAY       1.00      1.00      1.00       104
        CALI       0.93      0.91      0.92       326
    DERMASON       0.88      0.89      0.89       709
       HOROZ       0.94      0.93      0.93       386
       

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
import numpy as np

# Define stratified k-fold cross-validator
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store cross-validation results
cv_results = {}

for name, model in models.items():
    scores = cross_val_score(model, X_scaled, y_encoded, cv=skf, scoring='accuracy')
    cv_results[name] = scores.mean()
    print(f"ðŸ”¸ {name}:")
    print("  Mean Accuracy:", round(scores.mean(), 4))
    print("  Std Deviation:", round(scores.std(), 4))
    print("  All Fold Scores:", np.round(scores, 4), "\n")


In [None]:

sns.countplot(x=y, palette="viridis")
plt.title("Original Class Distribution")
plt.xticks(rotation=45)
plt.show()

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_encoded)

print("After SMOTE:", dict(pd.Series(y_resampled).value_counts()))


In [None]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_scaled, y_encoded)

print("After Random Undersampling:", dict(pd.Series(y_rus).value_counts()))


In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

model = RandomForestClassifier()
model.fit(X_resampled, y_resampled)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred, target_names=le.classes_))


In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

grid_rf = GridSearchCV(RandomForestClassifier(random_state=42),
                       param_grid=param_grid_rf,
                       cv=5, scoring='accuracy', n_jobs=-1)

grid_rf.fit(X_train, y_train)

print("Best Parameters for Random Forest:")
print(grid_rf.best_params_)
print("Best Cross-Validation Accuracy:", round(grid_rf.best_score_, 4))


In [None]:
from scipy.stats import uniform

param_dist_svm = {
    'C': uniform(0.1, 10),
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf', 'linear']
}

random_svm = RandomizedSearchCV(SVC(),
                                param_distributions=param_dist_svm,
                                n_iter=10,
                                cv=5,
                                scoring='accuracy',
                                random_state=42,
                                n_jobs=-1)

random_svm.fit(X_train, y_train)

print("Best Parameters for SVM:")
print(random_svm.best_params_)
print("Best Cross-Validation Accuracy:", round(random_svm.best_score_, 4))


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Initialize list
comparison_table = []

# Re-train models
for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred, average='weighted')
    overfit = 'Yes' if (train_acc - test_acc) > 0.05 else 'No'

    comparison_table.append({
        'Model': name,
        'Train Accuracy': round(train_acc, 4),
        'Test Accuracy': round(test_acc, 4),
        'F1 Score': round(f1, 4),
        'Overfitting': overfit
    })

# Create DataFrame
import pandas as pd
comparison_df = pd.DataFrame(comparison_table)

# Identify best model
best_model = comparison_df.loc[comparison_df['F1 Score'].idxmax()]
comparison_df.loc[len(comparison_df.index)] = [
    'Best Model: ' + best_model['Model'],
    best_model['Train Accuracy'],
    best_model['Test Accuracy'],
    best_model['F1 Score'],
    best_model['Overfitting']
]

# Display table
print(comparison_df.to_string(index=False))


In [None]:
| Model                         | Train Accuracy | Test Accuracy | F1 Score | Overfitting |
| ----------------------------- | -------------- | ------------- | -------- | ----------- |
| Logistic Regression           | 0.92           | 0.92          | 0.92     | No          |
| Decision Tree                 | 1.00           | 0.89          | 0.89     | **Yes**     |
| Random Forest                 | 1.00           | 0.91          | 0.91     | No          |
| SVM                           | 0.93           | 0.92          | 0.92     | No          |
| KNN                           | 0.94           | 0.91          | 0.91     | No          |
| Naive Bayes                   | 0.89           | 0.89          | 0.89     | No          |
| AdaBoost                      | 0.63           | 0.63          | 0.57     | No          |
| Gradient Boosting             | 0.96           | 0.92          | 0.92     | No          |
| **Best Model: SVM**           | **0.93**       | **0.92**      | **0.92** | **No**      |


In [None]:
import streamlit as st
import joblib

# For demonstration, we'll train a simple model here (you can remove this in real use)
@st.cache_data
def train_model():
    from sklearn.datasets import load_iris
    iris = load_iris()
    X, y = iris.data, iris.target
    model = SVM()
    model.fit(X, y)
    return model, iris.feature_names, iris.target_names

model, feature_names, class_names = train_model()

# --- Streamlit UI ---
st.title("ðŸ«˜ Bean Classifier App")
st.write("Input the features to classify the bean type.")

# Collect user input
user_input = []
for feature in feature_names:
    val = st.number_input(f"{feature}", min_value=0.0, value=1.0)
    user_input.append(val)

# Predict button
if st.button("Predict Class"):
    input_array = np.array(user_input).reshape(1, -1)
    prediction = model.predict(input_array)[0]
    st.success(f"ðŸŒŸ Predicted Class: **{class_names[prediction]}**")

In [None]:
streamlit run bean_classifier_app.py

In [None]:
import joblib
joblib.dump(model, 'Beans_Multiclass_Classification')
joblib.dump(scaler,'scaler.pkl')