In [1]:
# Steps:
# 1. Import data
# 2. Data cleaning -> inconsistencies, missing values
# 3. EDA
# 4. Outlier detection and removal
# 5. Data balancing / resampling
# 6. Feature selection / dimensionality reduction
# 7. Feature scaling
# 8. Cross validation
# 9. Algorithm and hyperparameter tuning
# 10. Model building
# 11. Predictions
# 12. Deployment

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import streamlit as st

In [3]:
pip install imbalanced-learn





In [4]:
# Load the dataset from an Excel workbook. The path is relative, so the file
# must be present in the notebook's working directory.
df = pd.read_excel('Rice_Cammeo_Osmancik.xlsx')

FileNotFoundError: [Errno 2] No such file or directory: 'Rice_Cammeo_Osmancik.xlsx'

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
# (A bare `df.duplicated()` prints one boolean per row, which is unreadable.)
df.duplicated().sum()

In [None]:
# Show the column labels exactly as they were read from the file.
df.columns

In [None]:
# Remove leading/trailing whitespace from the column labels so that later
# lookups by exact name (e.g. "Class") are not broken by stray spaces.
df.columns = df.columns.str.strip()

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics for the columns.
df.describe()

In [None]:
# Step 2: Data Cleaning

# Every column except the 'Class' label is forced to a numeric dtype;
# entries that cannot be parsed as numbers are turned into NaN.
numeric_cols = [name for name in df.columns if name != "Class"]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Discard every row that contains a NaN (whether original or produced by the coercion above).
df.dropna(inplace=True)

# Confirm the resulting dtypes.
print(df.dtypes)

In [None]:
# Display the cleaned DataFrame.
df

In [None]:
# Target: the 'Class' label column. Features: every remaining column.
y = df["Class"]
x = df.drop("Class", axis="columns")

In [None]:
# Number of samples per class label.
y.value_counts()

In [None]:
# One box per feature column (wide-form data). Passing the frame via `data=`
# is explicit: the meaning of the first positional argument of `sns.boxplot`
# differs between seaborn versions.
sns.boxplot(data=x)
plt.show()

In [None]:
# Pairwise scatter plots of all numeric columns, coloured by class label.
sns.pairplot(data=df, hue="Class")
plt.show()

In [None]:
# Pairwise correlation between the feature columns, drawn as an annotated heatmap.
corr_matrix = x.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Step 4: Outlier Detection & Removal
# IQR rule: per feature column, a value is an outlier when it lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
Q1, Q3 = x.quantile(0.25), x.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Cell-wise flags: True where a value falls outside its column's fences.
outlier_mask = x.lt(lower_fence) | x.gt(upper_fence)

# Keep only rows with no flagged value, then pick the labels with the same index
# so features and target stay aligned.
rows_to_keep = ~outlier_mask.any(axis=1)
x = x.loc[rows_to_keep]
y = y.loc[x.index]


In [None]:
# Bar chart of the number of samples per class label.
sns.countplot(x = y)

In [None]:
# One box per feature column (wide-form data). Passing the frame via `data=`
# is explicit: the meaning of the first positional argument of `sns.boxplot`
# differs between seaborn versions.
sns.boxplot(data=x)
plt.show()

In [None]:
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x, y)

In [None]:
# Step 6: Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(f_classif, k=5)
x_selected = selector.fit_transform(x_resampled, y_resampled)

In [None]:
# Step 7: Feature Scaling
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_selected)

In [None]:
# Step 8: Cross Validation
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y_resampled, test_size=0.2, random_state=42)

In [None]:
# Step 9: Algorithm Selection & Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Seed the forest so the search, and the model it selects, is reproducible;
# 42 matches the random_state used for SMOTE and the train/test split.
rf = RandomForestClassifier(random_state=42)
# Candidate settings: number of trees x maximum tree depth (None = no depth limit).
param_grid = {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]}
# Exhaustive search over the grid, scored with 5-fold cross-validation on the training data.
grid_search = GridSearchCV(rf, param_grid, cv=5)
grid_search.fit(x_train, y_train)

In [None]:
# Step 10: Model Building
# Take the estimator that the grid search selected as best ...
best_rf = grid_search.best_estimator_
# ... and predict class labels for the test features.
y_pred = best_rf.predict(x_test)

In [None]:
# Step 11: Predictions & Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Pin the row/column order of the matrix to the model's class order so the
# tick labels on the heatmap are guaranteed to match the cells.
class_labels = best_rf.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
# confusion_matrix puts the true classes on rows and the predicted classes on columns.
ax.set(xlabel="Predicted class", ylabel="True class", title="Confusion matrix")
plt.show()

In [None]:
# Step 12: Deployment (Example Serialization)
import joblib

# Persist every fitted object needed to score new raw samples, in the order
# they must be applied: feature selector -> scaler -> model. The scaler and the
# model were fitted on the selector's output, so without the selector the other
# two artifacts cannot be applied to raw feature rows.
joblib.dump(best_rf, "rice_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(selector, "selector.pkl")
