# Linear Regression and kNN Regression - Iris Dataset

## Import Libraries + Data

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns

In [0]:
# Load the iris dataset via seaborn and preview the first rows.
df = sns.load_dataset(name='iris')
df.head(n=5)

## Null values, Duplicates, Encoding

In [0]:
# Count missing values per column.
df.isna().sum()

In [0]:
# Count rows that are exact repeats of an earlier row.
df.duplicated(keep='first').sum()

In [0]:
# Keep the first occurrence of each duplicated row, then confirm none remain.
df = df.drop_duplicates(keep='first')
df.duplicated(keep='first').sum()

In [0]:
# Number of rows per species.
df.loc[:, 'species'].value_counts()

In [0]:
# Encode the categorical `species` column as integer codes.
# The choice of encoder depends on the kind of variable, not on class balance:
#   - label encoding   > target labels, or categories with a natural order
#   - one-hot encoding > nominal (unordered) categories used as model features
# After fitting, le.classes_ holds the code-to-name mapping.

from sklearn.preprocessing import LabelEncoder

# Only encode while the column is still non-numeric, so re-running this cell
# does not refit `le` on the integer codes and lose the species names.
if not pd.api.types.is_numeric_dtype(df['species']):
    le = LabelEncoder()
    df['species'] = le.fit_transform(df['species'])
df.head()

In [0]:
# Number of rows per species value, re-checked after the encoding step.
df['species'].value_counts(dropna=True)

## Model fitting, Evaluation

In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Regression task: predict sepal_length from the remaining columns.
y = df['sepal_length']

# `species` is a nominal category. Feeding it to the models as one numeric
# column would impose an artificial order and spacing between the classes,
# so it is one-hot encoded instead. All dummy columns are kept (no drop_first)
# so every pair of species is equally far apart for the distance-based kNN
# model that reuses these features.
X = pd.get_dummies(df.drop('sepal_length', axis = 1), columns=['species'], dtype=float)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scale the data: statistics come from the training rows only and are reused
# for the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# calculate MSE, R-squared on the held-out test rows
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse}')
print(f'R-squared: {r2}')



In [0]:
# knn model: fit a 5-neighbour regressor on the same split and report the
# same two metrics as the previous model
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'MSE: {mse}')
print(f'R-squared: {r2}')

Thus, comparing the MSE and R-squared values printed above, Linear Regression is the better model on this test split.

In [0]:
# TODO: hyperparameter tuning
# TODO: cross-validation

# Random Forest Classifier, Cross-Validation, Hyperparameter Tuning - Iris Dataset

In [0]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

In [0]:
# Step 2: Load Dataset
# Features and integer targets go into separate frames.
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.DataFrame({'species': iris.target})

# Combine for exploration, replacing the integer codes with readable names
species_by_code = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df = pd.concat([X, y], axis=1).assign(
    species=lambda frame: frame['species'].map(species_by_code)
)
print(df.head())

In [0]:
# Step 3: Exploratory Data Analysis (EDA)
# Summary statistics of the numeric columns and the number of rows per class.
summary_stats = df.describe()
class_counts = df['species'].value_counts()
print(summary_stats)
print(class_counts)

# Pairplot to visualize class separability
sns.pairplot(data=df, hue='species')
plt.show()

# Correlation heatmap (numeric columns only)
feature_corr = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [0]:
# Step 4: Train-Test Split
# Work from the raw arrays; 20% of the rows are held out with a fixed seed.
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 5: Feature Scaling
# Scaling statistics are learned from the training rows only and reused for
# the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 6: Model Training (Random Forest)
rf = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Step 7: Model Evaluation
y_pred = rf.predict(X_test_scaled)
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_confusion = confusion_matrix(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred, target_names=iris.target_names)

print("\nAccuracy:", baseline_accuracy)
print("\nConfusion Matrix:\n", baseline_confusion)
print("\nClassification Report:\n", baseline_report)

# Step 8: Cross Validation
# 5-fold scores on the training portion, averaged into a single number.
cv_scores = cross_val_score(rf, X_train_scaled, y_train, cv=5)
print("\nCross-validation Accuracy:", cv_scores.mean())

# Step 9: Hyperparameter Tuning
# Exhaustive search over the grid below, scored with 5-fold cross validation.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[2, 4, 6, None],
    min_samples_split=[2, 5, 10],
)
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
).fit(X_train_scaled, y_train)

best_model = grid_search.best_estimator_
print("\nBest Parameters:", grid_search.best_params_)

# Step 10: Final Evaluation
# Score the tuned model on the held-out test rows.
final_pred = best_model.predict(X_test_scaled)
final_accuracy = accuracy_score(y_test, final_pred)
print("\nFinal Model Accuracy:", final_accuracy)




```
# Step 11: Save Model
# The model was trained on standardized features, so the fitted scaler is
# saved next to it; without it a fresh session could not preprocess raw input
# the same way before predicting.
joblib.dump(best_model, 'iris_model.pkl')
joblib.dump(scaler, 'iris_scaler.pkl')
print("\nModel saved as iris_model.pkl")
print("Scaler saved as iris_scaler.pkl")

# Step 12: Load and Test Saved Model
# NOTE: joblib.load unpickles arbitrary objects; only load files you created
# or trust.
loaded_model = joblib.load('iris_model.pkl')
loaded_scaler = joblib.load('iris_scaler.pkl')
# Start from a raw (unscaled) test row so the whole saved preprocessing +
# model chain is exercised, exactly as it would be for new data.
sample = loaded_scaler.transform(X_test[0].reshape(1, -1))
prediction = loaded_model.predict(sample)
print("\nPredicted class for sample:", iris.target_names[prediction[0]])
```