# CODTECH INTERNSHIP
## Task-2: PREDICTIVE ANALYSIS USING MACHINE LEARNING
## Tasks to be performed:
### 1. BUILD A MACHINE LEARNING MODEL (E.G., REGRESSION OR CLASSIFICATION) TO PREDICT OUTCOMES BASED ON A DATASET.
### 2. DELIVERABLE: A NOTEBOOK DEMONSTRATING FEATURE SELECTION, MODEL TRAINING, AND EVALUATION.

# Import Libraries

In [29]:
# 1. Import Required Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import tree

# Load Dataset from Seaborn

In [2]:
# 2. Load Dataset
# Pull the "iris" example dataset through seaborn and preview its first five rows.
df = sns.load_dataset(name="iris")
print(df.head(n=5))

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


# EDA

In [7]:
# Dataset dimensions as a (rows, columns) tuple.
print((len(df.index), len(df.columns)))

(150, 5)


In [3]:
# Preview the last five rows of the table.
print(df.tail(n=5))

     sepal_length  sepal_width  petal_length  petal_width    species
145           6.7          3.0           5.2          2.3  virginica
146           6.3          2.5           5.0          1.9  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica
149           5.9          3.0           5.1          1.8  virginica


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
print(df.describe(percentiles=[0.25, 0.5, 0.75]))

       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000


In [5]:
# Show the schema summary: column names, non-null counts, dtypes and memory usage.
# DataFrame.info() writes the summary to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line after it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None


In [8]:
# 3. Encode Target Variable
# Learn the species labels, then overwrite the column in place with their
# integer codes (0, 1, 2). `le` is kept so the codes can be mapped back later.
le = LabelEncoder()
le.fit(df['species'])
df['species'] = le.transform(df['species'])

In [9]:
# 4. Define Features and Target
# X: every column except the label; y: the (encoded) species label column.
X, y = df.drop(columns="species"), df["species"]

# Feature Selection

In [17]:
# 5. Feature Selection using ANOVA F-value
# Keep the 3 features with the highest ANOVA F-score against the class label.
#
# The selector is fitted on training rows only. Fitting it on the full dataset
# would let the held-out test labels influence which features are chosen
# (data leakage) and bias the evaluation. The call below uses the same
# test_size and random_state as the train/test split in step 6, and X has the
# same number of rows as X_new, so both calls pick the same training rows.
fit_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_fit, y_fit = fit_split[0], fit_split[2]  # training features / training labels

selector = SelectKBest(score_func=f_classif, k=3)
selector.fit(X_fit, y_fit)
mask = selector.get_support()  # boolean mask over X's columns
selected_features = X.columns[mask]
print("Selected Features:", list(selected_features))

# Restrict the full feature table to the chosen columns; it is split afterwards.
X_new = X[selected_features]

Selected Features: ['sepal_length', 'petal_length', 'petal_width']


In [18]:
# Get selected feature names
# Look the kept columns up by position (integer indices from the fitted selector).
selected_features = X.columns[selector.get_support(indices=True)]
print("Selected Features:", selected_features.tolist())

Selected Features: ['sepal_length', 'petal_length', 'petal_width']


# Splitting the dataset

In [19]:
# 6. Split Data
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    random_state=42,
    test_size=0.2,
)

# KNN Model

In [27]:
# 7. Train KNN Model
# k-nearest-neighbours classifier with k = 5, fitted on the training split.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X=X_train, y=y_train)

In [28]:
# 8. Predict & Evaluate
# Score the fitted KNN model on the held-out test split: overall accuracy,
# per-class precision/recall/F1, and the confusion matrix.
y_pred = knn_model.predict(X=X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


# Random Forest Model

In [30]:
# 9. Train a Random Forest model
# Ensemble of 100 decision trees; the fixed seed keeps the fitted forest repeatable.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X=X_train, y=y_train)

In [31]:
# 10. Predict and Evaluate
# Score the fitted Random Forest on the held-out test split.
# Note: this rebinds y_pred, replacing the KNN predictions from step 8.
y_pred = rf_model.predict(X=X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

