Title: Introduction to Scikit-Learn & Machine Learning Models

Task 1: Installing and Setting Up Scikit-Learn

In [1]:
pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



Task 2: Loading Built-in Datasets

In [2]:
from sklearn import datasets

# Load scikit-learn's built-in Iris dataset.
iris = datasets.load_iris()

# Preview the dataset in a single call, one item per line:
#   1. the first five rows of the feature matrix
#   2. the feature (column) names
#   3. the target (class) names
print(iris.data[:5], iris.feature_names, iris.target_names, sep="\n")



[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']


Task 3: Understanding Data Structures

In [3]:
from sklearn import datasets
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# Reload Iris so this cell does not rely on state from an earlier cell.
iris = datasets.load_iris()

# Build three containers from the same feature matrix:
#   1. a NumPy array
#   2. a pandas DataFrame whose columns carry the feature names
#   3. a SciPy CSR (compressed sparse row) matrix
iris_array = np.array(iris.data)
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_csr = csr_matrix(iris.data)

# Show the first five rows of each container, in the order they were built.
print("Array:\n", iris_array[:5])
print("\nDataFrame:\n", iris_df.head(5))
print("\nCSR Matrix:\n", iris_csr[:5])


Array:
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]

DataFrame:
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2

CSR Matrix:
 <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 20 stored elements and shape (5, 4)>
  Coords	Values
  (0, 0)	5.1
  (0, 1)	3.5
  (0, 2)	1.4
  (0, 3)	0.2
  (1, 0)	4.9
  (1, 1)	3.0
  (1, 2)	1.4
  (1, 3)	0.2
  (2, 0)	4.7
  (2, 1)	3.2
  (2, 2)	1.3
  (2, 3)	0.2
  (3, 0)	4.6
  (3, 1)	3.1
  (3, 2)	1.5
  (3, 3)	0.2
  (4, 0)	5.0
  (4, 1)	3.6
  (4, 2)	1.4
  (4, 3)	0.2


Title: Building a Simple ML Model in Scikit-Learn

Task 1: Simple Linear Regression
Implement linear regression with a small dataset

In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Step 1: Build a tiny five-row dataset with one feature column and one target column
data = {'X': [1, 2, 3, 4, 5], 'y': [2, 4, 5, 4, 5]}
df = pd.DataFrame(data)

# Step 2: Separate features from target, then hold out 20% of the rows for testing
X, y = df[['X']], df['y']  # double brackets keep X two-dimensional
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Fit a linear regression on the training rows (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Step 4: Predict the held-out rows
y_pred = model.predict(X_test)

# Step 5: Score the predictions with mean squared error
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.7346938775510206


Task 2: Decision Tree Classifier
Build a decision tree model with the Iris dataset:

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Step 1: Load Iris and pull out the feature matrix and the class labels
iris = load_iris()
X, y = iris.data, iris.target

# Step 2: Hold out 20% of the samples for testing (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Fit a seeded decision tree on the training split (fit() returns the estimator)
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Step 4: Classify the held-out samples
y_pred = model.predict(X_test)

# Step 5: Report the fraction of held-out samples classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Accuracy: 1.0


Task 3: K-Nearest Neighbors Classifier
Use the KNN algorithm on the digits dataset:

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score

# Step 1: Load the digits dataset and pull out features and labels
digits = load_digits()
X, y = digits.data, digits.target

# Step 2: Hold out 20% of the samples for testing (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Fit a 3-nearest-neighbours classifier (fit() returns the estimator)
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Step 4: Classify the held-out samples
y_pred = knn.predict(X_test)

# Step 5: Report the fraction of held-out samples classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9833333333333333


Title: Training a Classification Model

Task 1: Logistic Regression
Train a logistic regression model on the Iris dataset

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Step 1: Load Iris and pull out the feature matrix and the class labels
iris = load_iris()
X, y = iris.data, iris.target

# Step 2: Hold out 20% of the samples for testing (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Fit logistic regression with the solver's iteration cap set to 200
# (fit() returns the estimator)
logreg = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Step 4: Classify the held-out samples
y_pred = logreg.predict(X_test)

# Step 5: Report the fraction of held-out samples classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Accuracy: 1.0


Task 2: Support Vector Machine
Train a Support Vector Classifier on the Iris dataset

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Step 1: Load Iris and pull out the feature matrix and the class labels
iris = load_iris()
X, y = iris.data, iris.target

# Step 2: Hold out 20% of the samples for testing (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Fit a support vector classifier with a linear kernel
# (fit() returns the estimator)
svc = SVC(kernel='linear').fit(X_train, y_train)

# Step 4: Classify the held-out samples
y_pred = svc.predict(X_test)

# Step 5: Report the fraction of held-out samples classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Accuracy: 1.0


Task 3: Naive Bayes Classifier
Train a Gaussian Naive Bayes model on the Iris dataset

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Step 1: Load Iris and pull out the feature matrix and the class labels
iris = load_iris()
X, y = iris.data, iris.target

# Step 2: Hold out 20% of the samples for testing (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Fit Gaussian Naive Bayes with default settings (fit() returns the estimator)
gnb = GaussianNB().fit(X_train, y_train)

# Step 4: Classify the held-out samples
y_pred = gnb.predict(X_test)

# Step 5: Report the fraction of held-out samples classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Accuracy: 1.0


Title: Understanding Model Performance & Hyperparameter Tuning

Task 1: Using Confusion Matrix
Evaluate a model with a confusion matrix:

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix, accuracy_score

# Step 1: Load Iris and pull out the feature matrix and the class labels
iris = load_iris()
X, y = iris.data, iris.target

# Step 2: Hold out 20% of the samples for testing (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Fit a seeded random forest on the training split (fit() returns the estimator)
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Step 4: Classify the held-out samples
y_pred = rf.predict(X_test)

# Step 5: Summarise the results. The confusion matrix counts samples per
# (true class, predicted class) pair; accuracy is the overall hit rate.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Show both metrics
print("Confusion Matrix:\n", cm)
print("Accuracy:", accuracy)


Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Accuracy: 1.0


Task 2: Cross-validation Score
Perform cross-validation with k-fold:

In [11]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Step 1: Load Iris and pull out the feature matrix and the class labels
iris = load_iris()
X, y = iris.data, iris.target

# Step 2: Create the (unfitted) seeded random forest that will be cross-validated
rf = RandomForestClassifier(random_state=42)

# Step 3: Score the model with 5-fold cross-validation (one score per fold)
cv_scores = cross_val_score(estimator=rf, X=X, y=y, cv=5)

# Step 4: Show the per-fold scores followed by their average
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.96666667 0.96666667 0.93333333 0.96666667 1.        ]
Mean cross-validation score: 0.9666666666666668


Task 3: Hyperparameter Tuning using Grid Search
Optimize hyperparameters using GridSearchCV

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Step 1: Load Iris and pull out the feature matrix and the class labels
iris = load_iris()
X, y = iris.data, iris.target

# Step 2: Create the seeded random forest whose hyperparameters will be tuned
rf = RandomForestClassifier(random_state=42)

# Step 3: Candidate values to search: 3 x 3 x 3 = 27 combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}

# Steps 4-5: Configure an exhaustive search with 5-fold cross-validation
# (n_jobs=-1 uses all available cores) and run it on the full dataset.
# fit() returns the search object itself.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
).fit(X, y)

# Step 6: Show the winning combination and its mean cross-validated score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)


Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best cross-validation score: 0.9666666666666668
