In [3]:
#Week 5 Programs

'''27. Experiment with different test_size values (e.g., 0.2, 0.3, 0.4) and observe model performance on diabetes dataset'''
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Load Diabetes dataset
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target
# Function to train the model and evaluate performance
def evaluate_model(test_size):
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    # Initialize the model
    model = LinearRegression()
    # Train the model
    model.fit(X_train, y_train)
    # Make predictions
    y_pred = model.predict(X_test)
    # Calculate the Mean Squared Error (MSE) for model performance
    mse = mean_squared_error(y_test, y_pred)
    return mse
# Experiment with different test_size values: 0.2, 0.3, 0.4
test_sizes = [0.2, 0.3, 0.4]
results = {}
for size in test_sizes:
    mse = evaluate_model(size)
    results[size] = mse
    print(f"Test size: {size}, Mean Squared Error: {mse:.4f}")
# Comparison of model performance
print("\nComparison of Model Performance:")
for size, mse in results.items():
    print(f"Test size: {size} -> MSE: {mse:.4f}")


Test size: 0.2, Mean Squared Error: 2900.1936
Test size: 0.3, Mean Squared Error: 2821.7510
Test size: 0.4, Mean Squared Error: 2832.9962

Comparison of Model Performance:
Test size: 0.2 -> MSE: 2900.1936
Test size: 0.3 -> MSE: 2821.7510
Test size: 0.4 -> MSE: 2832.9962


In [7]:
'''28. Use a fixed random_state value for reproducibility when splitting data, and test with different random_state values. '''
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Load Diabetes dataset
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target
# Function to train the model and evaluate performance
def evaluate_model(random_state_value):
    # Split the dataset into training and testing sets with a specific random_state
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state_value)
    # Initialize the model
    model = LinearRegression()
    # Train the model
    model.fit(X_train, y_train)
    # Make predictions
    y_pred = model.predict(X_test)
    # Calculate the Mean Squared Error (MSE) for model performance
    mse = mean_squared_error(y_test, y_pred)
    return mse
    # Experiment with different random_state values: 42, 0, 100, None
    random_state_values = [42, 0, 100, None]
    results = {}
    for state in random_state_values:
        mse = evaluate_model(state)
        results[state] = mse
        print(f"Random State: {state}, Mean Squared Error: {mse:.4f}")
        # Comparison of model performance for different random states
        print("\nComparison of Model Performance:")
for state, mse in results.items():
    print(f"Random State: {state} -> MSE: {mse:.4f}")


Random State: 0.2 -> MSE: 2900.1936
Random State: 0.3 -> MSE: 2821.7510
Random State: 0.4 -> MSE: 2832.9962


In [8]:
'''29. Clean and preprocess a dataset (handle missing values, scale
features) before splitting into training and test sets'''
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Load the Diabetes dataset
diabetes = load_diabetes()
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.Series(diabetes.target)
# Step 1: Check for missing values
print("Missing values in each feature:")
print(X.isnull().sum())
# Step 2: Handle missing values by imputation (if any)
# For demonstration, let's assume some missing values are introduced randomly
# Introduce missing values randomly in 5% of the data for each feature
np.random.seed(42)
missing_rate = 0.05
n_missing = int(missing_rate * X.size)
missing_indices = np.random.choice(X.size, n_missing, replace=False)
X.values.ravel()[missing_indices] = np.nan
# Impute missing values using the mean strategy
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
# Step 3: Scale the features using Standardization
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_imputed), columns=X.columns)
# Step 4: Split into training and test sets (80% training, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Show the preprocessed data
print("\nPreprocessed Training Features (first 5 rows):")
print(X_train.head())
print("\nPreprocessed Test Features (first 5 rows):")
print(X_test.head())

Missing values in each feature:
age    0
sex    0
bmi    0
bp     0
s1     0
s2     0
s3     0
s4     0
s5     0
s6     0
dtype: int64

Preprocessed Training Features (first 5 rows):
          age       sex       bmi        bp        s1            s2        s3  \
17   1.513010  1.098506  0.257665  1.213660  0.774155  1.148366e-17 -0.843411   
66  -0.193150  1.098506 -0.393807 -0.701413 -0.419833  2.595900e-01 -1.555385   
137  0.117061 -0.964904  1.072006  2.097540 -0.300434 -3.517100e-01 -0.131437   
245 -0.580914 -0.964904 -0.766077 -0.627756 -1.195925 -1.258584e+00  0.659645   
31  -0.503361 -0.964904 -1.417550 -1.732606 -0.807879 -1.151102e+00  1.292511   

           s4            s5            s6  
17   0.771544  5.840213e-01 -2.362735e-17  
66   1.577289  1.002237e-03  4.000295e-01  
137 -0.034202  3.617600e-01 -3.221016e-01  
245 -0.839948 -1.078132e+00 -2.849561e+00  
31  -1.645693  1.924921e-17 -9.539664e-01  

Preprocessed Test Features (first 5 rows):
          age       se

In [16]:
'''30. Compare results when applying feature scaling before or after splitting the data. '''
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_diabetes
from sklearn.metrics import accuracy_score
# Load the diabetes dataset
data = load_diabetes()
X = data.data
y = (data.target > np.median(data.target)).astype(int) # Convert target to binary as 0 or 1
# Function to train and evaluate model with scaling before splitting
def model_with_scaling_before_split(X, y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Apply feature scaling to the entire dataset before splitting
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X) # Scale the whole dataset
    # Split the scaled data
    X_train_scaled, X_test_scaled = X_scaled[:len(X_train)], X_scaled[len(X_train):]
    # Train the model
    model = LogisticRegression(max_iter=200)
    model.fit(X_train_scaled, y_train)
    # Predict and evaluate the model
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy
# Function to train and evaluate model with scaling after splitting
def model_with_scaling_after_split(X, y):
    # Split the data into training and testing sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Apply feature scaling after splitting
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train) # Scale only the training set
    X_test_scaled = scaler.transform(X_test) # Transform the test set based on training data
    # Train the model
    model = LogisticRegression(max_iter=200)
    model.fit(X_train_scaled, y_train)
    # Predict and evaluate the model
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy
    # Compare the results
accuracy_before_split = model_with_scaling_before_split(X, y)
accuracy_after_split = model_with_scaling_after_split(X, y)
print(f"Accuracy with scaling before splitting: {accuracy_before_split:.4f}")
print(f"Accuracy with scaling after splitting: {accuracy_after_split:.4f}")

Accuracy with scaling before splitting: 0.5169
Accuracy with scaling after splitting: 0.7303


In [14]:
##Assessment 3:
'''Write a Python program using Scikit-learn to split the iris dataset into 70% train data and 30% test data. Out of total 150 records, the training set will contain 120 records and the test set contains 30 of those records. Print both datasets.
(Students only must write the program for this Assessment Question and should execute in Lab)'''

'Write a Python program using Scikit-learn to split the iris dataset into 70% train data and 30% test data. Out of total 150 records, the training set will contain 120 records and the test set contains 30 of those records. Print both datasets.\n(Students only must write the program for this Assessment Question and should execute in Lab)'