Title: Data Splitting (Train-Test-Validation)


Task 1: House Prices Dataset (Regression)<br>
Use a House Prices dataset (generated synthetically in the cell below) to predict house prices.<br>
Split the data into training, validation, and test sets (70% train, 15% validation, 15% test).

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Synthetic house-price data; the fixed seed keeps every run reproducible.
np.random.seed(42)
data_size = 1000
square_feet = np.random.randint(500, 4000, size=data_size)
bedrooms = np.random.randint(1, 6, size=data_size)
# Price is linear in both features plus Gaussian noise (std 30000).
price_noise = np.random.normal(0, 30000, data_size)
price = square_feet * 150 + bedrooms * 10000 + price_noise

df = pd.DataFrame(
    {
        "SquareFeet": square_feet,
        "Bedrooms": bedrooms,
        "Price": price,
    }
)

# Feature matrix and regression target
feature_columns = ["SquareFeet", "Bedrooms"]
X = df[feature_columns]
y = df["Price"]

# First cut: hold out 30% of the rows, leaving 70% for training.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Second cut: halve the held-out rows -> 15% validation, 15% test overall.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train, y_train)

# Score the model on both held-out sets; training rows are never scored.
val_preds, test_preds = model.predict(X_val), model.predict(X_test)
val_mse = mean_squared_error(y_val, val_preds)
test_mse = mean_squared_error(y_test, test_preds)

print(f"Validation MSE: {val_mse:.2f}")
print(f"Test MSE: {test_mse:.2f}")

Validation MSE: 940174943.61
Test MSE: 923477962.88


Task 2: Iris Dataset (Classification)<br>
Apply data splitting to the Iris dataset.<br>
Split it into train (70%), validation (15%), and test (15%).


In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the Iris data bundle and pull out the feature array and class labels.
data = load_iris()
X, y = data.data, data.target

# First cut: 70% train / 30% held out, stratified on the class labels so
# each subset keeps the class proportions of the full dataset.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Second cut: halve the held-out part -> 15% validation, 15% test overall,
# again stratified (this time on the held-out labels).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Score the classifier on both held-out sets.
val_preds, test_preds = model.predict(X_val), model.predict(X_test)
val_acc = accuracy_score(y_val, val_preds)
test_acc = accuracy_score(y_test, test_preds)

print(f"Validation Accuracy: {val_acc:.2f}")
print(f"Test Accuracy: {test_acc:.2f}")


Validation Accuracy: 0.86
Test Accuracy: 1.00



Task 3: Customer Churn Dataset (Classification)<br>
Predict customer churn using a telecom dataset (generated synthetically in the cell below).<br>
Split the data into training, validation, and test sets (70% train, 15% validation, 15% test).

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Synthetic telecom-churn data; the fixed seed keeps every run reproducible.
np.random.seed(42)
data_size = 1000
tenure = np.random.randint(1, 72, size=data_size)
monthly_charges = np.random.uniform(20, 120, size=data_size)
# Total charges track tenure * monthly charge, plus Gaussian noise (std 50).
charge_noise = np.random.normal(0, 50, size=data_size)
total_charges = tenure * monthly_charges + charge_noise
# NOTE: the churn label is drawn at random (75% stay / 25% churn) and does
# not depend on any of the features generated above.
churn = np.random.choice([0, 1], size=data_size, p=[0.75, 0.25])

df = pd.DataFrame(
    {
        "Tenure": tenure,
        "MonthlyCharges": monthly_charges,
        "TotalCharges": total_charges,
        "Churn": churn,
    }
)

# Feature matrix and binary target
feature_columns = ["Tenure", "MonthlyCharges", "TotalCharges"]
X = df[feature_columns]
y = df["Churn"]

# First cut: 70% train / 30% held out, stratified on the churn label so
# each subset keeps the class proportions of the full dataset.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Second cut: halve the held-out part -> 15% validation, 15% test overall,
# again stratified (this time on the held-out labels).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# fit() returns the estimator itself, so construction and training chain.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the classifier on both held-out sets.
val_preds, test_preds = model.predict(X_val), model.predict(X_test)
val_acc = accuracy_score(y_val, val_preds)
test_acc = accuracy_score(y_test, test_preds)

print(f"Validation Accuracy: {val_acc:.2f}")
print(f"Test Accuracy: {test_acc:.2f}")

Validation Accuracy: 0.67
Test Accuracy: 0.73
