Title: Model Selection

Task 1: Linear Regression on House Prices<br>
Use Linear Regression and evaluate its performance on the validation set.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# --- Synthetic house-price data ---------------------------------------------
# Price is a linear function of floor area and bedroom count plus Gaussian
# noise. The random draws share NumPy's global RNG, so their order matters
# for reproducing the same dataset.
np.random.seed(42)
data_size = 1000
square_feet = np.random.randint(500, 4000, size=data_size)
bedrooms = np.random.randint(1, 6, size=data_size)
noise = np.random.normal(0, 25000, data_size)
price = 200 * square_feet + 15000 * bedrooms + noise

df = pd.DataFrame({'SquareFeet': square_feet, 'Bedrooms': bedrooms, 'Price': price})

# --- 70 / 15 / 15 train / validation / test partition ------------------------
# First hold out 30% of the rows, then cut that hold-out in half.
feature_cols = ['SquareFeet', 'Bedrooms']
X = df[feature_cols]
y = df['Price']
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# --- Fit on the training rows only -------------------------------------------
model = LinearRegression().fit(X_train, y_train)

# --- Score on the validation rows; the test rows are not used here ----------
val_preds = model.predict(X_val)
val_mse = mean_squared_error(y_val, val_preds)

print(f"Validation MSE: {val_mse:.2f}")

Validation MSE: 652899266.40


Task 2: Decision Tree Classifier on Iris Dataset<br>
Train a Decision Tree model and evaluate its performance on validation data.

In [2]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Iris feature matrix and class labels.
data = load_iris()
X, y = data.data, data.target

# Two-stage stratified split: hold out 30% of the samples, then halve the
# hold-out into validation and test sets. Stratifying on the labels keeps the
# class proportions approximately equal across the subsets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Fit the tree on the training samples (seeded for repeatable results).
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score on the validation samples only; the test set is not used here.
val_preds = model.predict(X_val)
val_acc = accuracy_score(y_val, val_preds)

print(f"Validation Accuracy: {val_acc:.2f}")


Validation Accuracy: 0.95


Task 3: Random Forest on Customer Churn<br>
Apply Random Forest and assess its accuracy on the validation set.

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create sample churn dataset.
# The random draws share NumPy's global RNG, so their order matters for
# reproducing the same dataset.
np.random.seed(42)
data_size = 1000
tenure = np.random.randint(1, 72, size=data_size)
monthly_charges = np.random.uniform(20, 120, size=data_size)
total_charges = tenure * monthly_charges + np.random.normal(0, 50, size=data_size)
# NOTE: churn is sampled independently of tenure and charges (a fixed 70/30
# draw), so the features carry no information about the label. Any classifier
# trained on this data should be judged against the majority-class baseline
# computed below, not against the raw accuracy alone.
churn = np.random.choice([0, 1], size=data_size, p=[0.7, 0.3])

df = pd.DataFrame({
    'Tenure': tenure,
    'MonthlyCharges': monthly_charges,
    'TotalCharges': total_charges,
    'Churn': churn,
})

# Features and target
X = df[['Tenure', 'MonthlyCharges', 'TotalCharges']]
y = df['Churn']

# Two-stage stratified split: hold out 30% of the rows, then halve the
# hold-out into validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate on validation set
val_preds = model.predict(X_val)
val_acc = accuracy_score(y_val, val_preds)

# Reference point for the imbalanced labels: the accuracy obtained by always
# predicting the most frequent class seen in the training split.
majority_class = y_train.mode().iloc[0]
baseline_acc = (y_val == majority_class).mean()

print(f"Validation Accuracy: {val_acc:.2f}")
print(f"Majority-class baseline accuracy: {baseline_acc:.2f}")

Validation Accuracy: 0.66
