# Task 2

# Internship Report: Foundations and End-to-End Project in Machine Learning

# Full Name: Saad Kabeer
# Phone Number: 03255034664
# Intern ID: ARCH-2505-0211

In [1]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset. as_frame=True asks the loader for
# pandas objects, so the features and target can be used with column names.
housing = fetch_california_housing(as_frame=True)
X, y = housing.data, housing.target

# Sanity check on the dimensions of the feature table and the target.
print(X.shape, y.shape)  # Expected output: (20640, 8) (20640,)


(20640, 8) (20640,)


## 2. Numeric preprocessing pipeline (median imputation + standardization)

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Preprocessing chain for numeric columns, applied in this order:
#   'imputer' - fills missing values using the 'median' strategy
#   'scaler'  - standardizes the imputed values with StandardScaler
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

## 3. Categorical preprocessing pipeline (constant imputation + one-hot encoding)

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Preprocessing chain for categorical columns, applied in this order:
#   'imputer' - replaces missing entries with the constant string 'missing'
#   'onehot'  - one-hot encodes the result; handle_unknown='ignore' tells the
#               encoder not to raise on categories it did not see during fit
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)


## 4. Combine the numeric and categorical transformers with ColumnTransformer

In [6]:
from sklearn.compose import ColumnTransformer

# Columns routed through the numeric (impute + scale) pipeline.
numeric_features = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']

# The frame returned by fetch_california_housing is all-numeric and has no
# 'Ocean_Proximity' column, so naming it unconditionally makes
# ColumnTransformer.fit fail on a missing column. Keep a candidate categorical
# column only when it is actually present in X; with the scikit-learn dataset
# this yields an empty list and the 'cat' transformer has nothing to select.
categorical_features = [col for col in ['Ocean_Proximity'] if col in X.columns]

# NOTE(review): ColumnTransformer's `remainder` defaults to 'drop', so every
# column of X that is not listed above (e.g. Latitude / Longitude, and any
# engineered columns added to X) is discarded by this preprocessor. Extend
# numeric_features if those columns are meant to reach the model.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


## 5. Feature engineering (ratio features)

In [7]:
# Derive ratio features with .assign(), which returns a new DataFrame instead
# of writing columns into the frame object handed back by the dataset loader.
# This avoids mutating shared state (and the SettingWithCopyWarning pandas may
# emit for in-place column writes), and the cell stays idempotent: re-running
# it recomputes the same two columns from the untouched base columns.
#
# NOTE(review): if AveOccup is the average number of occupants per household,
# Population / AveOccup works out to a household count rather than
# "population per household" -- confirm the intended feature before renaming.
X = X.assign(
    bedrooms_per_room=X['AveBedrms'] / X['AveRooms'],
    pop_per_household=X['Population'] / X['AveOccup'],
)


## 6. Full regression pipeline (preprocessing + random forest)

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# End-to-end model: the column preprocessor feeds a random forest regressor.
# The step names ('preprocessor', 'regressor') are the prefixes used when
# addressing nested parameters, e.g. 'regressor__n_estimators'.
reg_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=42)),
    ]
)


## 7. Hyperparameter search with GridSearchCV (run on synthetic demonstration data)

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# NOTE(review): this cell rebinds X, y and reg_pipeline. From here on they
# refer to uniformly random demonstration data and a StandardScaler-based
# pipeline, NOT to the California housing data / ColumnTransformer pipeline
# built in the earlier cells. The RMSE values printed below therefore measure
# performance on random noise and say nothing about the housing problem.
# To tune on real data, replace step 1 with the actual features and target
# (e.g. load a CSV with pd.read_csv and split off the target column).

# 1. Create synthetic demonstration data: 100 rows x 10 uniform features and
#    a target drawn independently of the features. Seeded for repeatability.
np.random.seed(42)
X = pd.DataFrame(np.random.rand(100, 10), columns=[f'feature_{i}' for i in range(10)])
y = pd.Series(np.random.rand(100) * 100)

# Hold out 20% of the rows for a final test-set evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Define the pipeline: standardize features, then fit the regressor to tune.
reg_pipeline = Pipeline([
    ('preprocessor', StandardScaler()),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# 3. Define the parameter grid. Keys use '<step name>__<parameter>' so that
#    GridSearchCV can reach into the pipeline's 'regressor' step.
#    The grid is deliberately small (2 x 2 x 2 = 8 candidates) for fast runs.
param_grid = {
    'regressor__n_estimators': [50, 100],
    'regressor__max_features': [2, 4],
    'regressor__max_depth': [None, 10]
}

# Quick diagnostics on the training split before the (slow) search starts.
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("Checking for NaNs in X_train:", X_train.isnull().sum().sum())
print("Checking for NaNs in y_train:", y_train.isnull().sum())

# 4. Perform GridSearchCV: 3-fold CV, scored by negative MSE (higher is
#    better), with progress output and all available cores.
print("Starting GridSearchCV fit...")
grid_search = GridSearchCV(reg_pipeline, param_grid, cv=3,
                           scoring='neg_mean_squared_error',
                           verbose=2,
                           n_jobs=-1)

try:
    grid_search.fit(X_train, y_train)
except Exception as e:
    # Report the failure, then re-raise so that "Run All" stops here instead
    # of continuing with an unfitted grid_search and failing later with a
    # less informative error.
    print(f"An error occurred during GridSearchCV fit: {e}")
    raise
print("GridSearchCV fit complete.")

# 5. Print best parameters and the cross-validated RMSE. best_score_ is a
#    negative MSE, so negate it before taking the square root.
print("Best params:", grid_search.best_params_)
print("Best RMSE:", np.sqrt(-grid_search.best_score_))

# Evaluate the refit best estimator on the held-out test split.
y_pred = grid_search.best_estimator_.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE:", test_rmse)

X_train shape: (80, 10)
y_train shape: (80,)
Checking for NaNs in X_train: 0
Checking for NaNs in y_train: 0
Starting GridSearchCV fit...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
GridSearchCV fit complete.
Best params: {'regressor__max_depth': None, 'regressor__max_features': 4, 'regressor__n_estimators': 50}
Best RMSE: 30.58455801199237
Test RMSE: 30.292085196424402
