In [160]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

#### a. Data Cleaning


In [None]:
# Read the air-quality CSV from the working directory and preview its first five rows.
# NOTE(review): parsed with the pandas defaults (comma separator, '.' decimal) -- confirm
# that this matches the file's actual layout.
air_df = pd.read_csv(filepath_or_buffer="AirQuality.csv")
air_df.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the air-quality frame.
air_df.info()

In [None]:
# (rows, columns) of the air-quality frame as loaded.
air_df.shape

In [None]:
# Read the heart-disease CSV from the working directory and preview its first five rows.
heart_df = pd.read_csv(filepath_or_buffer="heartdisease.csv")
heart_df.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the heart-disease frame.
heart_df.info()

In [None]:
# (rows, columns) of the heart-disease frame as loaded.
heart_df.shape

In [168]:
# Discard incomplete rows: any row holding at least one missing value is
# removed from each frame, modifying the frames in place.
air_df.dropna(axis=0, how="any", inplace=True)
heart_df.dropna(axis=0, how="any", inplace=True)

In [169]:
# Remove fully duplicated rows in place, keeping the first occurrence of each.
air_df.drop_duplicates(keep="first", inplace=True)
heart_df.drop_duplicates(keep="first", inplace=True)

In [None]:
# (rows, columns) of the air-quality frame; when the cells are run in order this
# reflects the null and duplicate removal performed above.
air_df.shape

In [None]:
# (rows, columns) of the heart-disease frame; when the cells are run in order this
# reflects the null and duplicate removal performed above.
heart_df.shape

In [None]:
# Checking for missing values: per-column count of nulls in the air-quality frame.
# After the dropna step above every count is expected to be 0.
air_df.isnull().sum()

In [None]:
# Per-column count of nulls in the heart-disease frame (expected to be 0 after dropna).
heart_df.isnull().sum()

#### b. Data Integration


In [None]:
# Give each frame a synthetic 1-based row counter to act as the join key.
# The key is purely positional: row i of one frame is matched with row i of the other.
air_df["ID"] = range(1, air_df.shape[0] + 1)
heart_df["ID"] = range(1, heart_df.shape[0] + 1)

# Inner join on the synthetic key: only IDs present in both frames survive,
# so the result has as many rows as the shorter of the two frames.
integrated_df = air_df.merge(heart_df, how="inner", on="ID")

# Report the merged dimensions and preview the first rows.
print("Integrated Dataset Shape:", integrated_df.shape)
integrated_df.head()

#### c. Data Transformation


In [None]:
# Identify the columns whose dtype is not numeric; these must be dropped or
# encoded before the frame can be passed to a numeric scaler.
non_numeric_cols = integrated_df.select_dtypes(exclude="number").columns
print("Non-numeric columns:", non_numeric_cols.tolist())


In [179]:
# Step 1: Drop the 'Date' and 'Time' columns in place when they exist;
# a label that is absent is skipped instead of raising.
integrated_df.drop(columns=["Date", "Time"], errors="ignore", inplace=True)

In [180]:
# Step 2: Integer-encode 'ca' and 'thal' (either one is skipped if it is absent).
# Values are cast to str before encoding, so the codes follow the sorted order
# of the string forms. The encoder is re-fitted for each column.
label_enc = LabelEncoder()
for col in ("ca", "thal"):
    if col not in integrated_df.columns:
        continue
    integrated_df[col] = label_enc.fit_transform(integrated_df[col].astype(str))

In [None]:
# Preview the merged frame after the Date/Time drop and the 'ca'/'thal' encoding.
integrated_df.head()

In [None]:
# Step 3: Standardise every column with StandardScaler and rebuild a DataFrame
# that carries the original column names.
# NOTE(review): the scaler is fitted on all rows and all columns -- including 'ID'
# and the last column, which is later used as the target -- before the train/test
# split, so test-set statistics influence the training data. Consider fitting on
# the training features only.
scaler = StandardScaler()
scaler.fit(integrated_df)
transformed_array = scaler.transform(integrated_df)
transformed_df = pd.DataFrame(data=transformed_array, columns=integrated_df.columns)

transformed_df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the scaled frame.
transformed_df.info()

#### d. Error Correction


In [None]:
# Step 1: Count missing and infinite entries over the whole scaled frame.
print("NaNs in dataset:", transformed_df.isna().sum().sum())
print("Infinite values:", np.isinf(transformed_df.to_numpy()).sum())

In [None]:
# Step 2: Flag values lying more than 3 sample standard deviations away from
# their column mean, and report how many such values exist in total.
z_scores = transformed_df.sub(transformed_df.mean()).div(transformed_df.std()).abs()
outliers = z_scores.gt(3).sum().sum()
print("Total potential outlier values:", outliers)

In [187]:
# Cap every value to the closed interval [-3, 3]. This applies to all columns,
# including the last one that is later used as the target.
transformed_df = transformed_df.clip(-3, 3)

In [None]:
# Renumber the rows 0..n-1, discarding whatever index was there before.
transformed_df = transformed_df.reset_index(drop=True)

print("Error correction completed. Data is clean and model-ready.")
transformed_df.head()

In [None]:
# (rows, columns) of the scaled and clipped frame that feeds the model.
transformed_df.shape

#### e. Data Model Building


In [191]:
# Step 1: Features and Target
# The last column is used as the regression target.
# NOTE(review): confirm that the last column is the intended label.
y = transformed_df.iloc[:, -1]
# Every other column is a feature, except the synthetic 'ID' join key: it is only a
# row counter created for the merge, so keeping it would let the model learn from
# row order rather than from real measurements. errors="ignore" keeps this cell
# working if 'ID' is not present.
X = transformed_df.iloc[:, :-1].drop(columns=["ID"], errors="ignore")

In [192]:
# Step 2: Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the four split pieces, shown as a tuple of shape tuples.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Step 3: Build a random-forest regressor with a fixed seed and fit it on the
# training split. The fitted estimator is the cell's displayed output.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train, y=y_train)

In [195]:
# Step 4: Generate predictions for the held-out test features.
y_pred = model.predict(X=X_test)

In [None]:
# Step 5: Evaluation
# RMSE is computed as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, where passing it raises a TypeError.
# Taking the square root explicitly works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# Coefficient of determination on the held-out split.
r2 = r2_score(y_test, y_pred)

In [None]:
# Emit the evaluation summary: a heading followed by RMSE and R², each on its
# own line and rounded to four decimal places.
print(
    "✅ Model Evaluation (Regression):",
    f"RMSE: {rmse:.4f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)