#### Downloading the noshowappointments dataset from Kaggle and loading it into a pandas DataFrame

In [None]:
# Download the dataset through kagglehub. `path` holds whatever dataset_download
# returns and is reused by the following cells to locate the data files.
import kagglehub
path = kagglehub.dataset_download("joniarroba/noshowappointments")
print("path to dataset files -> {}".format(path))

In [None]:
import os
# List the downloaded directory so the data file name can be checked by eye.
print(os.listdir(path))

In [None]:
import pandas as pd

# Locate the CSV inside the downloaded dataset directory and load it.
# Sorting makes the pick deterministic (os.listdir returns entries in arbitrary
# order), and the explicit check replaces a bare IndexError with an actionable
# message when the directory contains no CSV at all.
files = os.listdir(path)
csv_candidates = sorted(f for f in files if f.endswith('.csv'))
if not csv_candidates:
    raise FileNotFoundError("no .csv file found in {}: {}".format(path, files))
csv_file = csv_candidates[0]
fullpath = os.path.join(path, csv_file)
noshow_dataset_df = pd.read_csv(fullpath)

#### Data exploration

In [None]:
# Preview the first four rows of the raw frame.
noshow_dataset_df.head(4)

In [None]:
# Column dtypes and non-null counts of the raw frame.
noshow_dataset_df.info()

In [None]:
# Summary statistics of the raw frame.
noshow_dataset_df.describe()

#### Removal of features that cannot affect the outcome (i.e., whether the patient is a no-show) in any way

In [None]:
# Work on a copy without the two identifier columns; the raw frame stays untouched.
identifier_columns = ['PatientId', 'AppointmentID']
noshow_dataset_df1 = noshow_dataset_df.drop(columns=identifier_columns)

In [None]:
# Summary statistics after the identifier columns were dropped.
noshow_dataset_df1.describe()

In [None]:
# First rows of the reduced frame.
noshow_dataset_df1.head()

In [None]:
# Last 100 rows of the reduced frame.
noshow_dataset_df1.tail(100)

## converting all the columns to numeric datatypes

male values denoted by 'M' in the Gender column are encoded as 0 and female → 'F' encoded as 1

In [None]:
# Encode Gender as an integer flag: 'F' -> 1, anything else -> 0.
# The dtype guard makes the cell safe to re-run: without it, a second run would
# compare the already-encoded 0/1 values to 'F' and overwrite the column with zeros.
gender_raw = noshow_dataset_df1['Gender']
if not pd.api.types.is_numeric_dtype(gender_raw):
    noshow_dataset_df1['Gender'] = (gender_raw == 'F').astype('int64')

In [None]:
# List the column labels of the working frame.
noshow_dataset_df1.keys()

Encoding of the No-show column:
 'Yes' encoded as 1
 'No' encoded as 0

In [None]:
# Encode the target as an integer flag: 'Yes' -> 1, anything else -> 0.
# The dtype guard makes the cell safe to re-run: without it, a second run would
# compare the already-encoded 0/1 values to 'Yes' and silently turn every label into 0.
no_show_raw = noshow_dataset_df1['No-show']
if not pd.api.types.is_numeric_dtype(no_show_raw):
    noshow_dataset_df1['No-show'] = (no_show_raw == 'Yes').astype('int64')

In [None]:
# Summary statistics again, now including the encoded Gender and No-show columns.
noshow_dataset_df1.describe()

In [None]:
# Check which columns are still non-numeric after the encoding steps.
noshow_dataset_df1.info()

In [None]:
# Inspect the two date columns (last 100 rows) before parsing them.
noshow_dataset_df1[['ScheduledDay', 'AppointmentDay']].tail(100)

## Feature engineering

In the cell below, the duration, in days, between schedule day and appointment day is derived and stored in a column

In [None]:
# Parse both date columns, then keep the whole-day part of their difference.
# NOTE(review): .dt.days floors the timedelta, so if ScheduledDay carries a time of day
# and AppointmentDay does not, a same-day booking comes out as -1 — confirm against the raw data.
for date_col in ('AppointmentDay', 'ScheduledDay'):
    noshow_dataset_df1[date_col] = pd.to_datetime(noshow_dataset_df1[date_col])
wait_delta = noshow_dataset_df1['AppointmentDay'] - noshow_dataset_df1['ScheduledDay']
noshow_dataset_df1['WaitingDays'] = wait_delta.dt.days

In [None]:
# Look at the first 10,000 derived waiting-day values.
noshow_dataset_df1['WaitingDays'].head(10000)

In [None]:
# Collect the rows whose waiting time is below zero and report how common they are.
# `negative_wait` is kept as a frame because a later cell uses its length.
is_negative_wait = noshow_dataset_df1['WaitingDays'].lt(0)
negative_wait = noshow_dataset_df1.loc[is_negative_wait]
print(f"Records with negative waiting time: {len(negative_wait)}")
print(f"Percentage of total: {len(negative_wait)/len(noshow_dataset_df1)*100:.2f}%")

In [None]:
# Draw 10 rows (with replacement) from the records whose WaitingDays is exactly -1.
noshow_dataset_df1[noshow_dataset_df1['WaitingDays'] == -1].sample(n=10, replace=True)

In [None]:
# Draw 5 rows that have WaitingDays == -1 and are also no-shows.
noshow_dataset_df1[(noshow_dataset_df1['WaitingDays'] == -1) & (noshow_dataset_df1['No-show'] == 1)].sample(n=5)

In [None]:
# Percentage of the negative-wait records that have WaitingDays == -1 and are no-shows.
# The mask is built from noshow_dataset_df1, so it is counted directly instead of being
# used to index the raw noshow_dataset_df (a different frame that merely shares the index).
minus_one_no_show = (noshow_dataset_df1['WaitingDays'] == -1) & (noshow_dataset_df1['No-show'] == 1)
print(int(minus_one_no_show.sum()) / len(negative_wait) * 100)

Some statistics:
    37% of records show negative waiting times
    of those, 4% are no-show appointments
Conclusion:
    Negative waiting times were initially thought to be a proxy for walk-in appointments; however, the 4% no-show subset disproves this, and
    they are now assumed to be errors in data collection.

### issuing an integer id for each unique neighborhood

In [None]:
# Count the distinct neighbourhood values before encoding them.
print(noshow_dataset_df1['Neighbourhood'].nunique())

80 unique neighborhoods to be converted into 80 integer ids

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every neighbourhood name with the integer code LabelEncoder assigns to it.
le = LabelEncoder()
neighbourhood_codes = le.fit_transform(noshow_dataset_df1['Neighbourhood'])
noshow_dataset_df1['Neighbourhood'] = neighbourhood_codes

# Print the code -> name table; le.classes_[k] is the name that was encoded as k.
print("Neighbourhood lookup table:\n")
for code, name in enumerate(le.classes_):
    print("ID: {}, Neighborhood name: {}".format(code, name))

In [None]:
# Confirm the Neighbourhood column now holds integer codes.
noshow_dataset_df1.head(5)

extracting the day of the week and month for the appointment

In [None]:
# Derive calendar features from the parsed appointment date.
appointment_dates = noshow_dataset_df1['AppointmentDay'].dt
noshow_dataset_df1['appointment_day_of_week'] = appointment_dates.day_name()
noshow_dataset_df1['appointment_month'] = appointment_dates.month

encoding the day of the week

In [None]:
# KNN needs numeric inputs, so day names are mapped to 0 (Monday) .. 6 (Sunday).
ordered_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                'Friday', 'Saturday', 'Sunday']
day_mapping = {day: position for position, day in enumerate(ordered_days)}
day_names = noshow_dataset_df1['appointment_day_of_week']
noshow_dataset_df1['day_of_week_encoded'] = day_names.map(day_mapping)

In [None]:
# Draw 10,000 random appointment_month values for a visual check
# (no random_state is passed, so the rows differ between runs).
noshow_dataset_df1['appointment_month'].sample(10000)

In [None]:
# Spot-check 20 random rows: the extracted month next to the date it came from.
noshow_dataset_df1[['appointment_month','AppointmentDay']].sample(20)

### adding the hypertension vs age feature interaction

In [None]:
# Interaction term: Age multiplied by the Hipertension column.
age_by_hypertension = noshow_dataset_df1["Age"].mul(noshow_dataset_df1["Hipertension"])
noshow_dataset_df1["age_hypertension_interraction"] = age_by_hypertension

creating a feature for if the appointment day is a weekend

In [None]:
# Flag appointments whose encoded weekday is 5 (Saturday) or 6 (Sunday).
weekend_codes = [5, 6]
on_weekend = noshow_dataset_df1["day_of_week_encoded"].isin(weekend_codes)
noshow_dataset_df1["is_weekend"] = on_weekend.astype('int64')

In [None]:
# Per-column non-null counts for the rows flagged as weekend appointments.
noshow_dataset_df1[noshow_dataset_df1["is_weekend"] == 1].count()

adding the is_friday boolean feature to the data

In [None]:
# Flag appointments whose encoded weekday is 4 (Friday), then show 10 random flagged rows.
on_friday = noshow_dataset_df1["day_of_week_encoded"].eq(4)
noshow_dataset_df1["is_friday"] = on_friday.astype('int64')
noshow_dataset_df1.loc[noshow_dataset_df1["is_friday"].eq(1)].sample(10)

age categorization

In [None]:
# Bucket Age into labelled groups.
# pd.cut uses right-closed bins by default, so without include_lowest=True the
# first bin is (0, 12] and every Age == 0 record becomes NaN (and later gets
# all-zero dummy columns). The open-ended last edge likewise keeps ages above
# 100 in 'senior' instead of dropping them to NaN. Negative ages still fall
# outside every bin and stay NaN.
noshow_dataset_df1["age_group"] = pd.cut(noshow_dataset_df1["Age"],
                                         bins=[0, 12, 19, 35, 50, 65, float("inf")],
                                         labels=['child', 'teen', 'young_adult', 'adult', 'middle_aged', 'senior'],
                                         include_lowest=True)

In [None]:
# Spot-check five random rows, including the new age_group column.
noshow_dataset_df1.sample(5)

In [None]:
# One-hot encode age_group into integer columns named age_group_<label>,
# then display the resulting frame.
age_group_dummies = pd.get_dummies(noshow_dataset_df1["age_group"], prefix='age_group', dtype=int)
age_group_dummies

In [None]:
# Attach the one-hot age-group columns to the working frame.
# Any dummy columns left over from a previous run of this cell are dropped first,
# so re-running does not append a second copy of every age_group_* column
# (duplicate labels would make the later column selection return repeated features).
noshow_dataset_df1 = pd.concat(
    [noshow_dataset_df1.drop(columns=age_group_dummies.columns, errors='ignore'),
     age_group_dummies],
    axis=1,
)

In [None]:
import numpy as np
# Convert to Unix timestamp (seconds)
# Only columns that are still datetimes are converted. Without the guard, a
# second run would take the already-converted integer seconds and divide them
# by 10**9 again, collapsing every value to ~1.
# assumes nanosecond datetime resolution (hence // 10**9) — TODO confirm for the installed pandas
for ts_col in ('ScheduledDay', 'AppointmentDay'):
    if pd.api.types.is_datetime64_any_dtype(noshow_dataset_df1[ts_col]):
        noshow_dataset_df1[ts_col] = noshow_dataset_df1[ts_col].astype(np.int64) // 10**9


In [None]:
# final_df is the modelling frame: everything except the two text/categorical
# helper columns that already have numeric encodings.
helper_columns = ['appointment_day_of_week', 'age_group']
final_df = noshow_dataset_df1.drop(columns=helper_columns)
print(final_df.columns)
# Displayed as the cell output: dtype of AppointmentDay in the working frame.
noshow_dataset_df1["AppointmentDay"].dtype

In [None]:
import numpy as np
# Convert to Unix timestamp (seconds)
# This cell repeats an earlier conversion. The dtype guard turns it into a no-op
# when the columns are already integer seconds; unguarded, it would divide those
# seconds by 10**9 a second time and leave ~1 in every row of noshow_dataset_df1.
# assumes nanosecond datetime resolution (hence // 10**9) — TODO confirm for the installed pandas
for ts_col in ('ScheduledDay', 'AppointmentDay'):
    if pd.api.types.is_datetime64_any_dtype(noshow_dataset_df1[ts_col]):
        noshow_dataset_df1[ts_col] = noshow_dataset_df1[ts_col].astype(np.int64) // 10**9


In [None]:
# Spot-check four random rows of the modelling frame.
final_df.sample(4)

## Model Training and Evaluation

initial model training

In [None]:
# Predictor columns for the first KNN model; the target is the encoded No-show flag.
feature_columns = [
    'Gender', 'ScheduledDay', 'AppointmentDay', 'Age', 'Neighbourhood',
    'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap',
    'SMS_received', 'WaitingDays', 'appointment_month', 'day_of_week_encoded',
    'age_hypertension_interraction', 'is_weekend', 'is_friday',
    'age_group_child', 'age_group_teen', 'age_group_young_adult',
    'age_group_adult', 'age_group_middle_aged', 'age_group_senior',
]
X = final_df[feature_columns]
y = final_df['No-show']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Baseline k=5 classifier.
# KNN is distance-based, and the feature matrix mixes Unix-second timestamps
# (ScheduledDay, AppointmentDay) with 0/1 flags and small integers. Unscaled,
# the timestamps dominate every distance, so the features are standardised
# inside the pipeline. The scaler is fitted on the training split only, and
# kcl.score / kcl.predict apply the same scaling to whatever they are given.
kcl = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=5, n_jobs=-1, algorithm='ball_tree'),
).fit(X_train, y_train)

In [None]:
# Score the baseline classifier on the held-out split.
print(kcl.score(X_test, y_test))

Parameter fine-tuning
            → finding the ideal n_neighbors value to create a model that's 70-90% accurate

# ⚠️ PERFORMANCE WARNING: This cell performs hyperparameter tuning over 99 iterations and may take several minutes to complete

In [None]:
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Sweep n_neighbors from 1 to 99 and record the held-out accuracy for each value.
# Features are standardised first: KNN is distance-based and the raw matrix mixes
# Unix-second timestamps with 0/1 flags, so the timestamps would otherwise dominate.
# The scaler is fitted once on the training split and reused for every k.
knn_scaler = StandardScaler().fit(X_train)
X_train_scaled = knn_scaler.transform(X_train)
X_test_scaled = knn_scaler.transform(X_test)

testing_accuracy = []
neighbours = range(1, 100, 1)

# NOTE(review): accuracy is measured on the test split, so picking k from this
# curve gives an optimistic estimate — consider a validation split or cross-validation.
for n in tqdm(neighbours, desc="Testing neighbors"):
    # A separate name keeps the baseline `kcl` model from being overwritten by the sweep.
    candidate = KNeighborsClassifier(n_neighbors=n, n_jobs=-1, algorithm="ball_tree").fit(X_train_scaled, y_train)
    testing_accuracy.append(candidate.score(X_test_scaled, y_test))

plt.plot(neighbours, testing_accuracy, label='testing_accuracy')
plt.ylabel('Accuracy score')
plt.xlabel('n_neighbors')
plt.legend()
plt.show()


In [None]:
# Raw accuracy values from the sweep, one per n_neighbors value in order.
print(testing_accuracy)

In [None]:
# =============================================
# CONCRETE COMPRESSIVE STRENGTH DATASET - KNN REGRESSION
# =============================================

# Section banner marking the start of the second (regression) analysis.
banner_rule = "=" * 60
print(banner_rule)
print("CONCRETE COMPRESSIVE STRENGTH ANALYSIS")
print(banner_rule)

In [None]:
# Load Concrete Compressive Strength dataset from UCI
# (fetch_ucirepo needs network access; id=165 selects the dataset)
print("Loading Concrete Compressive Strength dataset...")
from ucimlrepo import fetch_ucirepo
concrete_data = fetch_ucirepo(id=165)

# Extract features and target
concrete_features = concrete_data.data.features
concrete_target = concrete_data.data.targets

# Create DataFrame
# NOTE(review): presumably `targets` holds a single column, so it can be assigned
# to one new column below — confirm against the ucimlrepo return type.
concrete_df = pd.DataFrame(concrete_features)
concrete_df['compressive_strength'] = concrete_target

print(f"Concrete dataset shape: {concrete_df.shape}")
print("Concrete dataset loaded successfully!")

In [None]:
# 1. Basic Information
# Print the shape, a preview, the dtype/null summary and descriptive statistics.
print("1. DATASET OVERVIEW")
print("=" * 40)
print(f"Dataset shape: {concrete_df.shape}")
print("\nFirst 5 rows:")
print(concrete_df.head())

print("\nDataset info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would add a stray "None" line after the report.
concrete_df.info()

print("\nBasic statistics:")
print(concrete_df.describe())

In [None]:
# 2. Check for missing values
# Report the per-column null counts and the number of fully duplicated rows.
print("\n2. DATA QUALITY CHECK")
print("=" * 40)

missing_per_column = concrete_df.isnull().sum()
print("Missing values:")
print(missing_per_column)

duplicate_row_count = concrete_df.duplicated().sum()
print("\nDuplicate rows:", duplicate_row_count)

In [None]:
# 3. Visualize distributions
import matplotlib.pyplot as plt
import seaborn as sns

print("\n3. FEATURE DISTRIBUTIONS")
print("=" * 40)

features = ['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water',
           'Superplasticizer', 'Coarse Aggregate', 'Fine Aggregate', 'Age']

# 3x3 grid flattened to nine panels: one histogram per feature, last panel unused.
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.ravel()

for panel, feature in zip(axes, features):
    panel.hist(concrete_df[feature], bins=30, alpha=0.7, color='skyblue')
    panel.set_title(f'Distribution of {feature}')
    panel.set_xlabel(feature)
    panel.set_ylabel('Frequency')

# Eight features on nine panels: hide the leftover one.
axes[-1].set_visible(False)

plt.tight_layout()
plt.show()

# Separate figure for the target variable.
fig_target, ax_target = plt.subplots(figsize=(10, 6))
ax_target.hist(concrete_df['compressive_strength'], bins=30, alpha=0.7, color='lightgreen')
ax_target.set_title('Distribution of Compressive Strength (Target Variable)')
ax_target.set_xlabel('Compressive Strength (MPa)')
ax_target.set_ylabel('Frequency')
plt.show()

In [None]:
# 4. Correlation Analysis
print("\n4. CORRELATION ANALYSIS")
print("=" * 40)

# Pairwise correlations of every column, target included.
correlation_matrix = concrete_df.corr()

# Annotated heatmap, colour scale centred on zero.
plt.figure(figsize=(12, 8))
heatmap_options = dict(annot=True, cmap='coolwarm', center=0,
                       square=True, linewidths=0.5)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Rank every column by its correlation with the target, strongest positive first.
strength_column = correlation_matrix['compressive_strength']
target_correlations = strength_column.sort_values(ascending=False)
print("\nCorrelations with Compressive Strength:")
print(target_correlations)

In [None]:
# 5. DATA PREPROCESSING
print("\n5. DATA PREPROCESSING")
print("=" * 40)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
y_concrete = concrete_df['compressive_strength']
X_concrete = concrete_df.drop(columns='compressive_strength')

print(f"Features shape: {X_concrete.shape}")
print(f"Target shape: {y_concrete.shape}")

# 80/20 split with a fixed seed so the partition is repeatable.
concrete_split = train_test_split(X_concrete, y_concrete, test_size=0.2, random_state=42)
X_train_conc, X_test_conc, y_train_conc, y_test_conc = concrete_split

print(f"Training set: {X_train_conc.shape}")
print(f"Testing set: {X_test_conc.shape}")

# Learn the scaling statistics from the training rows only, then apply them to both splits.
scaler = StandardScaler().fit(X_train_conc)
X_train_conc_scaled = scaler.transform(X_train_conc)
X_test_conc_scaled = scaler.transform(X_test_conc)

print("Feature scaling completed!")

In [None]:
# 6. KNN REGRESSION MODEL
print("\n6. KNN REGRESSION MODEL DEVELOPMENT")
print("=" * 40)

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Baseline regressor with k=5, trained on the scaled training features.
knn_reg = KNeighborsRegressor(n_neighbors=5).fit(X_train_conc_scaled, y_train_conc)
y_pred_conc = knn_reg.predict(X_test_conc_scaled)

# Error metrics on the held-out split; RMSE is derived from MSE.
r2 = r2_score(y_test_conc, y_pred_conc)
mae = mean_absolute_error(y_test_conc, y_pred_conc)
mse = mean_squared_error(y_test_conc, y_pred_conc)
rmse = np.sqrt(mse)

print("Initial KNN Regression Results (k=5):")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R² Score: {r2:.4f}")

In [None]:
# 7. HYPERPARAMETER TUNING
print("\n7. HYPERPARAMETER TUNING")
print("=" * 40)

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

k_values = range(1, 31)
train_scores = []
test_scores = []
cv_scores = []

for k in k_values:
    knn_temp = KNeighborsRegressor(n_neighbors=k)
    knn_temp.fit(X_train_conc_scaled, y_train_conc)

    train_pred = knn_temp.predict(X_train_conc_scaled)
    test_pred = knn_temp.predict(X_test_conc_scaled)

    # Train/test curves are kept for the plot only.
    train_scores.append(r2_score(y_train_conc, train_pred))
    test_scores.append(r2_score(y_test_conc, test_pred))

    # Model-selection signal: 5-fold cross-validated R² on the training split.
    # Scaling sits inside the pipeline so each fold is standardised with the
    # statistics of its own training part.
    fold_scores = cross_val_score(
        make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=k)),
        X_train_conc, y_train_conc, cv=5, scoring='r2',
    )
    cv_scores.append(fold_scores.mean())

plt.figure(figsize=(12, 6))
plt.plot(k_values, train_scores, 'o-', label='Training R²')
plt.plot(k_values, test_scores, 'o-', label='Testing R²')
plt.plot(k_values, cv_scores, 'o-', label='Cross-validated R² (training split)')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('R² Score')
plt.title('KNN Regression: Finding Optimal k')
plt.legend()
plt.grid(True)
plt.show()

# Choose k from the cross-validated scores rather than the test scores: picking
# the k that maximises test R² and then reporting that same test R² as the final
# result would make the reported performance optimistically biased.
best_k_index = int(np.argmax(cv_scores))
best_k = k_values[best_k_index]
best_score = cv_scores[best_k_index]

print(f"Optimal k value: {best_k}")
print(f"Best cross-validated R² Score: {best_score:.4f}")

In [None]:
# 8. FINAL OPTIMIZED MODEL
print("\n8. FINAL OPTIMIZED KNN REGRESSION MODEL")
print("=" * 40)

# Refit with the k selected during tuning and predict the held-out split.
final_knn_reg = KNeighborsRegressor(n_neighbors=best_k).fit(X_train_conc_scaled, y_train_conc)
final_pred = final_knn_reg.predict(X_test_conc_scaled)

# Same metric set as the baseline model; RMSE is derived from MSE.
final_r2 = r2_score(y_test_conc, final_pred)
final_mae = mean_absolute_error(y_test_conc, final_pred)
final_mse = mean_squared_error(y_test_conc, final_pred)
final_rmse = np.sqrt(final_mse)

print(f"Optimal k: {best_k}")
print("Final Model Performance:")
print(f"Mean Squared Error (MSE): {final_mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {final_rmse:.4f}")
print(f"Mean Absolute Error (MAE): {final_mae:.4f}")
print(f"R² Score: {final_r2:.4f}")

# Actual vs predicted; the dashed red diagonal marks perfect predictions.
strength_span = [y_test_conc.min(), y_test_conc.max()]
fig_fit, ax_fit = plt.subplots(figsize=(10, 6))
ax_fit.scatter(y_test_conc, final_pred, alpha=0.6)
ax_fit.plot(strength_span, strength_span, 'r--', lw=2)
ax_fit.set_xlabel('Actual Compressive Strength')
ax_fit.set_ylabel('Predicted Compressive Strength')
ax_fit.set_title(f'KNN Regression: Actual vs Predicted (k={best_k})')
plt.show()

# Residuals (actual minus predicted) against the predictions, with a zero reference line.
residuals = y_test_conc - final_pred
fig_res, ax_res = plt.subplots(figsize=(10, 6))
ax_res.scatter(final_pred, residuals, alpha=0.6)
ax_res.axhline(y=0, color='r', linestyle='--')
ax_res.set_xlabel('Predicted Values')
ax_res.set_ylabel('Residuals')
ax_res.set_title('Residual Plot')
plt.show()

In [None]:
# =============================================
# COMPARISON AND CONCLUSIONS
# =============================================

# Closing summary: headline metric of each model followed by the written takeaways.
summary_rule = "=" * 60
print(summary_rule)
print("MODEL COMPARISON AND CONCLUSIONS")
print(summary_rule)

classification_lines = (
    "\n📊 KNN CLASSIFICATION (No-Show Appointments)",
    f"Best Accuracy: {max(testing_accuracy):.4f}",
    "Task: Binary classification (show/no-show)",
    "Key Features: Patient demographics, appointment details",
)
regression_lines = (
    "\n📈 KNN REGRESSION (Concrete Strength)",
    f"Best R² Score: {final_r2:.4f}",
    f"Best RMSE: {final_rmse:.4f}",
    "Task: Regression (predict compressive strength)",
    "Key Features: Concrete ingredient proportions, age",
)
insight_lines = (
    "\n🔑 KEY INSIGHTS:",
    "1. KNN performs well for both classification and regression tasks",
    "2. Feature scaling is crucial for KNN regression",
    "3. Optimal k varies between datasets and tasks",
    "4. Model interpretability is a strength of KNN",
    "5. Computational cost increases with dataset size",
)

# One print call per line, in the same order as the three groups above.
for summary_line in classification_lines + regression_lines + insight_lines:
    print(summary_line)