In [61]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from sklearn.model_selection import cross_val_score

In [42]:
# Load the dataset
df = pd.read_csv('Kaggle-Appointment.csv')

In [43]:
#  Handle missing values (if any)
df.fillna(method='ffill', inplace=True)  # Forward fill missing values

In [44]:
# Drop unnecessary columns like PatientId, AppointmentID, and Neighbourhood
df.drop(['PatientId', 'AppointmentID', 'Neighbourhood'], axis=1, inplace=True)

In [45]:
#  Encode categorical variables
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

In [46]:
# Split the data into features (X) and target variable (y)
X = df.drop('No-show', axis=1)  # Assuming 'No-show' is the target variable
y = df['No-show']

In [47]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
# Scale numerical features (if any)
scaler = StandardScaler()
numerical_cols = X.select_dtypes(include=['int', 'float']).columns
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [49]:
# Train Logistic Regression model
logreg_model = LogisticRegression(random_state=42)
logreg_model.fit(X_train, y_train)

In [50]:
# Make predictions using Logistic Regression model
logreg_y_pred = logreg_model.predict(X_test)

In [51]:
# Calculate accuracy of Logistic Regression model
logreg_accuracy = accuracy_score(y_test, logreg_y_pred)
print(f"Logistic Regression Accuracy: {logreg_accuracy}")

Logistic Regression Accuracy: 0.7955306251696372


In [52]:
# Train K-Nearest Neighbors model with adjusted n_neighbors
knn_model = KNeighborsClassifier(n_neighbors=3)  # Adjust n_neighbors as needed
try:
    knn_model.fit(X_train, y_train)
except KeyboardInterrupt:
    print("Training interrupted.")

# Make predictions using K-Nearest Neighbors model
knn_y_pred = knn_model.predict(X_test)

# Calculate accuracy of K-Nearest Neighbors model
knn_accuracy = accuracy_score(y_test, knn_y_pred)
print(f"K-Nearest Neighbors Accuracy: {knn_accuracy}")

K-Nearest Neighbors Accuracy: 0.7548629331403239


In [53]:
# Make predictions using both models on new predictors
new_predictors = pd.DataFrame({
    'Gender': [0, 1],
    'ScheduledDay': ['2024-04-29T00:00:00Z', '2024-04-29T00:00:00Z'],  
    'AppointmentDay': ['2024-04-29T18:38:08Z', '2024-04-29T16:08:27Z'],  
    'Age': [30, 45],
    'Scholarship': [1, 0],
    'Hipertension': [0, 1],
    'Diabetes': [1, 0],
    'Alcoholism': [0, 0],
    'Handcap': [0, 1],
    'SMS_received': [1, 1]
}, columns=['Gender', 'ScheduledDay', 'AppointmentDay', 'Age', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received'])

In [54]:
# Convert ISO format to Unix timestamp format for model compatibility
new_predictors['AppointmentDay'] = pd.to_datetime(new_predictors['AppointmentDay']).astype(int) / 10**9
new_predictors['ScheduledDay'] = pd.to_datetime(new_predictors['ScheduledDay']).astype(int) / 10**9

In [55]:
# Encode categorical variables in new data
for col in new_predictors.select_dtypes(include=['object']).columns:
    if col in label_encoders:
        new_predictors[col] = label_encoders[col].transform(new_predictors[col])

In [56]:
# Scale numerical features in new data
new_predictors[numerical_cols] = scaler.transform(new_predictors[numerical_cols])

In [57]:
# Make predictions using both models on new data
logreg_new_pred = logreg_model.predict(new_predictors)
knn_new_pred = knn_model.predict(new_predictors)

In [58]:
print("\nPredictions using Logistic Regression:")
print(logreg_new_pred)


Predictions using Logistic Regression:
[1 1]


In [59]:
print("\nPredictions using K-Nearest Neighbors:")
print(knn_new_pred)


Predictions using K-Nearest Neighbors:
[0 0]


These predictions highlight the different approaches and outcomes of the two models. Logistic Regression is a linear model that estimates probabilities, making it suitable for binary classification tasks like predicting appointment show-ups. K-Nearest Neighbors, on the other hand, is a non-linear classifier that classifies data points based on their nearest neighbors, which may lead to different predictions compared to Logistic Regression.

# K CROSS FOLD VALIDATION (K=10)

In [62]:
# Perform 10-fold cross-validation for Logistic Regression
logreg_cv_scores = cross_val_score(logreg_model, X, y, cv=10)
print("Logistic Regression Cross-Validation Scores:")
print(logreg_cv_scores)
print(f"Mean Accuracy: {logreg_cv_scores.mean()}")

Logistic Regression Cross-Validation Scores:
[0.79679725 0.79715914 0.79598299 0.7966163  0.79724962 0.78874514
 0.79924003 0.78248281 0.79207383 0.78483532]
Mean Accuracy: 0.7931182429382544


In [63]:
# Perform 10-fold cross-validation for K-Nearest Neighbors
knn_cv_scores = cross_val_score(knn_model, X, y, cv=10)
print("\nK-Nearest Neighbors Cross-Validation Scores:")
print(knn_cv_scores)
print(f"Mean Accuracy: {knn_cv_scores.mean()}")


K-Nearest Neighbors Cross-Validation Scores:
[0.71989505 0.74206098 0.73608975 0.73708495 0.72279019 0.72197593
 0.75517959 0.74547593 0.76646761 0.76366269]
Mean Accuracy: 0.7410682674950251
