In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

from sklearn.model_selection import cross_val_score

In [2]:
# Load the dataset
df = pd.read_csv('Kaggle-Appointment.csv')

In [5]:
print(df.columns)

Index(['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay',
       'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension',
       'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show'],
      dtype='object')


In [6]:
print(df.head())

      PatientId  AppointmentID Gender          ScheduledDay  \
0  2.987250e+13        5642903      F  2016-04-29T18:38:08Z   
1  5.589978e+14        5642503      M  2016-04-29T16:08:27Z   
2  4.262962e+12        5642549      F  2016-04-29T16:19:04Z   
3  8.679512e+11        5642828      F  2016-04-29T17:29:31Z   
4  8.841186e+12        5642494      F  2016-04-29T16:07:23Z   

         AppointmentDay  Age      Neighbourhood  Scholarship  Hipertension  \
0  2016-04-29T00:00:00Z   62    JARDIM DA PENHA            0             1   
1  2016-04-29T00:00:00Z   56    JARDIM DA PENHA            0             0   
2  2016-04-29T00:00:00Z   62      MATA DA PRAIA            0             0   
3  2016-04-29T00:00:00Z    8  PONTAL DE CAMBURI            0             0   
4  2016-04-29T00:00:00Z   56    JARDIM DA PENHA            0             1   

   Diabetes  Alcoholism  Handcap  SMS_received No-show  
0         0           0        0             0      No  
1         0           0        0      

In [3]:
print(df.tail())

           PatientId  AppointmentID Gender          ScheduledDay  \
110522  2.572134e+12        5651768      F  2016-05-03T09:15:35Z   
110523  3.596266e+12        5650093      F  2016-05-03T07:27:33Z   
110524  1.557663e+13        5630692      F  2016-04-27T16:03:52Z   
110525  9.213493e+13        5630323      F  2016-04-27T15:09:23Z   
110526  3.775115e+14        5629448      F  2016-04-27T13:30:56Z   

              AppointmentDay  Age Neighbourhood  Scholarship  Hipertension  \
110522  2016-06-07T00:00:00Z   56   MARIA ORTIZ            0             0   
110523  2016-06-07T00:00:00Z   51   MARIA ORTIZ            0             0   
110524  2016-06-07T00:00:00Z   21   MARIA ORTIZ            0             0   
110525  2016-06-07T00:00:00Z   38   MARIA ORTIZ            0             0   
110526  2016-06-07T00:00:00Z   54   MARIA ORTIZ            0             0   

        Diabetes  Alcoholism  Handcap  SMS_received No-show  
110522         0           0        0             1      No 

In [7]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB
None


In [8]:
# Display summary statistics
print(df.describe())

          PatientId  AppointmentID            Age    Scholarship  \
count  1.105270e+05   1.105270e+05  110527.000000  110527.000000   
mean   1.474963e+14   5.675305e+06      37.088874       0.098266   
std    2.560949e+14   7.129575e+04      23.110205       0.297675   
min    3.921784e+04   5.030230e+06      -1.000000       0.000000   
25%    4.172614e+12   5.640286e+06      18.000000       0.000000   
50%    3.173184e+13   5.680573e+06      37.000000       0.000000   
75%    9.439172e+13   5.725524e+06      55.000000       0.000000   
max    9.999816e+14   5.790484e+06     115.000000       1.000000   

        Hipertension       Diabetes     Alcoholism        Handcap  \
count  110527.000000  110527.000000  110527.000000  110527.000000   
mean        0.197246       0.071865       0.030400       0.022248   
std         0.397921       0.258265       0.171686       0.161543   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       

In [3]:
# Check for missing values
print("Missing values:\n", df.isnull().sum())

Missing values:
 PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64


In [4]:
# Check the distribution of the target variable 'No-show'
print("Distribution of 'No-show':\n", df['No-show'].value_counts())

Distribution of 'No-show':
 No-show
No     88208
Yes    22319
Name: count, dtype: int64


In [5]:
# Data Preprocessing
# Convert 'No-show' column to binary (1 for 'Yes' and 0 for 'No')
df['No-show'] = df['No-show'].apply(lambda x: 1 if x == 'Yes' else 0)

In [6]:
# Drop unnecessary columns like PatientId, AppointmentID, and Neighbourhood
df.drop(['PatientId', 'AppointmentID', 'Neighbourhood'], axis=1, inplace=True)

In [7]:
# # Save the cleaned and clustered dataset to a new CSV file
df.to_csv('Kaggle-Appointment_cleaned.csv', index=False)

In [13]:
# Handle categorical variables
label_encoders = {}


for col in df.select_dtypes(include=['object']).columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

In [14]:
# Split the dataset into features and target variable
X = df.drop('No-show', axis=1)
y = df['No-show']

In [15]:
# Split the df into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
linear_feature_names = X_train.columns.tolist()
print(linear_feature_names)

['Gender', 'ScheduledDay', 'AppointmentDay', 'Age', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received']


# Model 1: Linear Regression

In [17]:
# Model 1: Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
linear_y_pred = linear_model.predict(X_test)
linear_r2 = r2_score(y_test, linear_y_pred)

In [18]:
print(f"Linear Regression R-squared: {linear_r2}")

Linear Regression R-squared: 0.048357483667870405


# Model 2: Decision Tree Regression (CART)

In [19]:
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)
tree_y_pred = tree_model.predict(X_test)
tree_r2 = r2_score(y_test, tree_y_pred)

In [20]:
print(f"Decision Tree Regression R-squared: {tree_r2}")

Decision Tree Regression R-squared: -0.6974901442852997


# Model Comparison

In [21]:
print("Model Comparison:")
print(f"Linear Regression R-squared: {linear_r2}")
print(f"Decision Tree Regression R-squared: {tree_r2}")

Model Comparison:
Linear Regression R-squared: 0.048357483667870405
Decision Tree Regression R-squared: -0.6974901442852997


In [22]:
from datetime import datetime

In [23]:
# Make predictions using both models on new predictors
new_predictors = pd.DataFrame({
    'Gender': [0, 1],
    'ScheduledDay': ['2024-04-29T00:00:00Z', '2024-04-29T00:00:00Z'],  
    'AppointmentDay': ['2024-04-29T18:38:08Z', '2024-04-29T16:08:27Z'],  
    'Age': [30, 45],
    'Scholarship': [1, 0],
    'Hipertension': [0, 1],
    'Diabetes': [1, 0],
    'Alcoholism': [0, 0],
    'Handcap': [0, 1],
    'SMS_received': [1, 1]
}, columns=['Gender', 'ScheduledDay', 'AppointmentDay', 'Age', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received'])

In [24]:
# Convert ISO format to Unix timestamp format for model compatibility
new_predictors['AppointmentDay'] = pd.to_datetime(new_predictors['AppointmentDay']).astype(int) / 10**9
new_predictors['ScheduledDay'] = pd.to_datetime(new_predictors['ScheduledDay']).astype(int) / 10**9

In [25]:
# Encode categorical variables in new predictors
for col in new_predictors.select_dtypes(include=['object']).columns:
    if col in label_encoders:
        new_predictors[col] = label_encoders[col].transform(new_predictors[col])

In [26]:
# Make predictions using both models
linear_new_pred = linear_model.predict(new_predictors)
tree_new_pred = tree_model.predict(new_predictors)

In [27]:
print("\nPredictions using Linear Regression:")
print(linear_new_pred)
print("\nPredictions using Decision Tree Regression:")
print(tree_new_pred)


Predictions using Linear Regression:
[15730809.66850501 15730727.16734266]

Predictions using Decision Tree Regression:
[1. 1.]


# Linear Regression:
Prediction 1: 15730809.67
Prediction 2: 15730727.17

# Decision Tree Regression:
Prediction 1: 1
Prediction 2: 1

Linear regression predicts numerical values, in this case, the predictions are large numerical values. These predictions don't directly represent whether a patient will show up for their appointment or not. 

Decision tree regression predicts categorical values, often in the form of classes or categories. In this case, the predictions are binary (1 or 0). Since the target variable ('No-show') is transformed into binary form (1 for 'Yes' and 0 for 'No'), the decision tree model predicts whether a patient will show up for the appointment (1) or not (0) based on the input features.

In summary, the linear regression model predicts numerical values that don't directly indicate whether a patient will show up or not, while the decision tree regression model predicts binary outcomes representing the likelihood of a patient showing up for the appointment.

In [28]:
# Perform 10-fold cross-validation for Linear Regression
linear_cv_scores = cross_val_score(linear_model, X, y, cv=10, scoring='r2')
print("Linear Regression Cross-Validation R-squared Scores:")
print(linear_cv_scores)
print(f"Mean R-squared: {linear_cv_scores.mean()}")

Linear Regression Cross-Validation R-squared Scores:
[0.03968627 0.07628993 0.06333528 0.06593845 0.03034069 0.05984338
 0.0549876  0.06907902 0.02631173 0.04161396]
Mean R-squared: 0.05274263108488375


In [29]:
# Perform 10-fold cross-validation for Decision Tree Regression
tree_cv_scores = cross_val_score(tree_model, X, y, cv=10, scoring='r2')
print("\nDecision Tree Regression Cross-Validation R-squared Scores:")
print(tree_cv_scores)
print(f"Mean R-squared: {tree_cv_scores.mean()}")


Decision Tree Regression Cross-Validation R-squared Scores:
[-1.01271359 -0.81032999 -0.79154499 -0.78506757 -0.73155538 -0.76555429
 -0.80466516 -0.76521364 -0.91875305 -0.82176083]
Mean R-squared: -0.8207158504961505
