In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from fastapi import FastAPI, Query
import pandas as pd
import joblib
from datetime import datetime
from fastapi.responses import FileResponse
import unicorn

In [49]:
df = pd.read_excel(r"C:\Users\POOJA\Downloads\dummy_npi_data.xlsx")

In [50]:
df.head()

Unnamed: 0,NPI,State,Login Time,Logout Time,Usage Time (mins),Region,Speciality,Count of Survey Attempts
0,1000000000,NY,2025-03-08 06:09:00,2025-03-08 06:28:00,19,Northeast,Cardiology,3
1,1000000001,MI,2025-03-08 12:28:00,2025-03-08 13:10:00,42,Midwest,Oncology,5
2,1000000002,CA,2025-03-08 15:11:00,2025-03-08 15:37:00,26,West,Oncology,8
3,1000000003,TX,2025-03-08 14:17:00,2025-03-08 15:36:00,79,Northeast,Orthopedics,9
4,1000000004,GA,2025-03-08 15:59:00,2025-03-08 17:37:00,98,West,Oncology,0


In [51]:
df.shape

(1000, 8)

In [52]:
df.isnull().sum()

NPI                         0
State                       0
Login Time                  0
Logout Time                 0
Usage Time (mins)           0
Region                      0
Speciality                  0
Count of Survey Attempts    0
dtype: int64

In [53]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
NPI,1000.0,1000000499.5,1000000000.0,1000000249.75,1000000499.5,1000000749.25,1000000999.0,288.819436
Login Time,1000.0,2025-03-08 13:20:15.300000,2025-03-08 06:00:00,2025-03-08 09:42:30,2025-03-08 13:10:30,2025-03-08 17:03:15,2025-03-08 20:58:00,
Logout Time,1000.0,2025-03-08 14:24:48.540000256,2025-03-08 06:08:00,2025-03-08 10:42:45,2025-03-08 14:15:30,2025-03-08 18:08:15,2025-03-08 22:46:00,
Usage Time (mins),1000.0,64.554,5.0,36.0,66.0,93.25,120.0,33.294288
Count of Survey Attempts,1000.0,4.991,0.0,2.0,5.0,8.0,10.0,3.13955


In [54]:
df.describe(include=object).T

Unnamed: 0,count,unique,top,freq
State,1000,10,IL,117
Region,1000,4,West,258
Speciality,1000,7,Oncology,154


In [55]:
df.columns

Index(['NPI', 'State', 'Login Time', 'Logout Time', 'Usage Time (mins)',
       'Region', 'Speciality', 'Count of Survey Attempts'],
      dtype='object')

In [56]:
num_col = df.select_dtypes(include=['int','float']).columns
cat_Col = df.select_dtypes(exclude=['int','float']).columns.tolist()
print(num_col)
print(cat_Col)

Index(['NPI', 'Usage Time (mins)', 'Count of Survey Attempts'], dtype='object')
['State', 'Login Time', 'Logout Time', 'Region', 'Speciality']


In [57]:
df['Login Time']

0     2025-03-08 06:09:00
1     2025-03-08 12:28:00
2     2025-03-08 15:11:00
3     2025-03-08 14:17:00
4     2025-03-08 15:59:00
              ...        
995   2025-03-08 11:54:00
996   2025-03-08 17:09:00
997   2025-03-08 18:44:00
998   2025-03-08 17:21:00
999   2025-03-08 14:01:00
Name: Login Time, Length: 1000, dtype: datetime64[ns]

In [58]:
# df['Login Time'] = str(df['Login Time'])
# df['Logout Time'] = str(df['Logout Time'])

In [59]:
df.dtypes

NPI                                  int64
State                               object
Login Time                  datetime64[ns]
Logout Time                 datetime64[ns]
Usage Time (mins)                    int64
Region                              object
Speciality                          object
Count of Survey Attempts             int64
dtype: object

In [60]:
df['Usage Time (mins)'].max()

np.int64(120)

In [61]:
print(df['Region'].unique()) 
print(df['Region'].nunique())

['Northeast' 'Midwest' 'West' 'South']
4


In [62]:

print(df['Speciality'].unique())
print(df['Speciality'].nunique())

['Cardiology' 'Oncology' 'Orthopedics' 'General Practice' 'Pediatrics'
 'Neurology' 'Radiology']
7


In [63]:
df.columns

Index(['NPI', 'State', 'Login Time', 'Logout Time', 'Usage Time (mins)',
       'Region', 'Speciality', 'Count of Survey Attempts'],
      dtype='object')

In [64]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   NPI                       1000 non-null   int64         
 1   State                     1000 non-null   object        
 2   Login Time                1000 non-null   datetime64[ns]
 3   Logout Time               1000 non-null   datetime64[ns]
 4   Usage Time (mins)         1000 non-null   int64         
 5   Region                    1000 non-null   object        
 6   Speciality                1000 non-null   object        
 7   Count of Survey Attempts  1000 non-null   int64         
dtypes: datetime64[ns](2), int64(3), object(3)
memory usage: 62.6+ KB


In [65]:

# Convert time-related fields to datetime format
df['Login Time'] = pd.to_datetime(df['Login Time'], errors='coerce')
df['Logout Time'] = pd.to_datetime(df['Logout Time'], errors='coerce')


In [66]:

# Remove rows with invalid datetime values
df.dropna(subset=['Login Time', 'Logout Time'], inplace=True)


In [67]:
df

Unnamed: 0,NPI,State,Login Time,Logout Time,Usage Time (mins),Region,Speciality,Count of Survey Attempts
0,1000000000,NY,2025-03-08 06:09:00,2025-03-08 06:28:00,19,Northeast,Cardiology,3
1,1000000001,MI,2025-03-08 12:28:00,2025-03-08 13:10:00,42,Midwest,Oncology,5
2,1000000002,CA,2025-03-08 15:11:00,2025-03-08 15:37:00,26,West,Oncology,8
3,1000000003,TX,2025-03-08 14:17:00,2025-03-08 15:36:00,79,Northeast,Orthopedics,9
4,1000000004,GA,2025-03-08 15:59:00,2025-03-08 17:37:00,98,West,Oncology,0
...,...,...,...,...,...,...,...,...
995,1000000995,PA,2025-03-08 11:54:00,2025-03-08 12:54:00,60,West,Neurology,0
996,1000000996,NY,2025-03-08 17:09:00,2025-03-08 17:52:00,43,Midwest,Neurology,8
997,1000000997,CA,2025-03-08 18:44:00,2025-03-08 19:46:00,62,Northeast,Cardiology,1
998,1000000998,NY,2025-03-08 17:21:00,2025-03-08 19:04:00,103,Midwest,Oncology,6


In [68]:

# Compute Active Duration in minutes
df['Active Duration'] = (df['Logout Time'] - df['Login Time']).dt.total_seconds() / 60


In [69]:
df.head()

Unnamed: 0,NPI,State,Login Time,Logout Time,Usage Time (mins),Region,Speciality,Count of Survey Attempts,Active Duration
0,1000000000,NY,2025-03-08 06:09:00,2025-03-08 06:28:00,19,Northeast,Cardiology,3,19.0
1,1000000001,MI,2025-03-08 12:28:00,2025-03-08 13:10:00,42,Midwest,Oncology,5,42.0
2,1000000002,CA,2025-03-08 15:11:00,2025-03-08 15:37:00,26,West,Oncology,8,26.0
3,1000000003,TX,2025-03-08 14:17:00,2025-03-08 15:36:00,79,Northeast,Orthopedics,9,79.0
4,1000000004,GA,2025-03-08 15:59:00,2025-03-08 17:37:00,98,West,Oncology,0,98.0


In [70]:

# Drop unneeded columns if necessary (modify as needed)
# df.drop(columns=['Unnecessary_Column'], inplace=True)

# Handle categorical variables
df = pd.get_dummies(df, columns=['Speciality', 'Region'], drop_first=True)


In [71]:

# Save cleaned dataset for further use
df.to_csv("cleaned_dataset.csv", index=False)

print("Data preprocessing complete. Cleaned dataset saved as 'cleaned_dataset.csv'.")


Data preprocessing complete. Cleaned dataset saved as 'cleaned_dataset.csv'.


In [72]:
# Load the cleaned dataset
df = pd.read_csv(r"D:\certisured ML\Machine Learning\my own\cleaned_dataset.csv")


In [73]:
df.head()

Unnamed: 0,NPI,State,Login Time,Logout Time,Usage Time (mins),Count of Survey Attempts,Active Duration,Speciality_General Practice,Speciality_Neurology,Speciality_Oncology,Speciality_Orthopedics,Speciality_Pediatrics,Speciality_Radiology,Region_Northeast,Region_South,Region_West
0,1000000000,NY,2025-03-08 06:09:00,2025-03-08 06:28:00,19,3,19.0,False,False,False,False,False,False,True,False,False
1,1000000001,MI,2025-03-08 12:28:00,2025-03-08 13:10:00,42,5,42.0,False,False,True,False,False,False,False,False,False
2,1000000002,CA,2025-03-08 15:11:00,2025-03-08 15:37:00,26,8,26.0,False,False,True,False,False,False,False,False,True
3,1000000003,TX,2025-03-08 14:17:00,2025-03-08 15:36:00,79,9,79.0,False,False,False,True,False,False,True,False,False
4,1000000004,GA,2025-03-08 15:59:00,2025-03-08 17:37:00,98,0,98.0,False,False,True,False,False,False,False,False,True


In [74]:
# Define the target variable: infer attendance based on login trends
# Assuming doctors who logged in frequently around a given time are more likely to attend

# Creating a new target variable based on login time
threshold_minutes = 30  # Considered active if logged in within this window


In [75]:
# Ensure 'Login Time' is in numeric hour format
df['Login Time'] = pd.to_datetime(df['Login Time'], errors='coerce').dt.hour

# Find peak login hour (most common login time)
peak_login_time = df['Login Time'].mode()[0]  

# Define attendance based on login trends (within ±30 minutes converted to hour range)
df['Attended Survey'] = ((df['Login Time'] >= (peak_login_time - 1)) & 
                         (df['Login Time'] <= (peak_login_time + 1))).astype(int)


In [76]:
df['Logout Time'] = pd.to_datetime(df['Logout Time'], errors='coerce').dt.hour

In [77]:
peak_login_time

np.int32(13)

In [78]:
# df['NPI'] = df['NPI']-1000000000

In [79]:
df.head()

Unnamed: 0,NPI,State,Login Time,Logout Time,Usage Time (mins),Count of Survey Attempts,Active Duration,Speciality_General Practice,Speciality_Neurology,Speciality_Oncology,Speciality_Orthopedics,Speciality_Pediatrics,Speciality_Radiology,Region_Northeast,Region_South,Region_West,Attended Survey
0,1000000000,NY,6,6,19,3,19.0,False,False,False,False,False,False,True,False,False,0
1,1000000001,MI,12,13,42,5,42.0,False,False,True,False,False,False,False,False,False,1
2,1000000002,CA,15,15,26,8,26.0,False,False,True,False,False,False,False,False,True,0
3,1000000003,TX,14,15,79,9,79.0,False,False,False,True,False,False,True,False,False,1
4,1000000004,GA,15,17,98,0,98.0,False,False,True,False,False,False,False,False,True,0


In [80]:

# Define features and target
y = df['Attended Survey']


In [81]:
# Identify categorical columns dynamically
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()  # Detect text columns

# Apply one-hot encoding to categorical features
X = pd.get_dummies(df.drop(columns=['Attended Survey']), columns=categorical_cols, drop_first=True)


In [82]:

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [83]:
X_train.dtypes

NPI                              int64
Login Time                       int32
Logout Time                      int32
Usage Time (mins)                int64
Count of Survey Attempts         int64
Active Duration                float64
Speciality_General Practice       bool
Speciality_Neurology              bool
Speciality_Oncology               bool
Speciality_Orthopedics            bool
Speciality_Pediatrics             bool
Speciality_Radiology              bool
Region_Northeast                  bool
Region_South                      bool
Region_West                       bool
State_FL                          bool
State_GA                          bool
State_IL                          bool
State_MI                          bool
State_NC                          bool
State_NY                          bool
State_OH                          bool
State_PA                          bool
State_TX                          bool
dtype: object

In [84]:

# Train Model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [85]:

# Evaluate Model
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')


Accuracy: 1.00


In [86]:

# Save Model
joblib.dump(model, 'npi_survey_model.pkl')
print("Model training complete. Model saved as 'npi_survey_model.pkl'.")


Model training complete. Model saved as 'npi_survey_model.pkl'.


In [87]:
app = FastAPI()


In [88]:

# Load the trained model
model = joblib.load(r"D:\certisured ML\Machine Learning\my own\npi_survey_model.pkl")

df = pd.read_csv(r"D:\certisured ML\Machine Learning\my own\cleaned_dataset.csv")  # Load dataset for reference


In [89]:
print(df.columns)


Index(['NPI', 'State', 'Login Time', 'Logout Time', 'Usage Time (mins)',
       'Count of Survey Attempts', 'Active Duration',
       'Speciality_General Practice', 'Speciality_Neurology',
       'Speciality_Oncology', 'Speciality_Orthopedics',
       'Speciality_Pediatrics', 'Speciality_Radiology', 'Region_Northeast',
       'Region_South', 'Region_West'],
      dtype='object')


In [None]:
df['Login Time'] = pd.to_datetime(df['Login Time'], errors='coerce').dt.hour

In [None]:
@app.get("/predict/")
def predict_survey(time: str = Query(..., description="Enter time in HH:MM format")):
    """
    Predicts which NPIs (doctors) are most likely to attend the survey at the given time.
    Returns a downloadable CSV file.
    """
    try:
        # Convert input time to hour
        input_hour = datetime.strptime(time, "%H:%M").hour

        if 'Login Time' not in df.columns:
            return {"error": "Column 'Login Time' not found in dataset. Check CSV file."}

        eligible_doctors = df[(df['Login Time'] >= input_hour - 1) & (df['Login Time'] <= input_hour + 1)]


        if eligible_doctors.empty:
            return {"message": "No doctors found for this time."}

        # Predict likelihood of attending survey
        X_input = eligible_doctors.drop(columns=['NPI'])  # Remove identifier column if needed
        predictions = model.predict(X_input)

        # Select only doctors predicted to attend
        eligible_doctors['Prediction'] = predictions
        final_doctors = eligible_doctors[eligible_doctors['Prediction'] == 1]

        if final_doctors.empty:
            return {"message": "No doctors predicted to attend at this time."}

        # Save to CSV
        output_file = "predicted_doctors.csv"
        final_doctors[['NPI']].to_csv(output_file, index=False)

        return FileResponse(output_file, filename="predicted_doctors.csv", media_type='text/csv')

    except Exception as e:
        return {"error": str(e)}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000, loop="uvloop" if hasattr(uvicorn, 'loop') else None)


AttributeError: module 'unicorn' has no attribute 'run'