In [1]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the "Dataset" sheet of the workbook into a DataFrame.
# ExcelFile keeps an open handle on the workbook, so it is used as a context
# manager to guarantee the handle is released once the sheet has been read.
file_path = "dummy_npi_data.xlsx"
with pd.ExcelFile(file_path) as xls:
    df = pd.read_excel(xls, sheet_name="Dataset")

In [None]:
# Derive the hour-of-day feature from the login timestamp.
# pd.to_datetime leaves a column that is already datetime64 unchanged and
# parses it first when the timestamps were read as text, a case in which the
# .dt accessor alone would raise AttributeError.
df["Login Hour"] = pd.to_datetime(df["Login Time"]).dt.hour


In [None]:
# Replace each categorical column with integer codes.
# One LabelEncoder is fitted per column and kept in `label_encoders`
# (keyed by column name) so the code -> label mapping stays available.
categorical_columns = ("State", "Region", "Speciality")
label_encoders = {
    name: LabelEncoder().fit(df[name]) for name in categorical_columns
}
for name, encoder in label_encoders.items():
    # Overwrite the original labels with the codes of the fitted encoder.
    df[name] = encoder.transform(df[name])


In [None]:
# Remove the raw login/logout timestamp columns from the frame; every column
# still present afterwards (other than "NPI" and the survey-attempt count) is
# used as a model feature further down.
raw_time_columns = ["Login Time", "Logout Time"]
df = df.drop(raw_time_columns, axis="columns")

In [None]:

# Build the feature matrix and a binary target, train a random forest,
# report its accuracy on the held-out rows, and persist the artefacts.

# Features: every remaining column except "NPI" and the raw survey-attempt
# count that the target is derived from.
X = df.drop(columns=["NPI", "Count of Survey Attempts"])
# Target: 1 where "Count of Survey Attempts" is greater than 0, otherwise 0.
y = (df["Count of Survey Attempts"] > 0).astype(int)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split so the saved model comes with a quality
# figure (mean accuracy on rows the model was not trained on).
test_accuracy = model.score(X_test, y_test)
print(f"Held-out accuracy: {test_accuracy:.3f}")

# Save model, encoders and data.
# The fitted label encoders are stored next to the model: without them the
# integer codes the model was trained on cannot be reproduced for new rows
# or mapped back to the original labels.
joblib.dump(model, "npi_rf_model.pkl")
joblib.dump(label_encoders, "npi_label_encoders.pkl")
df.to_csv("processed_npi_data.csv", index=False)

print("Model and processed data saved successfully!")
