In [1]:
import pandas as pd
import requests
from joblib import load, dump  # Import the dump function
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")


# URL of the JSON endpoint
url = "https://data.seattle.gov/resource/kzjm-xkqj.json"

# Send a GET request to the URL
response = requests.get(url)

# Read JSON data into a DataFrame regardless of the response status code
df = pd.read_json(response.text)

# Drop columns from 7 to 11
df = df.drop(df.columns[7:12], axis=1)

# Drop rows with missing values
df = df.dropna()

# Handle outliers using the IQR method
Q1_long = df['longitude'].quantile(0.25)
Q3_long = df['longitude'].quantile(0.75)
IQR_long = Q3_long - Q1_long
outlier_threshold_long = 1.5 * IQR_long

Q1_lat = df['latitude'].quantile(0.25)
Q3_lat = df['latitude'].quantile(0.75)
IQR_lat = Q3_lat - Q1_lat
outlier_threshold_lat = 1.5 * IQR_lat

# Removing outliers for both Longitude and Latitude
df = df[
    (df['longitude'] >= (Q1_long - outlier_threshold_long)) &
    (df['longitude'] <= (Q3_long + outlier_threshold_long)) &
    (df['latitude'] >= (Q1_lat - outlier_threshold_lat)) &
    (df['latitude'] <= (Q3_lat + outlier_threshold_lat))
]

df = df.copy()

# Converting the datatype to datetime
df['datetime'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %I:%M:%S %p')

# Splitting the date field into separate columns
df['Month'] = df['datetime'].dt.month
df['Hour'] = df['datetime'].dt.hour
df['AM_PM'] = df['datetime'].dt.strftime('%p')  

# Dropping the Date column for simple processing:
df = df.drop(['datetime', 'report_location', 'incident_number', 'address'], axis=1)

# Encoding the 'Type' column using Label Encoding
label_encoder = LabelEncoder()
df['Encoded_Type'] = label_encoder.fit_transform(df['type'])

# Dropping the type column after encoding
df = df.drop(['type'], axis=1)

# Mapping the corresponding values to "AM_PM" field to make it a numerical column
df['AM_PM'] = df['AM_PM'].map({'AM': 1, 'PM': 0})
# Now all the fields have been converted to numerical data

boundary_longitude = 180.0

# Function to convert negative longitude to positive while maintaining the geographical meaning
def convert_longitude(longitude):
    positive_longitude = boundary_longitude + longitude
    if positive_longitude > 180.0:
        positive_longitude -= 360.0
    return positive_longitude

# Apply the conversion function to the 'Longitude' column
df['longitude'] = df['longitude'].apply(convert_longitude)

# Define the features (latitude, longitude, Month, Hour, AM_PM)
X = df[['latitude', 'longitude', 'Month', 'Hour', 'AM_PM']]

# Define the target variable (Encoded_Type)
y = df['Encoded_Type']

# Instantiate RandomOverSampler
oversampler = RandomOverSampler(random_state=42)

# Resample the dataset
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Create a new DataFrame with balanced data
balanced_df = pd.DataFrame(X_resampled, columns=X.columns)
balanced_df['Encoded_Type'] = y_resampled

# Preparing the data to predict the time or temporal patterns of emergency calls
X = df.drop(columns=['Encoded_Type'])
y = df['Encoded_Type']

# Preprocessing: Standardize numerical variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the test data to a CSV file
test_data_csv_path = "/Users/shubhamgaur/Desktop/HackPOCTest.csv"
test_data_json_path = "/Users/shubhamgaur/Desktop/HackPOCTest.json"

test_df = X_test.copy()
test_df['Encoded_Type'] = y_test

# Save test data to CSV
test_df.to_csv(test_data_csv_path, index=False)
print("Test data saved successfully to CSV:", test_data_csv_path)

# Save test data to JSON
test_df.to_json(test_data_json_path, orient='records')
print("Test data saved successfully to JSON:", test_data_json_path)

# Instantiate RandomOverSampler
oversampler = RandomOverSampler(random_state=42)

# Resample the training dataset
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

# Create a new DataFrame with balanced data
X_train_resampled = pd.DataFrame(X_resampled, columns=X_train.columns)
y_train_resampled = pd.Series(y_resampled)

# Logistic Regression
logistic_model = LogisticRegression(max_iter=1000) 

logistic_model.fit(X_train_resampled, y_train_resampled)
logistic_predic = logistic_model.predict(X_test)
# Calculating the values
log_accuracy = accuracy_score(y_test, logistic_predic)
log_preci = precision_score(y_test, logistic_predic, zero_division=0, average='macro')
log_recall = recall_score(y_test, logistic_predic, average='macro')

print("Logistic Regression Accuracy:", log_accuracy)
print("Logistic Regression Precision:", log_preci)
print("Logistic Regression Recall:", log_recall)

# Save logistic regression model
dump(logistic_model, 'logistic_regression_model.joblib')

# Perform grid search for K-NN model
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Create K-NN model
knn_model = KNeighborsClassifier()

# Perform grid search with cross-validation
grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best parameters and model
best_params = grid_search.best_params_
best_knn_model = grid_search.best_estimator_

# Predict using the best model
knn_pred = best_knn_model.predict(X_test)

# Calculate evaluation metrics
knn_acc = accuracy_score(y_test, knn_pred)
knn_prec = precision_score(y_test, knn_pred, zero_division=0, average='macro')
knn_recall = recall_score(y_test, knn_pred, zero_division=0, average='macro')

# Print evaluation metrics
print("Best K-NN model parameters:", best_params)
print("K-NN Accuracy:", knn_acc)
print("K-NN Precision:", knn_prec)
print("K-NN Recall:", knn_recall)

# Save K-NN model
dump(best_knn_model, 'knn_model.joblib')


Test data saved successfully to CSV: /Users/shubhamgaur/Desktop/HackPOCTest.csv
Test data saved successfully to JSON: /Users/shubhamgaur/Desktop/HackPOCTest.json
Logistic Regression Accuracy: 0.01015228426395939
Logistic Regression Precision: 0.003774680603948897
Logistic Regression Recall: 0.03252032520325203
Best K-NN model parameters: {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}
K-NN Accuracy: 0.09644670050761421
K-NN Precision: 0.037538580246913575
K-NN Recall: 0.07841050275260802


['knn_model.joblib']