In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from itertools import product

In [2]:
project_root = os.path.abspath(os.path.join(os.getcwd(), '..')) 
processed_DPW_path = os.path.join(project_root, 'data', 'processed', 'DPW_data_r3y.csv')
df = pd.read_csv(processed_DPW_path)

In [7]:
df.info()
date_columns = ['create_date_et', 'create_date_utc', 'last_action_et', 
                'last_action_utc', 'closed_date_et', 'closed_date_utc']

for col in date_columns:
    df[col] = pd.to_datetime(df[col])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40606 entries, 0 to 40605
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   _id                40606 non-null  int64  
 1   group_id           40606 non-null  int64  
 2   num_requests       40606 non-null  int64  
 3   parent_closed      40606 non-null  object 
 4   status_name        40606 non-null  object 
 5   status_code        40606 non-null  int64  
 6   dept               40606 non-null  object 
 7   request_type_name  40606 non-null  object 
 8   request_type_id    40606 non-null  int64  
 9   create_date_et     40606 non-null  object 
 10  create_date_utc    40606 non-null  object 
 11  last_action_et     40606 non-null  object 
 12  last_action_utc    40606 non-null  object 
 13  closed_date_et     38598 non-null  object 
 14  closed_date_utc    38598 non-null  object 
 15  origin             40606 non-null  object 
 16  street             400

In [8]:
# Geographic Data Cleaning - Focus on Pittsburgh

# 1. Analyze city distribution
city_counts = df['city'].value_counts(dropna=False)
print(f"Cities: {len(city_counts)} unique values")
print(f"Pittsburgh: {city_counts.get('Pittsburgh', 0)} records ({city_counts.get('Pittsburgh', 0)/len(df):.2%})")
print(f"Non-Pittsburgh: {len(df) - city_counts.get('Pittsburgh', 0)} records")

# 2. Filter to Pittsburgh only
df_pittsburgh = df[df['city'] == 'Pittsburgh'].copy()

# 3. Validate cleaning
neighborhood_counts = df_pittsburgh['neighborhood'].value_counts(dropna=False)
print(f"\nAfter filtering:")
print(f"Records: {len(df_pittsburgh)} of {len(df)} ({len(df_pittsburgh)/len(df):.2%})")
print(f"Neighborhoods: {len(neighborhood_counts)} unique values")
print(f"Missing neighborhoods: {neighborhood_counts.get(pd.NA, 0)} records")

# Complete
print("\nPittsburgh-only dataset ready")

Cities: 17 unique values
Pittsburgh: 40472 records (99.67%)
Non-Pittsburgh: 134 records

After filtering:
Records: 40472 of 40606 (99.67%)
Neighborhoods: 92 unique values
Missing neighborhoods: 0 records

Pittsburgh-only dataset ready


In [9]:
df_pittsburgh['date'].value_counts()

date
2025-01-21    496
2025-01-22    487
2024-01-17    384
2025-01-15    313
2025-01-16    260
             ... 
2024-07-27      1
2022-11-25      1
2022-12-04      1
2022-09-04      1
2023-05-06      1
Name: count, Length: 1072, dtype: int64

In [10]:
# Fix the column dropping code
print("Dropping unused columns...")
print(f"Original columns: {len(df_pittsburgh.columns)}")

unused_columns = ['create_date_utc', 'last_action_et', 'last_action_utc', 'closed_date_utc', 'status_code',
                  'group_id', 'num_requests', 'parent_closed', 'request_type_id', 
                  'street', 'cross_street', 'street_id', 'cross_street_id', 'city',
                  'census_tract', 'council_district', 'ward', 'police_zone', 
                  'latitude', 'longitude', 'geo_accuracy', 'date']

# Check which columns actually exist in the dataframe
columns_to_drop = [col for col in unused_columns if col in df_pittsburgh.columns]
print(f"Columns to drop: {len(columns_to_drop)} of {len(unused_columns)} specified")

# Drop the columns
df_clean = df_pittsburgh.drop(columns=columns_to_drop)
print(f"Remaining columns: {len(df_clean.columns)}")

# Show remaining columns
print("\nRemaining columns:")
print(df_clean.columns.tolist())

# Create a final clean dataset with selected features
print("\nCreating final dataset with key features...")

# Calculate any needed derived fields
if 'create_date_et' in df_clean.columns and 'closed_date_et' in df_clean.columns:
    df_clean['resolution_time_days'] = (df_clean['closed_date_et'] - df_clean['create_date_et']).dt.total_seconds() / (24*60*60)

# Create derived temporal features
if 'create_date_et' in df_clean.columns:
    df_clean['hour'] = df_clean['create_date_et'].dt.hour
    df_clean['day_of_week'] = df_clean['create_date_et'].dt.dayofweek  # 0=Monday, 6=Sunday
    df_clean['month'] = df_clean['create_date_et'].dt.month
    df_clean['is_weekend'] = df_clean['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

# Final dataset info
print(f"Final dataset shape: {df_clean.shape}")
print(f"Final dataset size: {df_clean.memory_usage().sum() / 1048576:.2f} MB")

# Preview
print("\nFinal dataset preview:")
df_clean

Dropping unused columns...
Original columns: 30
Columns to drop: 22 of 22 specified
Remaining columns: 8

Remaining columns:
['_id', 'status_name', 'dept', 'request_type_name', 'create_date_et', 'closed_date_et', 'origin', 'neighborhood']

Creating final dataset with key features...
Final dataset shape: (40472, 13)
Final dataset size: 3.86 MB

Final dataset preview:


Unnamed: 0,_id,status_name,dept,request_type_name,create_date_et,closed_date_et,origin,neighborhood,resolution_time_days,hour,day_of_week,month,is_weekend
0,65,in progress,DPW - Street Maintenance,"Litter, Public Property",2023-07-08 11:38:00,NaT,Website,,,11,5,7,1
1,86,closed,DPW - Street Maintenance,Trail Maintenance,2024-08-28 08:23:00,2024-12-26 12:45:00,Call Center,,120.181944,8,2,8,0
2,177,closed,DPW - Street Maintenance,Street Cleaning/Sweeping,2024-09-05 09:41:00,2024-09-27 06:35:00,Call Center,,21.870833,9,3,9,0
3,454,closed,DPW - Street Maintenance,Trail Maintenance,2024-08-06 11:35:00,2024-12-26 12:51:00,Call Center,,142.052778,11,1,8,0
4,669,open,DPW - Street Maintenance,"Litter, Public Property",2023-05-22 10:12:00,NaT,Website,,,10,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40601,815667,closed,DPW - Street Maintenance,Potholes,2025-02-01 10:09:00,2025-02-03 04:14:00,Call Center,South Side Slopes,1.753472,10,5,2,1
40602,815670,open,DPW - Street Maintenance,Potholes,2025-02-02 16:47:00,NaT,Report2Gov Website,South Side Flats,,16,6,2,1
40603,815674,open,DPW - Street Maintenance,"Litter, Public Property",2025-02-01 08:37:00,NaT,Website,Brookline,,8,5,2,1
40604,815678,open,DPW - Street Maintenance,Drainage/Leak,2025-02-01 21:08:00,NaT,Website,Brookline,,21,5,2,1


After cross tab check and count of each neighbor in city outside Pittsburgh, we decide to **remove all the city outside of Pittsburgh** because:

1. The data is heavily Pittsburgh-centric, with 277,639 requests from Pittsburgh compared to only a few hundred from other cities
2. Other cities have very few neighborhoods (most have 0-1 neighborhoods) and sparse data
3. The neighborhood information is most complete and meaningful for Pittsburgh, with 91 distinct neighborhoods
4. Including other cities would create data imbalance and potentially introduce noise in the analysis
5. The small sample sizes from other cities (ranging from 1-258 requests) make their data statistically less reliable

This decision would help focus the analysis on Pittsburgh's data, which is more comprehensive and representative of the service request patterns we want to analyze.

## Baseline model

In [11]:
df.columns

Index(['_id', 'group_id', 'num_requests', 'parent_closed', 'status_name',
       'status_code', 'dept', 'request_type_name', 'request_type_id',
       'create_date_et', 'create_date_utc', 'last_action_et',
       'last_action_utc', 'closed_date_et', 'closed_date_utc', 'origin',
       'street', 'cross_street', 'street_id', 'cross_street_id', 'city',
       'neighborhood', 'census_tract', 'council_district', 'ward',
       'police_zone', 'latitude', 'longitude', 'geo_accuracy', 'date'],
      dtype='object')

In [12]:
df[['_id','create_date_et','request_type_name','neighborhood']].sort_values(by='create_date_et', ascending=False).head(10)

Unnamed: 0,_id,create_date_et,request_type_name,neighborhood
40562,815578,2025-02-04 14:04:00,Port A Potty,Squirrel Hill South
40572,815592,2025-02-04 13:59:00,"Litter Can, Public",Shadyside
40551,815559,2025-02-04 13:37:00,Potholes,Squirrel Hill North
40544,815544,2025-02-04 13:35:00,Potholes,Shadyside
40589,815632,2025-02-04 12:30:00,Potholes,Bloomfield
40588,815630,2025-02-04 07:37:00,Potholes,Bloomfield
40563,815579,2025-02-03 22:22:00,Potholes,Squirrel Hill North
40592,815636,2025-02-03 21:11:00,Potholes,Bloomfield
40590,815634,2025-02-03 21:09:00,Potholes,Bloomfield
40553,815564,2025-02-03 19:42:00,Potholes,Squirrel Hill South


In [13]:
print((df['create_date_et'].max() - df['create_date_et'].min()).days)
print(df['request_type_name'].nunique())
print(df['neighborhood'].nunique())

## 3. Data Preprocessing - Feature Engineering
### 3.1 Feature Engineering
#### 3.1.1 Create a new column for the request type

### DROP NA neighborhood
df = df[df['neighborhood'].notna()]

### DROP NA request type
df = df[df['request_type_name'].notna()]



1072
65
91


In [14]:
# Create date range
start_date = df['create_date_et'].min()
end_date = df['create_date_et'].max()
all_dates = pd.date_range(start=start_date, end=end_date, freq='H')


  all_dates = pd.date_range(start=start_date, end=end_date, freq='H')


In [20]:
# 1. First create the full cartesian product of all possible combinations

# Create date range
start_date = df['create_date_et'].min()
end_date = df['create_date_et'].min() + pd.Timedelta(days=365)
all_dates = pd.date_range(start=start_date, end=end_date, freq='H')

# Get unique values
neighborhoods = df['neighborhood'].unique()
hours = range(24)

# Create all possible combinations
combinations = list(product(all_dates, neighborhoods))

# Create the base dataframe
matrix_df = pd.DataFrame(combinations, columns=['datetime', 'neighborhood'])

# Extract date and hour
matrix_df['date'] = matrix_df['datetime'].dt.date
matrix_df['hour'] = matrix_df['datetime'].dt.hour

# Create the target variable
# First group the original data
df['datetime'] = pd.to_datetime(df['create_date_et'])
df['date'] = df['datetime'].dt.date
df['hour'] = df['datetime'].dt.hour

# Create a mapping of existing combinations
existing_requests = df.groupby(['date', 'hour', 'neighborhood',]).size().reset_index()
existing_requests['Y'] = 1

# Merge with the full matrix
matrix_df = matrix_df.merge(
    existing_requests[['date', 'hour', 'neighborhood', 'Y']],
    how='left',
    left_on=['date', 'hour', 'neighborhood'],
    right_on=['date', 'hour', 'neighborhood']
)

# Fill missing values with 0 (no request)
matrix_df['Y'] = matrix_df['Y'].fillna(0)

# Final columns organization
final_matrix = matrix_df[['Y', 'date', 'hour', 'neighborhood']]

  all_dates = pd.date_range(start=start_date, end=end_date, freq='H')


In [21]:
final_matrix.head(10)

Unnamed: 0,Y,date,hour,neighborhood
0,0.0,2022-02-28,0,Highland Park
1,0.0,2022-02-28,0,Brookline
2,0.0,2022-02-28,0,Strip District
3,0.0,2022-02-28,0,East Liberty
4,0.0,2022-02-28,0,New Homestead
5,0.0,2022-02-28,0,Bloomfield
6,0.0,2022-02-28,0,Crawford-Roberts
7,0.0,2022-02-28,0,Duquesne Heights
8,0.0,2022-02-28,0,Mount Washington
9,0.0,2022-02-28,0,Middle Hill


In [22]:
final_matrix[final_matrix['neighborhood'] == 'Point Breeze'].head(10)

Unnamed: 0,Y,date,hour,neighborhood
44,0.0,2022-02-28,0,Point Breeze
135,0.0,2022-02-28,1,Point Breeze
226,0.0,2022-02-28,2,Point Breeze
317,0.0,2022-02-28,3,Point Breeze
408,0.0,2022-02-28,4,Point Breeze
499,0.0,2022-02-28,5,Point Breeze
590,0.0,2022-02-28,6,Point Breeze
681,0.0,2022-02-28,7,Point Breeze
772,1.0,2022-02-28,8,Point Breeze
863,0.0,2022-02-28,9,Point Breeze


In [23]:
final_matrix['neighborhood'].value_counts()

neighborhood
Highland Park    8761
Bluff            8761
Friendship       8761
Overbrook        8761
North Oakland    8761
                 ... 
Manchester       8761
North Shore      8761
Bon Air          8761
Morningside      8761
Esplen           8761
Name: count, Length: 91, dtype: int64

### ML model

#### Basic Regression - Logistic Regression

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler

# 1. Feature Engineering (same as before)
final_matrix['date'] = pd.to_datetime(final_matrix['date'])
final_matrix['day_of_week'] = final_matrix['date'].dt.dayofweek
final_matrix['month'] = final_matrix['date'].dt.month
final_matrix['is_weekend'] = final_matrix['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
final_matrix['season'] = final_matrix['month'].apply(lambda x: 
                                                   0 if x in [12, 1, 2] else  # Winter
                                                   1 if x in [3, 4, 5] else   # Spring
                                                   2 if x in [6, 7, 8] else   # Summer
                                                   3)                         # Fall

# 2. One-hot encode neighborhoods
neighborhoods_encoded = pd.get_dummies(final_matrix['neighborhood'], prefix='neighborhood')

# 3. Combine features
X = pd.concat([
    final_matrix[['hour', 'day_of_week', 'month', 'is_weekend', 'season']],
    neighborhoods_encoded
], axis=1)

y = final_matrix['Y']

# 4. Train-test split (time-based)
final_matrix_sorted = final_matrix.sort_values(by='date')
split_index = int(len(final_matrix_sorted) * 0.8)

X_train = X.iloc[:split_index]
X_test = X.iloc[split_index:]
y_train = y.iloc[:split_index]
y_test = y.iloc[split_index:]

# 5. Standardize numeric features (important for logistic regression)
scaler = StandardScaler()
numeric_features = ['hour', 'day_of_week', 'month', 'season']
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# 6. Train logistic regression model
print("Training Logistic Regression model...")
model = LogisticRegression(max_iter=1000, C=0.1, random_state=42)  # C is inverse regularization strength
model.fit(X_train, y_train)

# 7. Make predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability of class 1

# 8. Evaluate model
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print(f"Logistic Regression Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# 9. Feature importance (coefficients)
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_[0]
}).sort_values('Coefficient', ascending=False)

print("\nTop 10 Features with Positive Effect:")
print(feature_importance.head(10))

print("\nTop 10 Features with Negative Effect:")
print(feature_importance.tail(10))

# 10. Sample prediction function (similar to before)
def predict_request_probability(neighborhood, date_str, hour):
    date = pd.to_datetime(date_str)
    features = {
        'hour': hour,
        'day_of_week': date.dayofweek,
        'month': date.month,
        'is_weekend': 1 if date.dayofweek >= 5 else 0,
        'season': 0 if date.month in [12, 1, 2] else 
                 1 if date.month in [3, 4, 5] else
                 2 if date.month in [6, 7, 8] else 3
    }
    
    # Add neighborhood encoding
    for col in neighborhoods_encoded.columns:
        features[col] = 1 if col == f'neighborhood_{neighborhood}' else 0
    
    # Create DataFrame
    sample = pd.DataFrame([features])
    
    # Scale numeric features
    sample[numeric_features] = scaler.transform(sample[numeric_features])
    
    # Get columns in right order
    sample = sample[X.columns]
    
    # Predict
    return model.predict_proba(sample)[0][1]

# Example predictions
print("\nSample Predictions:")
print(f"Probability of request in Downtown at 8am: {predict_request_probability('Central Business District', '2023-04-01', 8):.4f}")
print(f"Probability of request in South Side Flats at 11pm: {predict_request_probability('South Side Flats', '2023-04-02', 23):.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_matrix['date'] = pd.to_datetime(final_matrix['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[numeric_features] = scaler.transform(X_test[numeric_features])

Training Logistic Regression model...
Logistic Regression Performance:
Accuracy: 0.9874
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Top 10 Features with Positive Effect:
                                   Feature  Coefficient
18                  neighborhood_Brookline     1.700139
14                 neighborhood_Bloomfield     1.601525
82        neighborhood_Squirrel Hill South     1.451701
20                    neighborhood_Carrick     1.414192
73                  neighborhood_Shadyside     1.284970
21  neighborhood_Central Business District     1.201808
78          neighborhood_South Side Slopes     1.144382
68               neighborhood_Point Breeze     1.056414
77           neighborhood_South Side Flats     1.034579
44              neighborhood_Highland Park     1.006675

Top 10 Features with Negative Effect:
                              Feature  Coefficient
76           neighborhood_South Shore    -0.874970
35                neighborhood_Esplen    -0.885610
72             

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Balanced RF model

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from datetime import datetime

# 1. Feature Engineering
# Convert date to datetime and extract useful features
final_matrix['date'] = pd.to_datetime(final_matrix['date'])
final_matrix['day_of_week'] = final_matrix['date'].dt.dayofweek  # 0=Monday, 6=Sunday
final_matrix['month'] = final_matrix['date'].dt.month
final_matrix['is_weekend'] = final_matrix['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
final_matrix['season'] = final_matrix['month'].apply(lambda x: 
                                                   0 if x in [12, 1, 2] else  # Winter
                                                   1 if x in [3, 4, 5] else   # Spring
                                                   2 if x in [6, 7, 8] else   # Summer
                                                   3)                         # Fall

# 2. Encode categorical variables
# One-hot encode neighborhoods
neighborhoods_encoded = pd.get_dummies(final_matrix['neighborhood'], prefix='neighborhood')

# 3. Combine features
X = pd.concat([
    final_matrix[['hour', 'day_of_week', 'month', 'is_weekend', 'season']],
    neighborhoods_encoded
], axis=1)

y = final_matrix['Y']

# 4. Train-test split (with time-based split)
# Sort by date to maintain time order
final_matrix_sorted = final_matrix.sort_values(by='date')
split_index = int(len(final_matrix_sorted) * 0.8)  # 80% train, 20% test

X_train = X.iloc[:split_index]
X_test = X.iloc[split_index:]
y_train = y.iloc[:split_index]
y_test = y.iloc[split_index:]

# 5. Train model
#  Use class weights for Random Forest
print("Training Balanced Random Forest model...")
balanced_rf = RandomForestClassifier(
    n_estimators=100, 
    class_weight='balanced',  # Add class weights
    random_state=42, 
    n_jobs=-1
)
balanced_rf.fit(X_train, y_train)

# Make predictions
y_pred_balanced = balanced_rf.predict(X_test)

# Evaluate model
balanced_accuracy = accuracy_score(y_test, y_pred_balanced)
balanced_precision, balanced_recall, balanced_f1, _ = precision_recall_fscore_support(
    y_test, y_pred_balanced, average='binary'
)

print(f"Balanced RF Performance:")
print(f"Accuracy: {balanced_accuracy:.4f}")
print(f"Precision: {balanced_precision:.4f}")
print(f"Recall: {balanced_recall:.4f}")
print(f"F1 Score: {balanced_f1:.4f}")


# 9. Create a sample prediction function
def predict_request_probability(neighborhood, date_str, hour):
    """Predict probability of request for a given neighborhood, date and hour"""
    # Parse date
    date = pd.to_datetime(date_str)
    
    # Create feature row
    features = {
        'hour': hour,
        'day_of_week': date.dayofweek,
        'month': date.month,
        'is_weekend': 1 if date.dayofweek >= 5 else 0,
        'season': 0 if date.month in [12, 1, 2] else 
                 1 if date.month in [3, 4, 5] else
                 2 if date.month in [6, 7, 8] else 3
    }
    
    # Add neighborhood one-hot encoding
    for col in neighborhoods_encoded.columns:
        features[col] = 1 if col == f'neighborhood_{neighborhood}' else 0
    
    # Create DataFrame with single row
    sample = pd.DataFrame([features])
    
    # Get columns in right order
    sample = sample[X.columns]
    
    # Predict
    return model.predict_proba(sample)[0][1]  # Probability of class 1

# Example prediction
print("\nSample Predictions:")
print(f"Probability of request in Downtown tomorrow at 8am: {predict_request_probability('Central Business District', '2023-04-01', 8):.4f}")
print(f"Probability of request in South Side Flats on weekend at 11pm: {predict_request_probability('South Side Flats', '2023-04-02', 23):.4f}")

Training Balanced Random Forest model...
Balanced RF Performance:
Accuracy: 0.9725
Precision: 0.0592
Recall: 0.0789
F1 Score: 0.0677

Sample Predictions:
Probability of request in Downtown tomorrow at 8am: 0.0053
Probability of request in South Side Flats on weekend at 11pm: 0.0221


### Balanced RF + SMOTE

In [26]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
print("Applying SMOTE oversampling...")
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check class distribution after SMOTE
print(f"Class distribution after SMOTE:")
print(f"Class 0 (no request): {sum(y_train_smote == 0)}")
print(f"Class 1 (request): {sum(y_train_smote == 1)}")
print(f"Ratio: {sum(y_train_smote == 1)/len(y_train_smote):.2f}")

# Train Random Forest on SMOTE-resampled data
print("\nTraining RF with SMOTE data...")
rf_smote = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
    # No class_weight needed since SMOTE already balanced the classes
)
rf_smote.fit(X_train_smote, y_train_smote)

# Make predictions
y_pred_smote = rf_smote.predict(X_test)
y_pred_proba_smote = rf_smote.predict_proba(X_test)[:, 1]

# Evaluate model
smote_accuracy = accuracy_score(y_test, y_pred_smote)
smote_precision, smote_recall, smote_f1, _ = precision_recall_fscore_support(
    y_test, y_pred_smote, average='binary'
)

print(f"RF+SMOTE Performance:")
print(f"Accuracy: {smote_accuracy:.4f}")
print(f"Precision: {smote_precision:.4f}")
print(f"Recall: {smote_recall:.4f}")
print(f"F1 Score: {smote_f1:.4f}")

# Sample predictions with SMOTE-trained model
def predict_request_probability_smote(neighborhood, date_str, hour):
    # Parse date
    date = pd.to_datetime(date_str)
    
    # Create feature row
    features = {
        'hour': hour,
        'day_of_week': date.dayofweek,
        'month': date.month,
        'is_weekend': 1 if date.dayofweek >= 5 else 0,
        'season': 0 if date.month in [12, 1, 2] else 
                 1 if date.month in [3, 4, 5] else
                 2 if date.month in [6, 7, 8] else 3
    }
    
    # Add neighborhood one-hot encoding
    for col in neighborhoods_encoded.columns:
        features[col] = 1 if col == f'neighborhood_{neighborhood}' else 0
    
    # Create DataFrame with single row
    sample = pd.DataFrame([features])
    
    # Get columns in right order
    sample = sample[X.columns]
    
    # Predict
    return rf_smote.predict_proba(sample)[0][1]

print("\nSample Predictions with RF+SMOTE:")
print(f"Probability of request in Downtown tomorrow at 8am: {predict_request_probability_smote('Central Business District', '2023-04-01', 8):.4f}")
print(f"Probability of request in South Side Flats on weekend at 11pm: {predict_request_probability_smote('South Side Flats', '2023-04-02', 23):.4f}")

Applying SMOTE oversampling...
Class distribution after SMOTE:
Class 0 (no request): 627433
Class 1 (request): 627433
Ratio: 0.50

Training RF with SMOTE data...


KeyboardInterrupt: 

---

In [None]:
# Convert to count-based approach
request_counts = df.groupby(['date', 'hour', 'neighborhood']).size().reset_index(name='request_count')

# Feature engineering (same as before)
request_counts['date'] = pd.to_datetime(request_counts['date'])
request_counts['day_of_week'] = request_counts['date'].dt.dayofweek
request_counts['month'] = request_counts['date'].dt.month
request_counts['is_weekend'] = request_counts['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
request_counts['season'] = request_counts['month'].apply(lambda x: 
                                                   0 if x in [12, 1, 2] else  # Winter
                                                   1 if x in [3, 4, 5] else   # Spring
                                                   2 if x in [6, 7, 8] else   # Summer
                                                   3)                         # Fall

# One-hot encode neighborhoods
neighborhoods_encoded = pd.get_dummies(request_counts['neighborhood'], prefix='neighborhood')

# Prepare features
X = pd.concat([
    request_counts[['hour', 'day_of_week', 'month', 'is_weekend', 'season']],
    neighborhoods_encoded
], axis=1)

y = request_counts['request_count']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Option 1: Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)

# Option 2: Poisson Regression (specifically for count data)
from sklearn.linear_model import PoissonRegressor
poisson_model = PoissonRegressor(alpha=0.1)
poisson_model.fit(X_train, y_train)

# Evaluate
from sklearn.metrics import mean_squared_error, mean_absolute_error
rf_preds = rf_reg.predict(X_test)
poisson_preds = poisson_model.predict(X_test)

print("Random Forest Regressor:")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, rf_preds)):.4f}")
print(f"MAE: {mean_absolute_error(y_test, rf_preds):.4f}")

print("\nPoisson Regressor:")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, poisson_preds)):.4f}")
print(f"MAE: {mean_absolute_error(y_test, poisson_preds):.4f}")

Random Forest Regressor:
RMSE: 1.7766
MAE: 0.7891

Poisson Regressor:
RMSE: 1.6913
MAE: 0.7988


In [None]:
# Sample predictions with Random Forest
print("\nSample Random Forest Predictions:")
# Create a function similar to before
def predict_request_count(model, neighborhood, date_str, hour):
    date = pd.to_datetime(date_str)
    features = {
        'hour': hour,
        'day_of_week': date.dayofweek,
        'month': date.month,
        'is_weekend': 1 if date.dayofweek >= 5 else 0,
        'season': 0 if date.month in [12, 1, 2] else 
                 1 if date.month in [3, 4, 5] else
                 2 if date.month in [6, 7, 8] else 3
    }
    
    # Add neighborhood encoding
    for col in neighborhoods_encoded.columns:
        features[col] = 1 if col == f'neighborhood_{neighborhood}' else 0
    
    sample = pd.DataFrame([features])
    sample = sample[X.columns]
    
    return model.predict(sample)[0]

# Show predictions for popular neighborhoods at different times
print(f"Expected requests in Downtown (8am): {predict_request_count(rf_reg, 'Central Business District', '2023-04-01', 8):.2f}")
print(f"Expected requests in South Side Flats (11pm): {predict_request_count(rf_reg, 'South Side Flats', '2023-04-01', 23):.2f}")
print(f"Expected requests in Squirrel Hill South (2pm): {predict_request_count(rf_reg, 'Squirrel Hill South', '2023-04-01', 14):.2f}")
print(f"Expected requests in Brookline (9am): {predict_request_count(rf_reg, 'Brookline', '2023-04-01', 9):.2f}")

# Compare with actual counts from test data
print("\nActual vs Predicted for some test samples:")
sample_indices = np.random.choice(len(X_test), 5, replace=False)
for idx in sample_indices:
    actual = y_test.iloc[idx]
    predicted_rf = rf_preds[idx]
    predicted_poisson = poisson_preds[idx]
    
    # Get neighborhood name (need to reverse the one-hot encoding)
    neighborhood_cols = [col for col in X_test.columns if col.startswith('neighborhood_')]
    neighborhood_idx = np.argmax(X_test.iloc[idx][neighborhood_cols])
    neighborhood = neighborhood_cols[neighborhood_idx].replace('neighborhood_', '')
    
    hour = X_test.iloc[idx]['hour']
    
    print(f"Neighborhood: {neighborhood}, Hour: {int(hour)}")
    print(f"  Actual: {actual:.1f}, RF: {predicted_rf:.1f}, Poisson: {predicted_poisson:.1f}")


Sample Random Forest Predictions:
Expected requests in Downtown (8am): 1.03
Expected requests in South Side Flats (11pm): 1.61
Expected requests in Squirrel Hill South (2pm): 1.63
Expected requests in Brookline (9am): 1.00

Actual vs Predicted for some test samples:
Neighborhood: Lincoln Place, Hour: 8
  Actual: 1.0, RF: 1.5, Poisson: 1.6
Neighborhood: Brookline, Hour: 6
  Actual: 2.0, RF: 1.3, Poisson: 1.8
Neighborhood: Squirrel Hill South, Hour: 15
  Actual: 1.0, RF: 1.1, Poisson: 1.6
Neighborhood: Spring Garden, Hour: 15
  Actual: 1.0, RF: 1.2, Poisson: 1.5
Neighborhood: South Oakland, Hour: 15
  Actual: 1.0, RF: 1.0, Poisson: 1.6
