In [None]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# ZIP-level OLS: complaints per capita regressed on median household income,
# total population and male/female ratio.
# Produces: `complaints_per_zip`, `merged`, the fitted statsmodels results in
# `model`, Figure 11 on disk, and the regression summary as the cell's output.
# NOTE(review): complaint counts use every row of the 311 extract while the
# demographics are restricted to Year == 2022 — confirm the extract covers a
# comparable period.

# Load complaint data
complaint_df = pd.read_csv("D:\\Rekha\\Capstone\\Data\\Cleaned\\311_cleaned_final.csv", low_memory=False)
complaint_df['created_date'] = pd.to_datetime(complaint_df['created_date'], errors='coerce')
# .copy() makes the filtered frame independent, so the column assignment below
# does not raise pandas' SettingWithCopyWarning.
complaint_df = complaint_df.dropna(subset=['incident_zip']).copy()

# Clean ZIP code: strip a float-style ".0" suffix, then left-pad to five digits
complaint_df['incident_zip'] = complaint_df['incident_zip'].astype(str).str.split('.').str[0].str.zfill(5)

# Complaint volume per ZIP
complaints_per_zip = complaint_df.groupby('incident_zip').size().reset_index(name='total_complaints')

# Load demographic data (2022 rows only)
zip_demo = pd.read_csv("D:\\Rekha\\Capstone\\Data\\nyc_zip_income_pop_gender_2019_2022.csv")
zip_demo = zip_demo[zip_demo['Year'] == 2022].copy()
# Normalise exactly like incident_zip so the join keys agree even if ZIP was
# parsed as a float column.
zip_demo['ZIP'] = zip_demo['ZIP'].astype(str).str.split('.').str[0].str.zfill(5)

# Merge datasets; ZIPs without a demographic match are removed by the dropna
merged = complaints_per_zip.merge(zip_demo, left_on='incident_zip', right_on='ZIP', how='left')
merged = merged.dropna(subset=['Total Population', 'Median Household Income', 'Male Population', 'Female Population'])
# A ZIP with zero population would yield an infinite per-capita rate, which
# turns the OLS coefficients into NaN; keep only populated ZIPs.
merged = merged[merged['Total Population'] > 0].copy()

# Feature engineering
merged['complaints_per_capita'] = merged['total_complaints'] / merged['Total Population']
# The small epsilon keeps the ratio finite when Female Population is zero
merged['gender_ratio'] = merged['Male Population'] / (merged['Female Population'] + 1e-6)

# Linear regression model (intercept added explicitly via add_constant)
X = merged[['Median Household Income', 'Total Population', 'gender_ratio']]
X = sm.add_constant(X)
y = merged['complaints_per_capita']
model = sm.OLS(y, X).fit()

# Visualization: bivariate scatter with a fitted line (income only; this is
# not the multivariate fit stored in `model`)
plt.figure(figsize=(10, 6))
sns.regplot(data=merged, x='Median Household Income', y='complaints_per_capita', scatter_kws={'alpha': 0.6})
plt.title("Complaint Volume per Capita vs. Median Household Income")
plt.xlabel("Median Household Income (USD)")
plt.ylabel("Complaints per Capita")
plt.grid(True)
plt.tight_layout()
plt.savefig("D:\\Rekha\\Capstone\\Visualizations\\Figure 11.png", dpi=300)
plt.show()

# Model summary (last expression, so the notebook renders it)
model.summary()



In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Logistic-regression baselines for two binary targets:
#   1. is_smartphone      – complaint was filed through the MOBILE channel
#   2. closed_within_24h  – complaint was closed less than 24 h after creation
# Leaves X1/y1, X2/y2, the train/test splits, model1/model2 and preds1/preds2
# in the kernel namespace.

# Load data
complaints = pd.read_csv("D:\\Rekha\\Capstone\\Data\\Cleaned\\311_cleaned_final.csv", low_memory=False)
demographics = pd.read_csv("D:\\Rekha\\Capstone\\Data\\nyc_zip_income_pop_gender_2019_2022.csv")
# .copy() after each filter gives an independent frame, so the column
# assignments that follow do not raise SettingWithCopyWarning.
demographics = demographics[demographics["Year"] == 2022].copy()
# Normalise exactly like incident_zip below so the join keys agree even if ZIP
# was parsed as a float column.
demographics["ZIP"] = demographics["ZIP"].astype(str).str.split('.').str[0].str.zfill(5)

# Preprocessing: unparseable dates become NaT and are dropped with other gaps
complaints['created_date'] = pd.to_datetime(complaints['created_date'], errors='coerce')
complaints['closed_date'] = pd.to_datetime(complaints['closed_date'], errors='coerce')
complaints = complaints.dropna(subset=['created_date', 'closed_date', 'incident_zip', 'complaint_type', 'open_data_channel_type']).copy()

complaints['incident_zip'] = complaints['incident_zip'].astype(str).str.split('.').str[0].str.zfill(5)
complaints['hour'] = complaints['created_date'].dt.hour

# Merge income; complaints whose ZIP has no 2022 income figure are discarded
merged = complaints.merge(demographics[['ZIP', 'Median Household Income']], left_on='incident_zip', right_on='ZIP', how='left')
merged = merged.dropna(subset=['Median Household Income']).copy()

# Target variables
merged['is_smartphone'] = merged['open_data_channel_type'].str.upper().eq('MOBILE').astype(int)
resolution_seconds = (merged['closed_date'] - merged['created_date']).dt.total_seconds()
# Require a non-negative duration: a record closed before it was created is a
# data error and must not be counted as a fast closure.
merged['closed_within_24h'] = ((resolution_seconds >= 0) & (resolution_seconds < 86400)).astype(int)

# Encode complaint type
# NOTE(review): LabelEncoder assigns arbitrary integer codes, which a linear
# model reads as an ordered numeric scale; one-hot encoding (and scaling the
# income column) would suit logistic regression better — confirm before
# interpreting coefficients.
le = LabelEncoder()
merged['complaint_encoded'] = le.fit_transform(merged['complaint_type'])

# Features
features = ['complaint_encoded', 'Median Household Income', 'hour']

# --- Model 1: Predict if complaint came via smartphone ---
X1 = merged[features]
y1 = merged['is_smartphone']

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

model1 = LogisticRegression(max_iter=1000, class_weight='balanced')
model1.fit(X_train1, y_train1)
preds1 = model1.predict(X_test1)

print("Smartphone Complaint Prediction:\n")
print(classification_report(y_test1, preds1))

# --- Model 2: Predict if closed within 24 hours ---
X2 = merged[features]
y2 = merged['closed_within_24h']
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

model2 = LogisticRegression(max_iter=1000, class_weight='balanced')
model2.fit(X_train2, y_train2)
preds2 = model2.predict(X_test2)

print("\nClosure Within 24 Hours Prediction:\n")
print(classification_report(y_test2, preds2))


Naive Bayes

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Gaussian Naive Bayes baselines for two binary targets:
#   1. is_smartphone      – complaint was filed through the MOBILE channel
#   2. closed_within_24h  – complaint was closed less than 24 h after creation
# Leaves X1/y1, X2/y2, the train/test splits, nb1/nb2 and preds1/preds2 in the
# kernel namespace.

# Load data
complaints = pd.read_csv("D:\\Rekha\\Capstone\\Data\\Cleaned\\311_cleaned_final.csv", low_memory=False)
demographics = pd.read_csv("D:\\Rekha\\Capstone\\Data\\nyc_zip_income_pop_gender_2019_2022.csv")
# .copy() after each filter gives an independent frame, so the column
# assignments that follow do not raise SettingWithCopyWarning.
demographics = demographics[demographics["Year"] == 2022].copy()
# Normalise exactly like incident_zip below so the join keys agree even if ZIP
# was parsed as a float column.
demographics["ZIP"] = demographics["ZIP"].astype(str).str.split('.').str[0].str.zfill(5)

# Preprocessing: unparseable dates become NaT and are dropped with other gaps
complaints['created_date'] = pd.to_datetime(complaints['created_date'], errors='coerce')
complaints['closed_date'] = pd.to_datetime(complaints['closed_date'], errors='coerce')
complaints = complaints.dropna(subset=['created_date', 'closed_date', 'incident_zip', 'complaint_type', 'open_data_channel_type']).copy()

complaints['incident_zip'] = complaints['incident_zip'].astype(str).str.split('.').str[0].str.zfill(5)
complaints['hour'] = complaints['created_date'].dt.hour

# Merge demographic info; complaints whose ZIP has no 2022 income are discarded
merged = complaints.merge(demographics[['ZIP', 'Median Household Income']], left_on='incident_zip', right_on='ZIP', how='left')
merged = merged.dropna(subset=['Median Household Income']).copy()

# Target Variables
merged['is_smartphone'] = merged['open_data_channel_type'].str.upper().eq('MOBILE').astype(int)
resolution_seconds = (merged['closed_date'] - merged['created_date']).dt.total_seconds()
# Require a non-negative duration: a record closed before it was created is a
# data error and must not be counted as a fast closure.
merged['closed_within_24h'] = ((resolution_seconds >= 0) & (resolution_seconds < 86400)).astype(int)

# Encode complaint type
# NOTE(review): GaussianNB models every feature as a continuous normal
# variable, including these arbitrary integer category codes — consider
# CategoricalNB or one-hot encoding for complaint_type.
le = LabelEncoder()
merged['complaint_encoded'] = le.fit_transform(merged['complaint_type'])

# Common features
features = ['complaint_encoded', 'Median Household Income', 'hour']

# ----------------------
# Model 1: Predict Smartphone Complaints
# ----------------------
X1 = merged[features]
y1 = merged['is_smartphone']

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

nb1 = GaussianNB()
nb1.fit(X_train1, y_train1)
preds1 = nb1.predict(X_test1)

print("Model 1: Predicting Smartphone Complaints\n")
print(classification_report(y_test1, preds1))

# ----------------------
# Model 2: Predict Closure Within 24 Hours
# ----------------------
X2 = merged[features]
y2 = merged['closed_within_24h']

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

nb2 = GaussianNB()
nb2.fit(X_train2, y_train2)
preds2 = nb2.predict(X_test2)

print("\nModel 2: Predicting Closure Within 24 Hours\n")
print(classification_report(y_test2, preds2))


Random Forest

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Random-forest classifiers for two binary targets:
#   1. is_smartphone      – complaint was filed through the MOBILE channel
#   2. closed_within_24h  – complaint was closed less than 24 h after creation
# Leaves X1/y1, X2/y2, the train/test splits, rf1/rf2 and preds1/preds2 in the
# kernel namespace (the confusion-matrix cell reads y_test*/preds* from here).

# Load datasets
complaints = pd.read_csv("D:\\Rekha\\Capstone\\Data\\Cleaned\\311_cleaned_final.csv", low_memory=False)
demographics = pd.read_csv("D:\\Rekha\\Capstone\\Data\\nyc_zip_income_pop_gender_2019_2022.csv")
# .copy() after each filter gives an independent frame, so the column
# assignments that follow do not raise SettingWithCopyWarning.
demographics = demographics[demographics["Year"] == 2022].copy()
# Normalise exactly like incident_zip below so the join keys agree even if ZIP
# was parsed as a float column.
demographics["ZIP"] = demographics["ZIP"].astype(str).str.split('.').str[0].str.zfill(5)

# Clean and preprocess: unparseable dates become NaT and are dropped
complaints['created_date'] = pd.to_datetime(complaints['created_date'], errors='coerce')
complaints['closed_date'] = pd.to_datetime(complaints['closed_date'], errors='coerce')
complaints = complaints.dropna(subset=['created_date', 'closed_date', 'incident_zip', 'complaint_type', 'open_data_channel_type']).copy()
complaints['incident_zip'] = complaints['incident_zip'].astype(str).str.split('.').str[0].str.zfill(5)
complaints['hour'] = complaints['created_date'].dt.hour

# Merge demographic income; complaints whose ZIP has no 2022 income are discarded
merged = complaints.merge(demographics[['ZIP', 'Median Household Income']], left_on='incident_zip', right_on='ZIP', how='left')
merged = merged.dropna(subset=['Median Household Income']).copy()

# Define target variables
merged['is_smartphone'] = merged['open_data_channel_type'].str.upper().eq('MOBILE').astype(int)
resolution_seconds = (merged['closed_date'] - merged['created_date']).dt.total_seconds()
# Require a non-negative duration: a record closed before it was created is a
# data error and must not be counted as a fast closure.
merged['closed_within_24h'] = ((resolution_seconds >= 0) & (resolution_seconds < 86400)).astype(int)

# Encode complaint type as integer codes (tree splits do not need one-hot columns)
le = LabelEncoder()
merged['complaint_encoded'] = le.fit_transform(merged['complaint_type'])

# Feature columns
features = ['complaint_encoded', 'Median Household Income', 'hour']

### --- MODEL 1: Smartphone Complaints ---
X1 = merged[features]
y1 = merged['is_smartphone']
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

rf1 = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf1.fit(X_train1, y_train1)
preds1 = rf1.predict(X_test1)

print("Smartphone Complaint Prediction:\n")
print(classification_report(y_test1, preds1, zero_division=0))

### --- MODEL 2: Closure Within 24 Hours ---
X2 = merged[features]
y2 = merged['closed_within_24h']
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

rf2 = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf2.fit(X_train2, y_train2)
preds2 = rf2.predict(X_test2)

print("\n Complaint Closure Within 24 Hours Prediction:\n")
print(classification_report(y_test2, preds2, zero_division=0))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Draw the two confusion matrices as annotated heatmaps and save them as
# Figures 12 and 13. Reads y_test1/preds1 and y_test2/preds2 from the kernel,
# i.e. from whichever modelling cell was executed most recently.

# Figure 12 – smartphone-channel model
cm1 = confusion_matrix(y_test1, preds1)
labels = ['Not Smartphone', 'Smartphone']

fig1, ax1 = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm1,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax1,
)
ax1.set_title("Confusion Matrix: Smartphone Complaint Prediction")
ax1.set_xlabel("Predicted Label")
ax1.set_ylabel("True Label")
fig1.tight_layout()
fig1.savefig("D:\\Rekha\\Capstone\\Visualizations\\Figure 12.png", dpi=300)
plt.show()

# Figure 13 – 24-hour closure model
cm2 = confusion_matrix(y_test2, preds2)
labels = ['Not Closed in 24h', 'Closed in 24h']

fig2, ax2 = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm2,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax2,
)
ax2.set_title("Confusion Matrix: Closure Within 24 Hours")
ax2.set_xlabel("Predicted Label")
ax2.set_ylabel("True Label")
fig2.tight_layout()
fig2.savefig("D:\\Rekha\\Capstone\\Visualizations\\Figure 13.png", dpi=300)
plt.show()

XGBoost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# XGBoost classifiers for two binary targets:
#   1. is_smartphone      – complaint was filed through the MOBILE channel
#   2. closed_within_24h  – complaint was closed less than 24 h after creation
# Leaves X1/y1, X2/y2, the train/test splits, xgb1/xgb2 and preds1/preds2 in
# the kernel namespace (the model-comparison cell reads X1/y1 from here).

# Load Data
complaints = pd.read_csv("D:\\Rekha\\Capstone\\Data\\Cleaned\\311_cleaned_final.csv", low_memory=False)
demographics = pd.read_csv("D:\\Rekha\\Capstone\\Data\\nyc_zip_income_pop_gender_2019_2022.csv")
# .copy() after each filter gives an independent frame, so the column
# assignments that follow do not raise SettingWithCopyWarning.
demographics = demographics[demographics["Year"] == 2022].copy()
# Normalise exactly like incident_zip below so the join keys agree even if ZIP
# was parsed as a float column.
demographics["ZIP"] = demographics["ZIP"].astype(str).str.split('.').str[0].str.zfill(5)

# Preprocessing: unparseable dates become NaT and are dropped with other gaps
complaints['created_date'] = pd.to_datetime(complaints['created_date'], errors='coerce')
complaints['closed_date'] = pd.to_datetime(complaints['closed_date'], errors='coerce')
complaints = complaints.dropna(subset=['created_date', 'closed_date', 'incident_zip', 'complaint_type', 'open_data_channel_type']).copy()

complaints['incident_zip'] = complaints['incident_zip'].astype(str).str.split('.').str[0].str.zfill(5)
complaints['hour'] = complaints['created_date'].dt.hour

# Merge income; complaints whose ZIP has no 2022 income figure are discarded
merged = complaints.merge(demographics[['ZIP', 'Median Household Income']], left_on='incident_zip', right_on='ZIP', how='left')
merged = merged.dropna(subset=['Median Household Income']).copy()

# Encode target variables
merged['is_smartphone'] = merged['open_data_channel_type'].str.upper().eq('MOBILE').astype(int)
resolution_seconds = (merged['closed_date'] - merged['created_date']).dt.total_seconds()
# Require a non-negative duration: a record closed before it was created is a
# data error and must not be counted as a fast closure.
merged['closed_within_24h'] = ((resolution_seconds >= 0) & (resolution_seconds < 86400)).astype(int)

# Encode complaint type as integer codes
le = LabelEncoder()
merged['complaint_encoded'] = le.fit_transform(merged['complaint_type'])

# Features to use
features = ['complaint_encoded', 'Median Household Income', 'hour']

# -----------------------------
# Model 1: Smartphone Complaints
# -----------------------------
X1 = merged[features]
y1 = merged['is_smartphone']

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

# NOTE(review): scale_pos_weight=3 is hard-coded here while Model 2 below uses
# no imbalance weighting at all — confirm the value against the actual
# negative/positive ratio of y_train1, and whether Model 2 needs the same.
xgb1 = XGBClassifier(eval_metric='logloss', scale_pos_weight=3)
xgb1.fit(X_train1, y_train1)
preds1 = xgb1.predict(X_test1)

print("Smartphone Complaint Prediction:\n")
print(classification_report(y_test1, preds1))

# -----------------------------
# Model 2: Closure Within 24h
# -----------------------------
X2 = merged[features]
y2 = merged['closed_within_24h']

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

xgb2 = XGBClassifier(eval_metric='logloss')
xgb2.fit(X_train2, y_train2)
preds2 = xgb2.predict(X_test2)

print("\n Closure Within 24 Hours Prediction:\n")
print(classification_report(y_test2, preds2))


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# Side-by-side comparison of four classifiers on the smartphone-channel target.
# Requires X1 and y1 to already exist in the kernel: they are created by the
# modelling cells above, so run one of those first.
# NOTE(review): the loop variable `model` overwrites the OLS results object of
# the same name created in the regression cell.
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Naive Bayes": GaussianNB(),
    # Same configuration as the standalone Random Forest cell; the fixed seed
    # makes the comparison (and Figure 14) reproducible across runs.
    "Random Forest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', scale_pos_weight=3)
}

# Store metrics: one row per model, support-weighted averages plus accuracy
metrics_dict = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    report = classification_report(y_test, preds, output_dict=True, zero_division=0)
    weighted = report['weighted avg']
    metrics_dict[name] = {
        'precision': weighted['precision'],
        'recall': weighted['recall'],
        'f1-score': weighted['f1-score'],
        'support': weighted['support'],
        'accuracy': accuracy_score(y_test, preds)
    }
    # Echo the stored values in insertion order:
    # precision, recall, f1-score, support, accuracy
    print(name)
    for value in metrics_dict[name].values():
        print(value)

# Create DataFrame (models as rows, metrics as columns)
metrics_df = pd.DataFrame(metrics_dict).T

# Plot
# NOTE(review): support-weighted recall is mathematically equal to accuracy, so
# those two bars always coincide; macro averages may be more informative for an
# imbalanced target.
metrics_df.drop(columns=['support']).plot(kind='bar', figsize=(10, 6))
plt.title('Model Comparison: Precision, Recall, F1, Accuracy')
plt.ylabel('Score')
plt.ylim(0, 1.1)
plt.xticks(rotation=45)
plt.legend(loc='lower right')
plt.grid(True, axis='y')
plt.tight_layout()
plt.savefig("D:\\Rekha\\Capstone\\Visualizations\\Figure 14.png", dpi=300)
plt.show()
