In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import joblib

# Load the dataset.
# The path is relative, so it is resolved against the current working
# directory of the process.
try:
    df = pd.read_csv('street_light_data_shuffled.csv')
except FileNotFoundError:
    print("Error: 'street_light_data_shuffled.csv' not found.")
    print("Please make sure the CSV file is in the same directory as the script.")
    # `exit()` is a helper injected by the `site` module for interactive use
    # (it is not guaranteed to exist, e.g. under `python -S`), and calling it
    # with no argument reports success. Raise SystemExit with a non-zero
    # status so shells and calling processes can detect the failure.
    raise SystemExit(1)

# --- Feature Engineering ---
# 'ambience_lux' is the only predictor. The double brackets keep it as a
# one-column DataFrame (2-D) rather than a 1-D Series.
X = df[['ambience_lux']]

# --- Target Variable Preparation ---
# Encode the 'light_needed' column from text ('yes'/'no') to numbers (1/0),
# which is the form the logistic regression model is trained on.
y = df['light_needed'].map({'yes': 1, 'no': 0})

# Series.map() turns every value that is missing from the mapping into NaN
# without complaint. Fail here, naming the offending labels, instead of
# letting the NaNs surface later as an unrelated error during model fitting.
unmapped = df.loc[y.isna(), 'light_needed']
if not unmapped.empty:
    raise ValueError(
        "Unexpected values in 'light_needed' (expected 'yes' or 'no'): "
        f"{unmapped.unique().tolist()}"
    )

# --- Data Splitting ---
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state makes the shuffle, and therefore the split,
# identical on every run.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many samples ended up in each partition.
print("Data successfully split:")
for label, subset in (("Training", X_train), ("Testing", X_test)):
    print(f" - {label} set size: {len(subset)} samples")
print("-" * 30)

# --- Model Training ---
print("Training the logistic regression model...")

# Build a logistic regression classifier with default settings and fit it on
# the training partition. fit() returns the estimator itself, so construction
# and training can be chained.
model = LogisticRegression().fit(X_train, y_train)

print("Model training complete.")
print("-" * 30)


# --- Save the Model ---
# Persist the fitted estimator to disk so it can be reloaded later without
# retraining.
model_filename = 'street_light_model.pkl'
joblib.dump(model, filename=model_filename)

print(f"Model has been exported to '{model_filename}'")
print("-" * 30)


# --- Model Evaluation ---
# Predict on the held-out test data, which the model did not see in training.
print("Evaluating the model on the test set...")
y_pred = model.predict(X_test)

# Overall fraction of test samples that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print("-" * 30)

# --- Detailed Performance Report ---
# Per-class precision, recall and F1.
# `labels=[0, 1]` pins the class order to the 1/0 encoding of the target so
# that the two display names always line up with the right class. Without it
# the report raises if only one class happens to appear in y_test/y_pred,
# because the number of detected classes no longer matches `target_names`.
print("Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=['No Light Needed', 'Light Needed'],
))
print("-" * 30)

# --- Visualization of Results ---
# Build a confusion matrix (rows = actual class, columns = predicted class)
# and save it as a heatmap.
# `labels=[0, 1]` forces a 2x2 matrix in the order 0 ("no"), 1 ("yes") even
# when one class is absent from y_test/y_pred, so the matrix always matches
# the two hard-coded tick labels below.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No Light Needed', 'Light Needed'],
            yticklabels=['No Light Needed', 'Light Needed'])
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.savefig('confusion_matrix.png')
print("A confusion matrix visualization has been saved as 'confusion_matrix.png'.")

# Plot the logistic regression curve
plt.figure(figsize=(10, 7))
# Scatter plot of the actual test points: lux on x, 0/1 label on y.
sns.scatterplot(x=X_test['ambience_lux'], y=y_test, label='Actual Data', alpha=0.6)

# Evenly spaced lux values spanning the range seen in the test set, shaped
# as a single-feature column (300 rows x 1 column).
X_range = np.linspace(X_test['ambience_lux'].min(), X_test['ambience_lux'].max(), 300).reshape(-1, 1)
# The model was fitted on a DataFrame, so scikit-learn recorded the column
# name. Predicting on a bare ndarray triggers its "X does not have valid
# feature names" warning; wrap the grid in a DataFrame with the same columns.
X_range_df = pd.DataFrame(X_range, columns=X_test.columns)
# Column 1 of predict_proba is the probability of class 1 ("light needed").
y_proba = model.predict_proba(X_range_df)[:, 1]

# Plot the logistic curve
plt.plot(X_range, y_proba, color='red', linewidth=2, label='Logistic Regression Curve')
plt.title('Logistic Regression Decision Boundary')
plt.xlabel('Ambience (Lux)')
plt.ylabel('Probability of Needing Light')
plt.legend()
plt.grid(True)
plt.savefig('logistic_curve.png')

print("A plot of the logistic curve has been saved as 'logistic_curve.png'.")

