# Task for Today  

***

## Bakery Day of Sale Prediction  

Given *data about bakery sales*, let's try to predict whether a given day of sales falls on a **weekend** or not.

We will use a logistic regression model to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed. The fully-qualified option
# name is used because the bare 'max_columns' pattern matches more than one
# option in newer pandas releases and raises an OptionError.
pd.set_option('display.max_columns', None)
import re

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the bakery sales CSV into a DataFrame (path is relative to the working directory)
data = pd.read_csv('../input/bakery-sales/Bakery Sales.csv')

In [None]:
# Preview the loaded DataFrame
data

In [None]:
# Print a summary of the DataFrame: columns, non-null counts and dtypes
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df):
    """Turn raw bakery transactions into scaled, per-day train/test sets.

    Transactions are aggregated per calendar date by summing their numeric
    columns, and every date is labelled 'Weekend' (day of week is 'Sat' or
    'Sun') or 'Workday'.

    Args:
        df: Raw sales DataFrame. It must contain the columns 'place',
            'croque monsieur', 'mad garlic', 'datetime', 'day of week' and
            'total'. 'datetime' is expected to hold strings whose trailing
            ' H:MM' time part is stripped to obtain the date. The input is
            not modified.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. The X frames hold the
        standardized daily totals (scaler fitted on the training split only);
        the y Series hold the 'Weekend' / 'Workday' labels. The split is
        70% / 30%, shuffled with ``random_state=1``.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    df = df.copy()

    # Drop place column
    df = df.drop('place', axis=1)

    # Drop single-valued columns
    df = df.drop(['croque monsieur', 'mad garlic'], axis=1)

    # Drop rows without sales or date information
    df = df.dropna(subset=['datetime', 'day of week', 'total']).reset_index(drop=True)

    # Fill remaining missing values with 0
    df = df.fillna(0)

    # Remove time information from datetime column
    df['datetime'] = df['datetime'].str.replace(r' \d+:\d+$', '', regex=True)

    # Remember the day of week of each date (first occurrence wins). Building
    # this lookup once avoids rescanning every transaction for every date.
    day_by_date = df.drop_duplicates(subset='datetime').set_index('datetime')['day of week']

    # Group by date. numeric_only=True keeps the string 'day of week' column
    # out of the sums; newer pandas would otherwise concatenate its values and
    # leave a non-numeric feature behind for the scaler to choke on.
    df = df.groupby(by='datetime', as_index=False).sum(numeric_only=True)

    # Label each date as weekend or workday
    is_weekend = df['datetime'].map(day_by_date).isin(['Sat', 'Sun'])
    y = is_weekend.map({True: 'Weekend', False: 'Workday'}).rename('is_weekend')

    # Features are the daily sums; the date itself is not a feature
    X = df.drop('datetime', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # Scale X (fit on the training split only so no test statistics leak in)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Run the preprocessing pipeline to get the scaled train/test features and their labels
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Inspect the scaled training features
X_train

In [None]:
# Count the training samples per class ('Weekend' vs. 'Workday') to check class balance
y_train.value_counts()

# Training

In [None]:
# Fit a logistic regression classifier with default hyperparameters on the training split
model = LogisticRegression()
model.fit(X_train, y_train)

# Results

In [None]:
# Score the fitted model on the held-out test split and report it as a percentage
test_accuracy = model.score(X_test, y_test)
print("Test Accuracy: {:.2f}%".format(test_accuracy * 100))

In [None]:
# Predict labels for the held-out test split
y_pred = model.predict(X_test)

# Use one explicit class order so the matrix rows/columns, the report rows
# and the axis tick labels all line up
class_names = ['Workday', 'Weekend']
cm = confusion_matrix(y_test, y_pred, labels=class_names)
clr = classification_report(y_test, y_pred, labels=class_names)

# Heatmap cells are one unit wide, so 0.5 and 1.5 are the two cell centres
tick_positions = [0.5, 1.5]

plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', cbar=False)
plt.xticks(ticks=tick_positions, labels=class_names)
plt.yticks(ticks=tick_positions, labels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

print("Classification Report:\n----------------------\n", clr)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/fWSxa0sJfu4