# 🧠 Sales Forecasting Project
This notebook cleans a raw sales dataset, engineers calendar and categorical features, fits a linear regression model, and uses it to forecast daily sales for the next 90 days.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the raw sales CSV, then swap the textual Date column for parsed datetimes
df = pd.read_csv("sales_data_dirty.csv")
df = df.assign(Date=pd.to_datetime(df['Date']))

In [None]:
# Data Cleaning
# Each filled column is assigned back to the frame. Calling
# fillna(..., inplace=True) on df['col'] mutates an intermediate Series; with
# pandas Copy-on-Write that change never reaches df, so the NaNs would survive
# and break model fitting further down.
df['Promo'] = df['Promo'].fillna(0)  # a missing promo flag is treated as "no promotion"
df['Customers'] = df['Customers'].fillna(df['Customers'].mean())  # mean imputation
df['Sales'] = df['Sales'].fillna(df['Sales'].mean())  # mean imputation
df['Store'] = df['Store'].fillna('Unknown')
df['Weather'] = df['Weather'].fillna('unknown')
df['Day_Type'] = df['Day_Type'].fillna('unknown')

# Feature Engineering
# Weekday is the day-of-week index (Monday=0 ... Sunday=6); the *_Code columns
# are the integer category codes pandas assigns to each distinct label.
df['Weekday'] = df['Date'].dt.weekday
df['Store_Code'] = df['Store'].astype('category').cat.codes
df['Weather_Code'] = df['Weather'].astype('category').cat.codes
df['DayType_Code'] = df['Day_Type'].astype('category').cat.codes

In [None]:
# Modeling
# Select the numeric/encoded predictors and the Sales target, hold out 20% of
# the rows for testing (fixed seed), and fit an ordinary least-squares model
# on the remaining 80%.
X = df.loc[
    :,
    ['Customers', 'Promo', 'Weekday', 'Store_Code', 'Weather_Code', 'DayType_Code'],
]
y = df.loc[:, 'Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Forecasting next 90 days
# Builds a synthetic feature table for store 'A' covering the 90 days after the
# last date in the training data, then scores it with the fitted model.
# Promo, Customers and Weather are simulated; Weekday and Day_Type are derived
# from the calendar.


def _encode_with_training_codes(values, labels, codes, column):
    """Translate category labels into the integer codes used during training.

    values: Series of labels to encode.
    labels, codes: row-aligned training columns (label and its code).
    column: column name, used only in the error message.

    Raises ValueError when a label was never seen in training, because it would
    otherwise become NaN and fail later inside model.predict.
    """
    # Zipping the row-aligned columns guarantees each label gets its own code.
    mapping = dict(zip(labels, codes))
    encoded = values.map(mapping)
    unseen = sorted(values[encoded.isna()].unique())
    if unseen:
        raise ValueError(
            f"{column} values {unseen} were never seen in the training data"
        )
    return encoded


forecast_days = 90
# Seeded generator so the simulated inputs (and therefore the forecast) are
# reproducible from run to run, matching the seeded train/test split.
rng = np.random.default_rng(42)

last_date = df['Date'].max()
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=forecast_days)
future_df = pd.DataFrame({'Date': future_dates})

# Reuse the code the training data assigned to store 'A'.
store_a_codes = df.loc[df['Store'] == 'A', 'Store_Code']
if store_a_codes.empty:
    raise ValueError("Store 'A' does not appear in the training data")
future_df['Store'] = 'A'
future_df['Store_Code'] = store_a_codes.iloc[0]

future_df['Promo'] = rng.choice([0, 1], size=forecast_days, p=[0.4, 0.6])
simulated_customers = rng.normal(loc=df['Customers'].mean(), scale=50, size=forecast_days)
# A customer count cannot be negative, so clamp the draws at zero.
future_df['Customers'] = np.clip(simulated_customers, 0, None).astype(int)
future_df['Weather'] = rng.choice(['sunny', 'rainy', 'snowy'], size=forecast_days, p=[0.6, 0.3, 0.1])
future_df['Weather_Code'] = _encode_with_training_codes(
    future_df['Weather'], df['Weather'], df['Weather_Code'], 'Weather'
)
future_df['Weekday'] = future_df['Date'].dt.weekday
# Saturday (5) and Sunday (6) count as weekend.
future_df['Day_Type'] = future_df['Weekday'].map(lambda x: 'weekend' if x >= 5 else 'weekday')
future_df['DayType_Code'] = _encode_with_training_codes(
    future_df['Day_Type'], df['Day_Type'], df['DayType_Code'], 'Day_Type'
)

X_future = future_df[['Customers', 'Promo', 'Weekday', 'Store_Code', 'Weather_Code', 'DayType_Code']]
future_df['Predicted_Sales'] = model.predict(X_future)

In [None]:
# Plot forecast
# Draw the predicted daily sales as an orange line with point markers on a
# wide figure, with slanted date labels so they do not overlap.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(
    future_df['Date'],
    future_df['Predicted_Sales'],
    marker='o',
    color='orange',
)
ax.set_title('Daily Sales Forecast for the Next 3 Months')
ax.set_xlabel('Date')
ax.set_ylabel('Forecasted Sales')
plt.xticks(rotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()