In [None]:
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind

In [None]:
# 1. Logging Setup
# Send INFO-and-above records to a log file (path relative to the working
# directory); every record is prefixed with its timestamp and level name.
logging.basicConfig(
    filename='store_sales_analysis.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# First record: marks the start of the data-loading stage.
logging.info('Loading the dataset...')

In [None]:
# 2. Load Data
# Read both input tables; the paths are relative to the working directory.
# store_data is the frame whose missing values are filled below and that is
# later merged with train_data on the 'Store' column.
store_data, train_data = map(pd.read_csv, ('store.csv', 'train.csv'))

In [None]:
# 3. Data Cleaning
logging.info('Handling missing data...')

# Record the per-column null counts before anything is filled.
missing_values = store_data.isnull().sum()
logging.info(f"Missing values in dataset:\n{missing_values}")

# Fill every gap with one DataFrame-level call that returns a new frame.
# Calling `store_data[col].fillna(..., inplace=True)` works on a column
# selected out of the frame: pandas warns about that chained in-place pattern,
# and with Copy-on-Write enabled it leaves `store_data` itself unchanged.
# Rebinding the name also keeps this cell safe to re-run.
store_data = store_data.fillna({
    # Missing competitor distance -> median of the observed distances.
    'CompetitionDistance': store_data['CompetitionDistance'].median(),
    # Missing competition opening date parts -> 0.
    'CompetitionOpenSinceMonth': 0,
    'CompetitionOpenSinceYear': 0,
    # Missing Promo2 start week/year -> 0.
    'Promo2SinceWeek': 0,
    'Promo2SinceYear': 0,
    # Missing promo interval -> the literal string 'None' (not Python's None).
    'PromoInterval': 'None',
})

In [None]:
# 4. Merge store and sales data
# Attach the store-level columns to every sales row via the shared 'Store'
# column. pd.merge defaults to an inner join, so rows whose 'Store' value is
# missing from either frame are dropped.
# (The left operand was previously the undefined name `train_data_data`.)
merged_data = pd.merge(train_data, store_data, on='Store')

In [None]:
# 5. Check for outliers in sales
logging.info('Checking for outliers in sales data...')

# Box plot of the raw 'Sales' column, drawn before any rows are removed.
sns.boxplot(data=merged_data, x='Sales')
plt.title('Outliers in Sales')
plt.show()

# Keep only rows with strictly positive sales (drops the Sales == 0 rows).
# This rebinds merged_data, so every later cell sees the filtered frame.
merged_data = merged_data.loc[merged_data['Sales'].gt(0)]

In [None]:
# 6. Check for distribution in promotions
# NOTE(review): the log message mentions training and test sets, but only
# merged_data (built from train.csv and store.csv) is plotted here.
logging.info('Checking promotion distribution between training and test sets...')

# Number of rows for each distinct value of the 'Promo' column.
promo_axes = sns.countplot(data=merged_data, x='Promo')
promo_axes.set_title('Promo Distribution in Dataset')
plt.show()

In [None]:
# 7. Sales behavior before, during, and after holidays
logging.info('Analyzing sales behavior during holidays...')

# One box per distinct 'StateHoliday' value, showing the spread of 'Sales'.
# The original statement was cut off after `data=merged`, leaving the call
# unclosed; it is completed against merged_data and given labels.
# NOTE(review): this compares holiday categories only; it does not yet split
# the days before or after a holiday, as the heading suggests.
plt.figure(figsize=(12, 6))
sns.boxplot(x='StateHoliday', y='Sales', data=merged_data)
plt.title('Sales by State Holiday')
plt.xlabel('StateHoliday')
plt.ylabel('Sales')
plt.show()

In [None]:
# Plotting Weekly Sales
# Parse 'Date' so the .dt accessor can be used on it below.
merged_data['Date'] = pd.to_datetime(merged_data['Date'])

# ISO week number of every row. Rows from different years that share a week
# number land in the same group in the aggregation below.
merged_data['Week'] = merged_data['Date'].dt.isocalendar()['week']

# One row per week number, holding the summed sales for that week number.
weekly_sales = merged_data.groupby('Week', as_index=False)['Sales'].sum()

# Line chart of total sales against week number.
plt.figure(figsize=(10, 6))
sns.lineplot(x='Week', y='Sales', data=weekly_sales)
plt.title('Weekly Sales')
plt.xlabel('Week')
plt.ylabel('Total Sales')
plt.show()

In [None]:
# Plotting Monthly Sales
# Calendar month (1-12) of every row. Relies on 'Date' already being a
# datetime column, which the weekly-sales cell takes care of.
merged_data['Month'] = merged_data['Date'].dt.month

# One row per calendar month, holding the summed sales for that month.
monthly_sales = merged_data.groupby('Month', as_index=False)['Sales'].sum()

# Line chart of total sales against calendar month.
plt.figure(figsize=(10, 6))
sns.lineplot(x='Month', y='Sales', data=monthly_sales)
plt.title('Monthly Sales')
plt.xlabel('Month')
plt.ylabel('Total Sales')
plt.show()

Explanation of Seasonal Decomposition:
Trend: The overall long-term direction of the series (upward or downward) over time.
Seasonal: Recurring patterns that repeat with a fixed period (e.g., sales spikes during holiday seasons).
Residual: What remains after removing the trend and seasonal components, representing noise or random fluctuations.

In [None]:
# Seasonal Decomposition
from statsmodels.tsa.seasonal import seasonal_decompose

# Build the daily series without touching merged_data. The previous
# `merged_data.set_index('Date', inplace=True)` removed the 'Date' column from
# the shared frame, so running this cell a second time raised a KeyError.
# Relies on 'Date' already being a datetime column (done in the weekly cell).
# Resampling to calendar days sums all rows of a day; days with no rows get 0.
daily_sales = (
    merged_data
    .set_index('Date')['Sales']
    .resample('D')
    .sum()
)

# Additive decomposition with a yearly period. statsmodels needs at least two
# full periods of data (2 * 365 daily observations) and raises otherwise.
decomposition = seasonal_decompose(daily_sales, model='additive', period=365)

# Plot the observed, trend, seasonal and residual components.
decomposition.plot()
plt.show()

Insights from the Plots:
ACF: Helps to identify the presence of significant correlations at specific lag values. A high autocorrelation at a particular lag indicates that values at that lag are strongly related to the current values.
PACF: Isolates the direct relationship between the time series and a given lag, after removing the influence of the intermediate lags.
These plots are essential for choosing the orders of time-series models such as ARIMA: the PACF guides the autoregressive order p and the ACF guides the moving-average order q. The differencing order d is chosen separately, e.g. from stationarity checks.

In [None]:
# ACF / PACF of daily sales
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Build the series from the sales table that is already in memory.
# This cell used to read a hard-coded '/mnt/data/store.csv' and expect 'Date'
# and 'Sales' columns in it; elsewhere in this notebook those columns are only
# read from the merged sales data, so train_data is presumably their source.
# .assign() returns a new frame, so train_data is left untouched.
df = (
    train_data
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

# Total sales per calendar day (days without rows become 0).
daily_sales = df['Sales'].resample('D').sum()

# Draw both correlograms on explicit axes. Without `ax=`, plot_acf/plot_pacf
# each open their own figure and a plain plt.figure() would stay empty.
fig, (acf_ax, pacf_ax) = plt.subplots(2, 1, figsize=(12, 8))

# NOTE(review): 40 lags is an arbitrary default; plot_pacf requires the lag
# count to be below half the number of observations.
plot_acf(daily_sales, lags=40, ax=acf_ax)
plot_pacf(daily_sales, lags=40, ax=pacf_ax)

fig.tight_layout()
plt.show()


In [None]:
# Plot rolling statistics
# Build the series from the sales table already in memory instead of reading
# a hard-coded '/mnt/data/store.csv'; elsewhere in this notebook 'Date' and
# 'Sales' are only read from the merged sales data, so train_data is
# presumably their source. .assign() leaves train_data itself untouched.
df = (
    train_data
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        # Non-numeric 'Sales' entries become NaN rather than raising.
        Sales=lambda frame: pd.to_numeric(frame['Sales'], errors='coerce'),
    )
    .set_index('Date')
)

# Collapse to one value per calendar day first. An integer rolling window
# counts rows, so it only means "days" on a one-row-per-day series; the sales
# table is keyed by 'Store' as well, so it presumably has many rows per date.
daily_sales = df['Sales'].resample('D').sum()

# Choose a window size (e.g., 7 days for weekly, 30 for monthly)
window_size = 30

# Rolling mean and standard deviation over `window_size` consecutive days.
# The first window_size - 1 values are NaN until the window fills.
rolling_mean = daily_sales.rolling(window=window_size).mean()
rolling_std = daily_sales.rolling(window=window_size).std()

# Plot the daily series together with its rolling statistics.
plt.figure(figsize=(12, 6))
plt.plot(daily_sales, color='blue', label='Daily Sales')
plt.plot(rolling_mean, color='red', label=f'{window_size}-day Rolling Mean')
plt.plot(rolling_std, color='black', label=f'{window_size}-day Rolling Std')

# Add titles and labels
plt.title(f'Rolling Mean & Standard Deviation (Window Size = {window_size})')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend(loc='best')
plt.show()


In [None]:
# 12-Month Rolling Mean
# Build the series from the sales table already in memory instead of reading
# a hard-coded '/mnt/data/store.csv'; elsewhere in this notebook 'Date' and
# 'Sales' are only read from the merged sales data, so train_data is
# presumably their source. .assign() leaves train_data itself untouched.
df = (
    train_data
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        # Non-numeric 'Sales' entries become NaN rather than raising.
        Sales=lambda frame: pd.to_numeric(frame['Sales'], errors='coerce'),
    )
    .set_index('Date')
)

# Collapse to one value per calendar day so that the integer window below
# really spans days rather than rows.
daily_sales = df['Sales'].resample('D').sum()

# Choose a window size for 12 months (approximately 12 * 30 days = 360 days)
window_size = 360  # 12 months of daily data

# Rolling mean over `window_size` consecutive days; the first
# window_size - 1 values are NaN until the window fills.
rolling_mean_12m = daily_sales.rolling(window=window_size).mean()

# Plot the daily series and its 12-month rolling mean.
plt.figure(figsize=(12, 6))
plt.plot(daily_sales, color='blue', label='Daily Sales')
plt.plot(rolling_mean_12m, color='red', label='12-Month Rolling Mean')

# Add titles and labels
plt.title('12-Month Rolling Mean of Sales')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend(loc='best')
plt.show()


In [None]:
# 1. Data Preparation
# NOTE(review): `sales_data` is not created by any cell visible above this
# one; it has to exist (with 'Date' and 'Sales' columns) before this runs.
# Parse 'Date' into datetimes so the .dt accessor works in later cells.
sales_data['Date'] = sales_data['Date'].pipe(pd.to_datetime)

# 2. 12-Month Rolling Standard Deviation
# NOTE(review): the window is 12 *rows*. That equals 12 months only if
# sales_data holds one row per month - confirm the frame's granularity.
sales_data['12M_Rolling_Std'] = sales_data['Sales'].rolling(12).std()



In [None]:
# 3. Average Sales by Day of the Week
# Weekday name ('Monday', 'Tuesday', ...) of each row's date.
sales_data['Day_of_Week'] = sales_data['Date'].dt.day_name()

# Mean sales per weekday name. groupby sorts its keys, so the result is in
# alphabetical order rather than calendar order.
avg_sales_by_day = sales_data['Sales'].groupby(sales_data['Day_of_Week']).mean()



In [None]:
# 4. Holiday Column
# Boolean flag: True where the row's date is one of `holiday_dates`.
# NOTE(review): `holiday_dates` is a placeholder that no visible cell defines;
# replace it with the actual collection of holiday dates before running.
sales_data['Holiday'] = sales_data['Date'].isin(values=holiday_dates)



In [None]:
# 5. Sales Distribution: Holiday vs. Non-Holiday
plt.figure(figsize=(10, 5))

# Long-form call: the frame goes in `data`, and columns are named for `x` and
# `hue`. Passing the 'Sales' Series positionally made seaborn treat it as
# wide-form data, which rejects a separate `hue` with a ValueError.
sns.histplot(data=sales_data, x='Sales', hue='Holiday')
plt.title('Sales Distribution by Holiday')
plt.show()


In [None]:

# 6. Summary Statistics
# Descriptive statistics (count, mean, std, min, quartiles, max) for the two
# columns, printed as plain text.
print(sales_data.loc[:, ['Sales', 'Customers']].describe())


In [None]:

# 7. Holiday Effect
# Split 'Sales' by the boolean 'Holiday' flag. A single .loc[mask, column]
# lookup replaces the chained `frame[mask]['Sales']` indexing.
holiday_mask = sales_data['Holiday'] == True
holiday_sales = sales_data.loc[holiday_mask, 'Sales']
non_holiday_sales = sales_data.loc[~holiday_mask, 'Sales']

# Two-sample t-test on the two groups. ttest_ind comes from scipy.stats,
# imported at the top of the notebook (it was previously never imported).
t_stat, p_value = ttest_ind(holiday_sales, non_holiday_sales)
print('T-statistic:', t_stat, 'P-value:', p_value)



In [None]:
# 8. Promo Effect Over Time
# NOTE(review): no visible cell creates `sales_data` or a 'Promotion' column;
# the merged frame earlier in the notebook names its promo flag 'Promo'.
# Confirm which column name applies here.

# Assuming 'Promotion' is a binary indicator
# A single .loc[mask, column] lookup replaces the chained indexing.
promo_sales = sales_data.loc[sales_data['Promotion'] == 1, 'Sales']
non_promo_sales = sales_data.loc[sales_data['Promotion'] == 0, 'Sales']

# Two-sample t-test of promo vs. non-promo sales. ttest_ind comes from
# scipy.stats, imported at the top of the notebook (previously never imported).
t_stat_promo, p_value_promo = ttest_ind(promo_sales, non_promo_sales)
print('Promo Effect T-statistic:', t_stat_promo, 'P-value:', p_value_promo)

# Visualize the effect over time: one line per 'Promotion' value.
plt.figure(figsize=(12, 6))
sns.lineplot(x='Date', y='Sales', hue='Promotion', data=sales_data)
plt.title('Sales Over Time by Promotion')
plt.show()

# 9. Store Type Performance Over Time
# Drawn only when the frame actually carries a 'Store_Type' column.
if 'Store_Type' in sales_data.columns:
    plt.figure(figsize=(12, 6))
    sns.lineplot(x='Date', y='Sales', hue='Store_Type', data=sales_data)
    plt.title('Sales Over Time by Store Type')
    plt.show()

# 10. Sales vs. Customers Scatter Plot
# Open an explicit figure so this plot is sized like the others in the cell.
plt.figure(figsize=(12, 6))
plt.scatter(sales_data['Customers'], sales_data['Sales'])
plt.xlabel('Customers')
plt.ylabel('Sales')
plt.title('Sales vs. Customers')
plt.show()


In [None]:

# 11. Cumulative Sales Over Time
# Running total of 'Sales' in the frame's current row order.
# NOTE(review): this is "over time" only if the rows are sorted by date -
# confirm, since no visible cell sorts sales_data.
sales_data['Cumulative_Sales'] = sales_data['Sales'].cumsum(skipna=True)



In [None]:
# 12. Daily Sales Growth Rate
# Fractional change of 'Sales' relative to the previous row; the first row
# has no predecessor and is therefore NaN.
sales_data['Daily_Growth_Rate'] = sales_data['Sales'].pct_change(periods=1)

# 13. Data Cleaning and Correlation Analysis
# Pairwise correlations between the four selected columns.
correlation_matrix = sales_data.loc[:, ['Sales', 'Customers', 'Promotion', 'Holiday']].corr()
print(correlation_matrix)

# Heatmap of the matrix with each coefficient written into its cell.
sns.heatmap(data=correlation_matrix, cmap='viridis', annot=True)
plt.title('Correlation Matrix')
plt.show()