In [1]:
# Standard-library and third-party imports
import os
import sys
import warnings

import pandas as pd

# Put the parent of the current working directory at the front of the module
# search path so the local `scripts` package can be imported below
# (presumably the project root when the notebook runs from its own folder).
sys.path.insert(0, os.path.dirname(os.getcwd()))
from scripts.logger import Logger
from scripts.data_cleaning import DataCleaner

# Keep cell output readable by hiding FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Initialize the logger shared by every cell below.
# NOTE(review): the argument looks like the log file name — confirm against
# scripts.logger.Logger. Messages also appear in the cell output at INFO level.
logger = Logger('rossmann_analysis.log')

In [3]:
# Load data
# The three CSVs share one directory and one set of read options, so both are
# named once here instead of being repeated (and drifting) in every call.
DATA_DIR = '../data'
CSV_OPTIONS = {
    'low_memory': False,  # infer each column's dtype from the whole file, not per chunk
    'index_col': False,   # never promote the first column to the index
}

logger.log('Loading data...')
train_data = pd.read_csv(f'{DATA_DIR}/train.csv', **CSV_OPTIONS)
test_data = pd.read_csv(f'{DATA_DIR}/test.csv', **CSV_OPTIONS)
store_data = pd.read_csv(f'{DATA_DIR}/store.csv', **CSV_OPTIONS)

logger.log('Data loaded successfully.')

In [4]:
# Record what is being shown, then preview the top of the training set.
logger.log("First 5 rows of the training data (train_data)")
train_data.head(n=5)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [7]:
# Record what is being shown, then preview the top of the test set.
logger.log("First 5 rows of the testing data (test_data)")
test_data.head(n=5)

2024-09-20 16:49:03,138 - INFO - First 5 rows of the testing data (test_data)


Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0


In [8]:
# Record what is being shown, then preview the top of the store metadata.
logger.log("First 5 rows of the store data (store_data)")
store_data.head(n=5)

2024-09-20 16:49:35,534 - INFO - First 5 rows of the store data (store_data)


Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


### Statistical analysis

In [11]:
# Column-level summary of the training set: dtype and non-null count per
# column, plus the frame's memory footprint.
logger.log("train_data.info()")
train_data.info()

2024-09-20 17:02:31,570 - INFO - train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   Store          1017209 non-null  int64 
 1   DayOfWeek      1017209 non-null  int64 
 2   Date           1017209 non-null  object
 3   Sales          1017209 non-null  int64 
 4   Customers      1017209 non-null  int64 
 5   Open           1017209 non-null  int64 
 6   Promo          1017209 non-null  int64 
 7   StateHoliday   1017209 non-null  object
 8   SchoolHoliday  1017209 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 69.8+ MB


In [12]:
# Column-level summary of the test set: dtype and non-null count per column.
# Compare each non-null count with the entry total to spot missing values.
logger.log("test_data.info()")
test_data.info()

2024-09-20 17:03:46,851 - INFO - test_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41088 entries, 0 to 41087
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             41088 non-null  int64  
 1   Store          41088 non-null  int64  
 2   DayOfWeek      41088 non-null  int64  
 3   Date           41088 non-null  object 
 4   Open           41077 non-null  float64
 5   Promo          41088 non-null  int64  
 6   StateHoliday   41088 non-null  object 
 7   SchoolHoliday  41088 non-null  int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 2.5+ MB


In [13]:
# Column-level summary of the store metadata: dtype and non-null count per
# column. Compare each non-null count with the entry total to spot gaps.
logger.log("store_data.info()")
store_data.info()

2024-09-20 17:03:53,839 - INFO - store_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1112 non-null   float64
 4   CompetitionOpenSinceMonth  761 non-null    float64
 5   CompetitionOpenSinceYear   761 non-null    float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            571 non-null    float64
 8   Promo2SinceYear            571 non-null    float64
 9   PromoInterval              571 non-null    object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB


In [14]:
# Report (rows, columns) for each dataset, one line per frame.
logger.log("Shape of data")
shape_report = (
    ('shape of train data:', train_data),
    ('shape of test data:', test_data),
    ('shape of store data:', store_data),
)
for caption, frame in shape_report:
    print(caption, frame.shape)

2024-09-20 17:13:40,267 - INFO - Shape of data


shape of train data: (1017209, 9)
shape of test data: (41088, 8)
shape of store data: (1115, 10)


### Data Overview

- **Train Data Shape:** `(1017209, 9)`
- **Test Data Shape:** `(41088, 8)`
- **Store Data Shape:** `(1115, 10)`

#### Train Data Info
- Total Entries: `1,017,209`
- Columns: `9` (Store, DayOfWeek, Date, Sales, Customers, Open, Promo, StateHoliday, SchoolHoliday)
- Data Types: 
  - `int64`: 7 columns
  - `object`: 2 columns
- Non-Null Entries:
  - All columns are complete; there are no missing values (every column has 1,017,209 non-null entries).

#### Test Data Info:
- Total Entries: `41,088`
- Columns: `8` (Id, Store, DayOfWeek, Date, Open, Promo, StateHoliday, SchoolHoliday)
- Data Types: 
  - `int64`: 5 columns
  - `float64`: 1 column
  - `object`: 2 columns
- Non-Null Entries:
  - All columns are complete except **Open**, which has 11 missing values (41,077 non-null out of 41,088).

#### Store Data Info:
- Total Entries: `1,115`
- Columns: `10` (Store, StoreType, Assortment, CompetitionDistance, CompetitionOpenSinceMonth, etc.)
- Data Types:
  - `int64`: 2 columns
  - `float64`: 5 columns
  - `object`: 3 columns
- Missing Values:
  - **CompetitionDistance**: 3 missing values
  - **CompetitionOpenSinceMonth** & **CompetitionOpenSinceYear**: 354 missing values each (761 non-null out of 1,115)
  - **Promo2SinceWeek**, **Promo2SinceYear** & **PromoInterval**: 544 missing values each (571 non-null out of 1,115)


<bound method NDFrame.head of          Store  DayOfWeek        Date  Sales  Customers  Open  Promo  \
0            1          5  2015-07-31   5263        555     1      1   
1            2          5  2015-07-31   6064        625     1      1   
2            3          5  2015-07-31   8314        821     1      1   
3            4          5  2015-07-31  13995       1498     1      1   
4            5          5  2015-07-31   4822        559     1      1   
...        ...        ...         ...    ...        ...   ...    ...   
1017204   1111          2  2013-01-01      0          0     0      0   
1017205   1112          2  2013-01-01      0          0     0      0   
1017206   1113          2  2013-01-01      0          0     0      0   
1017207   1114          2  2013-01-01      0          0     0      0   
1017208   1115          2  2013-01-01      0          0     0      0   

        StateHoliday  SchoolHoliday  
0                  0              1  
1                  0         