NOTE: Dependencies for the project are listed below:
- scipy==1.2.1
- pandas==0.25.1 
- numpy==1.17.2
- matplotlib==3.1.2
- tqdm==4.41.0
- impyute==0.0.8

## Importing libraries

In [None]:
from scipy import stats

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Render the value of every top-level expression in a cell, not just the last
# one; later cells place several `.head()` calls in a single cell.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

In [None]:
!pip install impyute

In [None]:
from impyute.imputation.cs import fast_knn

## Reading Data

In [None]:
# Load the five hackathon CSV files; the paths are listed in the same order as
# the names they are unpacked into.
(working_data, validation_data, mapping_file, ideal_data, submit) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nielsenhackathon/Hackathon_Working_Data.csv',
        '../input/nielsenhackathon/Hackathon_Validation_Data.csv',
        '../input/nielsenhackathon/Hackathon_Mapping_File.csv',
        '../input/nielsenhackathon/Hackathon_Ideal_Data.csv',
        '../input/nielsenhackathon/Sample Submission.csv',
    )
)

In [None]:
# Preview the first five rows of each input table. Every bare expression is
# kept at top level so the notebook renders each preview.
print('Working Data')
working_data.iloc[:5]
print('Validation Data')
validation_data.iloc[:5]
print('Ideal Data')
ideal_data.iloc[:5]
submit.iloc[:5]

## Idea derivation

In [None]:
# Total VALUE per (store, month, group) in the ideal data, as a flat table.
ideal_data_grouped = (
    ideal_data
    .groupby(['STORECODE', 'MONTH', 'GRP'])['VALUE']
    .sum()
    .reset_index()
)

In [None]:
# Store P1: one bar chart per month showing the total VALUE of each GRP.
fig, ax = plt.subplots(3, 1, sharex=True)

is_p1 = ideal_data_grouped['STORECODE'] == 'P1'
for axis, month in zip(ax, ['M1', 'M2', 'M3']):
    monthly = ideal_data_grouped[is_p1 & (ideal_data_grouped['MONTH'] == month)].copy()
    axis.bar(monthly['GRP'], monthly['VALUE'], align='center')

In [None]:
# Store P2: one labelled bar chart per month showing the total VALUE of each GRP.
fig, ax = plt.subplots(3, 1, sharex=True, figsize=(12, 8))
plt.subplots_adjust(hspace=0.8)
is_p2 = ideal_data_grouped['STORECODE'] == 'P2'
for axis, month in zip(ax, ['M1', 'M2', 'M3']):
    monthly = ideal_data_grouped[is_p2 & (ideal_data_grouped['MONTH'] == month)].copy()
    # Results are bound to `_` exactly as before; none of them is used.
    _ = axis.bar(monthly['GRP'], monthly['VALUE'], align='center')
    # NOTE(review): an empty string is passed as the tick labels, which appears
    # to blank them - confirm that hiding the GRP names is intended.
    _ = axis.set_xticklabels('', rotation=90)
    _ = axis.set_ylabel('TOTALVALUE')
    _ = axis.set_xlabel('GRP')
    _ = axis.set_title(f'TOTALVALUE FOR DIFFERENT GRP FOR STORE P2 IN MONTH {month}')

In [None]:
# Store P3: one bar chart per month showing the total VALUE of each GRP.
fig, ax = plt.subplots(3, 1, sharex=True)

is_p3 = ideal_data_grouped['STORECODE'] == 'P3'
for axis, month in zip(ax, ['M1', 'M2', 'M3']):
    monthly = ideal_data_grouped[is_p3 & (ideal_data_grouped['MONTH'] == month)].copy()
    axis.bar(monthly['GRP'], monthly['VALUE'], align='center')

>Conclusion - Same store, same group, different month => Same value

## Some anomalous groups 

In [None]:
# Group names that occur in the working data but not in the validation data.
set(working_data['GRP'].unique()).difference(validation_data['GRP'].unique())

In [None]:
# Group names that occur in the validation data but not in the working data.
set(validation_data['GRP'].unique()).difference(working_data['GRP'].unique())

In [None]:
# Normalise whitespace in the working-data group names: collapse every run of
# whitespace into a single space and trim both ends.
working_data['GRP'] = [' '.join(name.split()) for name in working_data['GRP']]

In [None]:
# Re-check: group names still present only in the working data.
set(working_data['GRP'].unique()).difference(validation_data['GRP'].unique())

In [None]:
# Re-check: group names still present only in the validation data.
set(validation_data['GRP'].unique()).difference(working_data['GRP'].unique())

## Initialising Procedure

In [None]:
# Temporary constant column; merging on it pairs every validation row with
# every day (a cross join).
validation_data['key'] = [1] * len(validation_data)

In [None]:
# Day numbers 1..31; the same 31-day range is used for every month.
list_of_days = [*range(1, 32)]

In [None]:
# Cross join: one row per (validation row, day), built by merging on the
# constant `key` column and then discarding it.
whole_incomplete_data = (
    validation_data
    .merge(pd.DataFrame({'key': [1] * 31, 'DAY': list_of_days}), on='key')
    .drop(columns='key')
)

In [None]:
# Remove the temporary join column from the validation table in place.
del validation_data['key']

In [None]:
# Total VALUE per (store, month, group, day) in the working data, as a flat table.
working_data_grp_grouped = (
    working_data
    .groupby(['STORECODE', 'MONTH', 'GRP', 'DAY'])['VALUE']
    .sum()
    .reset_index()
)

In [None]:
# Show the first five aggregated rows.
working_data_grp_grouped.iloc[:5]

In [None]:
# Attach the observed daily totals to the full (validation row x day) grid;
# combinations without an observation keep NaN in VALUE.
whole_partial_data = whole_incomplete_data.merge(
    working_data_grp_grouped,
    how='left',
    on=['STORECODE', 'MONTH', 'GRP', 'DAY'],
)

## Algorithm Idea
### Using the observation `Same store, same group, different month => Same value`, we derive assumption 2:

### `Same store, same group, same day, different month => similar value`

In [None]:
# Wide table: one column per GRP, rows indexed by (STORECODE, DAY, MONTH).
# `dropna=False` keeps rows and columns whose entries are all NaN.
whole_partial_data_table_2 = (
    whole_partial_data
    .pivot_table(
        values='VALUE',
        index=['STORECODE', 'MONTH', 'DAY'],
        columns='GRP',
        dropna=False,
    )
    .reset_index()
    .set_index(['STORECODE', 'DAY', 'MONTH'])
)

In [None]:
# Show the first five rows of the wide table.
whole_partial_data_table_2.iloc[:5]

In [None]:
# Fraction of cells that are NaN before any imputation.
whole_partial_data_table_2.isna().values.sum() / whole_partial_data_table_2.size

In [None]:
# Rows for store N1 on day 4 (one row per month), all GRP columns.
whole_partial_data_table_2.loc[('N1', 4), :]

In [None]:
# Same slice with each GRP's NaN replaced by that GRP's mean over the slice
# (display only - nothing is written back to the table).
whole_partial_data_table_2.loc[('N1', 4), :].fillna(
    whole_partial_data_table_2.loc[('N1', 4), :].mean()
)

In [None]:
## algo3 - same store, same day, same group, different month => same value
# For every (store, day) pair, replace a GRP's missing value with the mean of
# that GRP's observed values across the months of the same pair. A GRP with no
# observation in any month of the pair stays NaN.
#
# The result is assigned back instead of calling `fillna(..., inplace=True)` on
# `table.loc[(store, day)]`: that indexing result may be a copy, in which case
# the in-place fill never reaches the table. Grouping on the index levels also
# removes the hard-coded store list (N1..N10) and day range (1..31).
whole_partial_data_table_2 = whole_partial_data_table_2.fillna(
    whole_partial_data_table_2.groupby(level=['STORECODE', 'DAY']).transform('mean')
)

In [None]:
# Show the first five rows after the first imputation step.
whole_partial_data_table_2.iloc[:5]

In [None]:
# Fraction of cells that are still NaN after the first imputation step.
whole_partial_data_table_2.isna().values.sum() / whole_partial_data_table_2.size

In [None]:
# Back to long form (stacking drops NaN cells), then total the values per
# (store, month, group). The stacked value column is named 0 at this point.
algo3_baseline_data = (
    whole_partial_data_table_2
    .stack()
    .reset_index()
    .groupby(['STORECODE', 'MONTH', 'GRP'])[0]
    .sum()
    .reset_index()
)

In [None]:
# Give the summed column (named 0 after stacking) its submission name.
algo3_baseline_data = algo3_baseline_data.rename(columns={0: 'TOTALVALUE'})

In [None]:
# Baseline submission: every validation row keeps its place (left join), rows
# without a total get 0, and only the ID and total columns remain.
algo3_baselined_submission = (
    validation_data
    .merge(algo3_baseline_data, how='left', on=['STORECODE', 'MONTH', 'GRP'])
    .fillna(0)
    .drop(columns=['STORECODE', 'MONTH', 'GRP'])
)
algo3_baselined_submission.columns = ['ID', 'TOTALVALUE']
# The integer cast discards the fractional part of each total.
algo3_baselined_submission['TOTALVALUE'] = (
    algo3_baselined_submission['TOTALVALUE'].astype(int)
)

In [None]:
# Write the baseline submission without the index column.
algo3_baselined_submission.to_csv(path_or_buf='algo3_baseline_check.csv', index=False)

> Obtained Score - 2389.4291343469

## Improvement using Assumption 1

In [None]:
# Fraction of NaN cells in the intermediate wide table (before it was grouped
# into the baseline totals).
whole_partial_data_table_2.isna().values.sum() / whole_partial_data_table_2.size

In [None]:
# Same wide table, re-indexed as (STORECODE, MONTH, DAY) so that one
# store-month can be selected as a block of days.
whole_partial_data_table_3 = (
    whole_partial_data_table_2
    .reset_index()
    .set_index(['STORECODE', 'MONTH', 'DAY'])
)

In [None]:
# Fraction of NaN cells in the re-indexed table before the second imputation.
whole_partial_data_table_3.isna().values.sum() / whole_partial_data_table_3.size

In [None]:
# with pd.option_context('max_rows',31):
#     whole_partial_data_table_3.loc[('N2', 'M1')]

In [None]:
# for i, x in whole_partial_data_table_3.loc[('N2', 'M1')].iterrows():
#     if any([not np.isnan(i) for i in x.values]):
#         whole_partial_data_table_3.loc[('N2', 'M1',i),:] = [0 if np.isnan(i) else i for i in x.values] 
# with pd.option_context('max_rows',31):
#     whole_partial_data_table_3.loc[('N2', 'M1')]

In [None]:
# whole_partial_data_table_3.loc[('N2', 'M1')] = fast_knn(whole_partial_data_table_3.loc[('N2', 'M1')].values, k =30)
# with pd.option_context('max_rows',31):
#     whole_partial_data_table_3.loc[('N2', 'M1')]

## Imputation using fast knn

In [None]:
# Second imputation pass, one (month, store) block at a time:
#   1. a day with at least one recorded value gets its remaining NaN set to 0;
#   2. a day with no recorded value at all is left as NaN, and if the block has
#      any such day the whole block is passed through fast_knn.
for month in tqdm(['M1', 'M2', 'M3']):
    for store in tqdm([f'N{number}' for number in range(1, 11)]):
        has_empty_day = False
        for day, day_values in whole_partial_data_table_3.loc[(store, month)].iterrows():
            observed = ~np.isnan(day_values.values)
            if observed.any():
                # At least one purchase was recorded that day: missing -> 0.
                whole_partial_data_table_3.loc[(store, month, day), :] = np.where(
                    observed, day_values.values, 0
                )
            else:
                # Nothing recorded that day: keep the NaN row for imputation below.
                has_empty_day = True
        if has_empty_day:
            print(f"Imputing in ({month, store})")
            # Earlier alternative, kept for reference: fill with the mean of the
            # block after discarding values more than 3 std from the mean.
    #         df = whole_partial_data_table_3.loc[(store, month)].copy()
    #         whole_partial_data_table_3.loc[(store,month)].fillna(df[np.abs(df-df.mean()) <= (3*df.std())].mean(), inplace=True)
            # KNN imputation over the block's day-by-GRP matrix.
            whole_partial_data_table_3.loc[(store, month)] = fast_knn(
                whole_partial_data_table_3.loc[(store, month)].values, k=30
            )

In [None]:
# Fraction of NaN cells left in the re-indexed table after the second imputation.
whole_partial_data_table_3.isna().values.sum() / whole_partial_data_table_3.size

In [None]:
# Show the first five rows of the imputed table.
whole_partial_data_table_3.iloc[:5]

In [None]:
# Long form of the imputed table: one row per (store, month, day, group) with
# its VALUE. Stacking drops any cell that is still NaN.
whole_complete_data_2 = (
    whole_partial_data_table_3
    .stack()
    .to_frame()
    .reset_index()
    .rename(columns={0: 'VALUE'})
)
whole_complete_data_2.iloc[:5]

In [None]:
# Bring the validation ID back onto the imputed values. Both tables carry a
# VALUE column, so the merge suffixes them; the original (partly NaN) VALUE_x
# is discarded and the imputed VALUE_y is kept.
whole_complete_data_with_id = (
    whole_partial_data
    .merge(whole_complete_data_2, how='inner', on=['STORECODE', 'MONTH', 'DAY', 'GRP'])
    .drop(columns='VALUE_x')
)

In [None]:
# Show the first five rows of the ID-tagged imputed data.
whole_complete_data_with_id.iloc[:5]

In [None]:
# Final submission: total imputed value per validation ID.
id_totals = whole_complete_data_with_id.groupby('ID')['VALUE_y'].sum()
# `whole_complete_data_with_id` comes from a stacked table (NaN cells dropped)
# joined with an inner merge, so an ID whose values were never imputed would be
# missing from `id_totals`. Re-index against every validation ID and give such
# IDs a total of 0, as the baseline submission does. Sorting keeps the same
# ascending-ID row order that the group-by produces.
submit = (
    id_totals
    .reindex(sorted(validation_data['ID'].unique()), fill_value=0)
    .rename_axis('ID')
    .reset_index()
)
submit.columns = ['ID', 'TOTALVALUE']
# The integer cast discards the fractional part of each total.
submit['TOTALVALUE'] = submit['TOTALVALUE'].astype('int')
submit.head()

In [None]:
# Write the final submission without the index column.
submit.to_csv(path_or_buf='final_submission.csv', index=False)

> Obtained Score - 2359.0151807577