# Introduction
[Add details. This is a new notebook template.]

# Set Up

## Authorize Google Drive
Follow the pop-up prompts to authorize Drive access. Authorization may fail in non-Chrome browsers, depending on ad-blocker and privacy settings.

In [None]:
# Colab-only module used to attach Google Drive to this runtime.
from google.colab import drive
# Mount Drive at /content/drive/ (prompts for authorization on first use).
# If Drive is already mounted, Colab prints a notice instead of remounting.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Library imports

In [None]:
#general analysis
import pandas as pd
import pprint as ppr
import re
import numpy as np

#file management
from pathlib import Path
from datetime import datetime

#stop words counter
#from collections import Counter

## Display Preferences

In [None]:
#current preferences, applied in one pass from a single table
display_prefs = {
    'display.max_rows': 100,                    #cap on rows shown before truncation
    'display.max_columns': None,                #None = never truncate columns
#    'display.max_colwidth': None,              #change column display width
#    'display.precision': 2,                    #displays 2 decimal places on all numbers
    'display.float_format': '{:.2f}'.format,    #render floats with 2 decimal places
    'display.memory_usage': 'deep',             #deep memory accounting in df.info()
}
for option_name, option_value in display_prefs.items():
    pd.set_option(option_name, option_value)

# File Handling
This section uses parameterized forms in Google Colab to simplify file selection.

It may require running the same cells multiple times depending on how much information is needed to select the intended file/directory.

In [None]:
#@title ## File Selection

#@markdown ---
#@markdown ### Select or enter each part of the filepath:
# project: name of the project folder; later joined under .../data_analysis/ to form project_dir.
project = "dunnhumby" #@param ["project_01", "project_02"] {allow-input: true}
# folders: sub-path inside the project folder that holds the input files (joined onto project_dir).
folders = "data/raw" #@param ["data/raw", "data/interim", "data/processed", "data/meta"] {allow-input: true}
#@markdown Check box if selecting 1 specific file.  Leave blank if selecting multiple files from a directory.
# single_file: True -> read exactly file_name + file_ext; False -> use the file_pattern glob instead.
single_file = True #@param {type:"boolean"}
#@markdown **Required**: Selects a specific file.  Also used to create name of outputs.
# file_name / file_ext: concatenated to build the single-file name (no separator is added between them).
file_name = "transactions_200701" #@param {type:"string"}
file_ext = ".csv" #@param [".csv", ".txt"] {allow-input: true}
#@markdown
#@markdown Use if selecting specific matching files from a directory.
# file_pattern: filename prefix; a trailing "*" is appended when globbing, so "" matches everything.
file_pattern = "transactions_" #@param [""]{allow-input: true}
#@markdown ---

In [None]:
#main project path
#anchored to the absolute Drive mount point (/content/drive) rather than the
#current working directory, so the path stays valid even if the working
#directory has been changed (e.g. via %cd or os.chdir) before this cell runs
project_dir = Path("/content/drive", "MyDrive", "data_analysis", project)
project_dir

PosixPath('/content/drive/MyDrive/data_analysis/dunnhumby')

In [None]:
#directory holding the input files: <project_dir>/<folders>
input_dir = project_dir / folders
input_dir

PosixPath('/content/drive/MyDrive/data_analysis/dunnhumby/data/raw')

In [None]:
#directory for files produced by this notebook: <project_dir>/notebooks/eda
output_dir = project_dir / "notebooks" / "eda"
output_dir

PosixPath('/content/drive/MyDrive/data_analysis/dunnhumby/notebooks/eda')

In [None]:
#timestamp (naive local time) captured once, for use as a unique marker on new files
today = datetime.now()

### Show files in the selected path
This can be adjusted using a glob pattern. Leaving the pattern blank defaults to `*` and returns all items within the directory.

In [None]:
#show files in selected path
#a blank file_pattern reduces the glob to '*', which returns all files
#sorted because Path.glob yields matches in no particular order
sorted(input_dir.glob(file_pattern+"*"))

[PosixPath('/content/drive/MyDrive/data_analysis/dunnhumby/data/raw/transactions_200607.csv'),
 PosixPath('/content/drive/MyDrive/data_analysis/dunnhumby/data/raw/transactions_200608.csv'),
 PosixPath('/content/drive/MyDrive/data_analysis/dunnhumby/data/raw/transactions_200609.csv'),
 PosixPath('/content/drive/MyDrive/data_analysis/dunnhumby/data/raw/transactions_200610.csv'),
 PosixPath('/content/drive/MyDrive/data_analysis/dunnhumby/data/raw/transactions_200611.csv'),
 PosixPath('/content/drive/MyDrive/data_analysis/dunnhumby/data/raw/transactions_200612.csv'),
 PosixPath('/content/drive/MyDrive/data_analysis/dunnhumby/data/raw/transactions_200613.csv'),
 PosixPath('/content/drive/MyDrive/data_analysis/dunnhumby/data/raw/transactions_200614.csv'),
 PosixPath('/content/drive/MyDrive/data_analysis/dunnhumby/data/raw/transactions_200615.csv'),
 PosixPath('/content/drive/MyDrive/data_analysis/dunnhumby/data/raw/transactions_200616.csv'),
 PosixPath('/content/drive/MyDrive/data_analysis/d

### Create final read-in path
This conditional statement selects the correct path based on the `single_file` checkbox in the form.

In [None]:
#select file path for a single file or directory
if single_file:
    #one specific file: <input_dir>/<file_name><file_ext>
    file_path = input_dir.joinpath(file_name+file_ext)
else:
    #every entry in input_dir whose name starts with file_pattern.
    #Path.glob returns a one-shot generator in arbitrary order; store a sorted
    #list instead so the value can be displayed below and iterated more than once
    file_path = sorted(input_dir.glob(file_pattern+"*"))
file_path

PosixPath('/content/drive/MyDrive/data_analysis/dunnhumby/data/raw/transactions_200701.csv')

### Read into pandas dataframe

In [None]:
#optional pd.read_csv keyword arguments; uncomment/edit the ones needed
#(in multi-file mode they are applied to every file, e.g. nrows is per file)
read_csv_kwargs = {
#    "usecols": cols,
#    "sep": "\t",
#    "nrows": 100,
#    "engine": "python",
#    "encoding": "ISO-8859-1",
}

if isinstance(file_path, (str, Path)):
    #single-file mode: file_path points at one file
    df = pd.read_csv(file_path, **read_csv_kwargs)
else:
    #multi-file mode: file_path is an iterable of paths (list or glob generator),
    #which pd.read_csv cannot read directly, so read each file and stack the rows
    csv_paths = sorted(file_path)
    if not csv_paths:
        raise FileNotFoundError(
            "No files to read: file_path is empty. Check file_pattern, or re-run "
            "the path selection cell if file_path was an already-consumed glob."
        )
    df = pd.concat(
        (pd.read_csv(csv_path, **read_csv_kwargs) for csv_path in csv_paths),
        ignore_index=True,  #continuous 0..n-1 index across all files
    )

# DataFrame Overview

## Row and Column Count

In [None]:
# (row count, column count) of the loaded DataFrame
df.shape

(277100, 22)

## `.info()`

In [None]:
# Per-column non-null counts and dtypes. The memory figure should follow the
# 'display.memory_usage' option set to 'deep' in Display Preferences.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277100 entries, 0 to 277099
Data columns (total 22 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   SHOP_WEEK                 277100 non-null  int64  
 1   SHOP_DATE                 277100 non-null  int64  
 2   SHOP_WEEKDAY              277100 non-null  int64  
 3   SHOP_HOUR                 277100 non-null  int64  
 4   QUANTITY                  277100 non-null  int64  
 5   SPEND                     277100 non-null  float64
 6   PROD_CODE                 277100 non-null  object 
 7   PROD_CODE_10              277100 non-null  object 
 8   PROD_CODE_20              277100 non-null  object 
 9   PROD_CODE_30              277100 non-null  object 
 10  PROD_CODE_40              277100 non-null  object 
 11  CUST_CODE                 226383 non-null  object 
 12  CUST_PRICE_SENSITIVITY    226383 non-null  object 
 13  CUST_LIFESTAGE            198732 non-null  o

In [None]:
# preview the first rows (head() defaults to 5)
df.head()

Unnamed: 0,SHOP_WEEK,SHOP_DATE,SHOP_WEEKDAY,SHOP_HOUR,QUANTITY,SPEND,PROD_CODE,PROD_CODE_10,PROD_CODE_20,PROD_CODE_30,PROD_CODE_40,CUST_CODE,CUST_PRICE_SENSITIVITY,CUST_LIFESTAGE,BASKET_ID,BASKET_SIZE,BASKET_PRICE_SENSITIVITY,BASKET_TYPE,BASKET_DOMINANT_MISSION,STORE_CODE,STORE_FORMAT,STORE_REGION
0,200701,20070304,1,17,1,1.13,PRD0900013,CL00015,DEP00004,G00003,D00001,CUST0000361701,MM,YA,994104700394636,L,MM,Full Shop,Mixed,STORE00001,LS,E02
1,200701,20070301,5,19,1,1.1,PRD0900015,CL00015,DEP00004,G00003,D00001,CUST0000871730,UM,,994104700728646,S,MM,Small Shop,Fresh,STORE00001,LS,E02
2,200701,20070303,7,12,1,1.0,PRD0900015,CL00015,DEP00004,G00003,D00001,CUST0000949903,MM,PE,994104700780122,M,UM,Top Up,Fresh,STORE00001,LS,E02
3,200701,20070303,7,15,3,4.68,PRD0900049,CL00160,DEP00054,G00016,D00003,CUST0000644893,LA,PE,994104700579780,L,LA,Top Up,Mixed,STORE00001,LS,E02
4,200701,20070302,6,14,1,1.04,PRD0900055,CL00230,DEP00081,G00027,D00008,CUST0000926111,UM,OT,994104700764453,L,UM,Top Up,Fresh,STORE00001,LS,E02
5,200701,20070304,1,14,1,1.6,PRD0900062,CL00175,DEP00059,G00017,D00004,CUST0000605487,LA,YF,994104700553719,M,MM,Top Up,Mixed,STORE00001,LS,E02
6,200701,20070301,5,21,1,2.36,PRD0900071,CL00086,DEP00024,G00007,D00002,CUST0000666576,MM,YA,994104700593739,L,MM,Full Shop,Mixed,STORE00001,LS,E02
7,200701,20070304,1,12,1,1.05,PRD0900077,CL00150,DEP00052,G00015,D00003,CUST0000710863,LA,YF,994104700622917,L,MM,Full Shop,Mixed,STORE00001,LS,E02
8,200701,20070304,1,12,1,1.05,PRD0900077,CL00150,DEP00052,G00015,D00003,CUST0000795333,MM,,994104700678351,L,MM,Full Shop,Mixed,STORE00001,LS,E02
9,200701,20070304,1,12,3,3.72,PRD0900086,CL00067,DEP00019,G00007,D00002,CUST0000710863,LA,YF,994104700622917,L,MM,Full Shop,Mixed,STORE00001,LS,E02
