In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

# Importing Data

In [2]:
# Path Creation
path=r'C:\Users\Drew\Instacart Basket Analysis'

In [3]:
# Orders Data
df_ords=pd.read_csv(os.path.join(path,'02 Data','Original Data','orders.csv'),index_col=False)

In [4]:
# Products Data
df_prods=pd.read_csv(os.path.join(path,'02 Data','Original Data','products.csv'),index_col=False)

In [5]:
# Departments Data
df_dep=pd.read_csv(os.path.join(path,'02 Data','Original Data','departments.csv'),index_col=False)

# Data Consistency Checks

In [6]:
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


# Mixed-Type Data

In [7]:
# mixed-type column is a column that includes both string values and numeric values

In [8]:
# Create a dataframe
df_test=pd.DataFrame()

In [9]:
# Create a mixed type column
df_test['mix']=['a','b',1,True]

In [10]:
# The first command, df_test = pd.DataFrame(), creates a new dataframe called df_test. 
# The second command, df_test['mix'] = ['a', 'b', 1, True], creates a new column, mix, within df_test and fills it with numeric, string, and boolean values.

In [11]:
# Newly created test dataframe 
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [12]:
# function for checking whether a dataframe contains any mixed-type columns is as follows:
for col in df_test.columns.tolist():
  weird = (df_test[[col]].map(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

mix


In [13]:
# How to fix mix-type data.
# first step is deciding what single data type the column in question should be. 
#If your column contained mostly names, for instance, it should be a string. 
# If it contained mostly order numbers, it should be a numeric
df_test['mix']=df_test['mix'].astype('str')

In [14]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   mix     4 non-null      object
dtypes: object(1)
memory usage: 164.0+ bytes


In [15]:
df_test['mix'].dtype

dtype('O')

# Missing Values

In [16]:
# missing values can occur for two reasons: 1) data corruption, or 2) they were never recorded in the first place.

In [17]:
# Finding Missing Values
# isnull() function is used to find missing observations, with “observations” here referring to entries in your dataframe.
# use the isnull() function by itself, it would return a value of True or False, which, by itself, isn’t very helpful. 
# You need to know how many total missing observations there are, which is where the sum() function comes in.
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [18]:
# Creating Subset to acutally view the missing values instead of just counting how many
# setting up the new dataframe to contain only those missing value using isnull()=True
df_prods_nan=df_prods[df_prods['product_name'].isnull()==True]

In [19]:
df_prods_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


## Addressing Missing Values

In [20]:
# There are a few ways to deal with missing data:
# 1.Create a new variable that acts like a flag based on the missing value.
# 2.Impute the value with the mean or median of the column (if the variable is numeric).
# 3.Remove or filter out the missing data.

In [21]:
# Template examples
# df['column with missings'].fillna(mean value, inplace=True) --for mean
# df['column with missings'].fillna(median value, inplace=True) --for median

In [22]:
# seeing original data row and column information before removing NAN values
df_prods.shape

(49693, 5)

In [23]:
# removing missing value rows by creating new dataframe subset
df_prods_clean=df_prods[df_prods['product_name'].isnull()==False]

In [24]:
# checking new number of rows and columns to compare to original dataframe
df_prods_clean.shape

(49677, 5)

In [25]:
# method 2 that drops all missising values from the original dataframe
df_prods.dropna(inplace=True)

In [26]:
# method 3 that drops only the NaNs from particular column
df_prods.dropna(subset=['product_name'],inplace=True)

# Duplicates

In [27]:
# Finding Duplicates
# creates a new subset of df_prods_clean—df_dups—containing only rows that are duplicates. The duplicated() function is what identifies duplicate rows. 
df_dups=df_prods_clean[df_prods_clean.duplicated()]

In [28]:
# showing found duplicates of the created subset above
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [29]:
# Addressing Duplicates
# Duplicates have been found and need to be deleted. 
# df.drop_duplciates

In [30]:
# Checking number of rows and columns before removing duplicates
df_prods_clean.shape

(49677, 5)

In [31]:
# Creating new dataframe that does not include duplciates
df_prods_clean_no_dups=df_prods_clean.drop_duplicates()

In [32]:
# Confirming duplciates deleted by comparing new rows and columns to previously found (49677,5) of original data df_prods_clean.
df_prods_clean_no_dups.shape

(49672, 5)

In [33]:
df_prods_clean_no_dups.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


# Exporting Dataframe

In [34]:
# Exporting df_prods_clean_no_dups dataframe
df_prods_clean_no_dups.to_csv(os.path.join(path,'02 Data','Prepared Data','products_checked.csv'))

# Task 4.5

## 2.Run the df.describe() function on your df_ords dataframe.Using your new knowledge about how to interpret the output of this function, share in a markdown cell whether anything about the data looks off or should be investigated further.

In [35]:
# Running statistical overview of columns
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


## days_since_prior_order: Shows a value of "0" and a lower count which could be indicating same day orders or possibly missing data and should be worth investigating further to see if "0" values are viable information or improper data. With 30 being the max, shows a max cap of prior order tracking. With a mean of 11.11 and a standard deviation of 9.21, it shows wide variation between the times of consecutive orders.
## order_number: Has a range from 1 to 100 shows customer have placed up to 100 orders. Based on the mean of 17.15, median of 1.1 and standard deviation of 17.73, shows a wide distribution of orders per customer. Also, more investigation into the high standard deviation could be userful to investigate reasons, such as due to user segments or other influencing factors.
## order_hour_of_day: Shows a value range of 0 to 23 to represent the hours in a single day for when orders are placed. The mean shows orders are placed in the afternoon around 13.45. The stanadard deviation of 4.23 shows orders are evenly distributed throughout the day, but thanks to the mean we can see order tend to show favor for the afternoon hours. Would be worth investigating peak order hours to better undertand customers and find potential peak hours or slow downs.
## order_dow: The range shows to be within expected 0 to 6. with a mean of 2.78 and a standard deviation of 2.05, it shows orders tend to revolve around a Tuesday or Wednesday of the week. Similar to order_hour_of_day, more investigation could benefit to help better understand customer behavior.

# 3.Check for mixed-type data in your df_ords dataframe.

In [36]:
# Running check to find any columns with mixed data types
for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].map(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)

In [37]:
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   eval_set                object 
 3   order_number            int64  
 4   order_dow               int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 182.7+ MB


In [38]:
df_ords.dtypes

order_id                    int64
user_id                     int64
eval_set                   object
order_number                int64
order_dow                   int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

# 4.If you find mixed-type data, fix it. The column in question should contain observations of a single data type.

In [39]:
# No mixed data types found

# 5.Run a check for missing values in your df_ords dataframe..

In [40]:
# Running check to see which column name shows to have missing values.
df_ords.isnull().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

## Answer: days_since_prior_order shows to have 206209 missing values

# 6.Address the missing values using an appropriate method.

In [41]:
# Creating subset of missing values in days_since_prior_order column
df_ords_nan=df_ords[df_ords['days_since_prior_order'].isnull()==True]

In [42]:
# Running created subset to show those rows with missing value in the days_since_prior_order column
df_ords_nan

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
11,2168274,2,prior,1,2,11,
26,1374495,3,prior,1,1,14,
39,3343014,4,prior,1,6,11,
45,2717275,5,prior,1,3,12,
...,...,...,...,...,...,...,...
3420930,969311,206205,prior,1,4,12,
3420934,3189322,206206,prior,1,3,18,
3421002,2166133,206207,prior,1,6,19,
3421019,2227043,206208,prior,1,1,15,


In [43]:
# Using df_ords_first as subset to see if all the total number of order_number with 1 matches the missing value of df_ords_nan 
# This is to confirm these are all first orders which would be the reason for missing values in days_since_prior_order.
df_ords_first=df_ords_nan[df_ords_nan['order_number']==1]

In [44]:
df_ords_first

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
11,2168274,2,prior,1,2,11,
26,1374495,3,prior,1,1,14,
39,3343014,4,prior,1,6,11,
45,2717275,5,prior,1,3,12,
...,...,...,...,...,...,...,...
3420930,969311,206205,prior,1,4,12,
3420934,3189322,206206,prior,1,3,18,
3421002,2166133,206207,prior,1,6,19,
3421019,2227043,206208,prior,1,1,15,


## Answer: Both df_ords_nan and df_ords_first match 206209 rows leading my belief that missing values are due no prior orders and are fine to leave in dataframe.

# 7.Run a check for duplicate values in your df_ords data.

In [45]:
# Creating subset to find duplicate data
df_ords_dups=df_ords[df_ords.duplicated()]

In [46]:
# Running created subset to see if any duplicates appear
df_ords_dups

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order


# 8.Address the duplicates using an appropriate method.

## No duplicates found in the created df_ords_dups dataframe. No duplicates needing to be addressed.

## 9. Export Dataframe to Prepared Data folder

In [47]:
# Exporting df_prods_clean_no_dups dataframe
df_prods_clean_no_dups.to_csv(os.path.join(path,'02 Data','Prepared Data','products_checked.csv'))

In [48]:
# Exporting cleaned df_ords datafame. 
df_ords.to_csv(os.path.join(path,'02 Data','Prepared Data','orders_cleaned.csv'))

In [50]:
df_ords_clean=pd.read_csv(os.path.join(path,'02 Data','Prepared Data','orders_cleaned.csv'),index_col=False)

In [52]:
df_ords_clean.shape

(3421083, 8)