## Importing libraries

In [3]:
# Import analysis and visualization libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

## Importing data

In [5]:
# Project root folder.
# Set the INSTACART_PROJECT_DIR environment variable to run the notebook on
# another machine; when it is not set, the original local folder is used.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\Oksana Stepanova\OneDrive\Документи\Instacart Basket Analysis',
)

In [6]:
# Load the merged Instacart dataframe from the prepared-data pickle
merged_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'instacart_merged.pkl')
df_instacart = pd.read_pickle(merged_pickle)

In [7]:
# Preview the first rows to confirm the dataframe was loaded
df_instacart.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,spender_flag,median_days_prior_order,order_frequency_flag,gender,states,age,date_joined,number_dependants,family_status,income
0,2539329,1,1,2,8,,True,196,1,0,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,False,196,1,1,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,False,196,1,1,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,False,196,1,1,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,False,196,1,1,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423


In [8]:
# Show the dimensions (rows, columns) of the merged dataframe
df_instacart.shape

(32399732, 31)

In [9]:
# Check for the data type

In [10]:
# List the dtype of every column
df_instacart.dtypes

order_id                     int32
user_id                      int32
order_number                 int16
orders_day_of_week            int8
order_hour_of_day             int8
days_since_prior_order     float64
first_order                   bool
product_id                   int32
add_to_cart_order            int16
reordered                     int8
product_name                object
aisle_id                     int16
department_id                 int8
prices                     float64
price_range_loc             object
busiest_day                 object
busiest_slowest_days        object
busiest_period_of_day       object
max_order                    int16
loyalty_flag                object
average_spending           float64
spender_flag                object
median_days_prior_order    float64
order_frequency_flag        object
gender                      object
states                      object
age                           int8
date_joined                 object
number_dependants   

### Security implications. In the Instacart dataframe, customers' first and last names are personally identifiable information (PII). The first and last name columns were removed in the previous exercise to reduce file size, since this data is not used in our analysis. Another important reason to omit these columns is to prevent the disclosure of personal information. Apart from the first and last names, the dataframe contains no other data that could be considered PII.

## Step 3. Customers' spending habits by geographic areas

In [15]:
# Count the rows per value in the 'states' column, keeping missing values in the tally
df_instacart['states'].value_counts(dropna=False)

states
Pennsylvania            667007
California              659695
Rhode Island            656777
Georgia                 656249
New Mexico              654400
Arizona                 653864
North Carolina          651790
Oklahoma                651661
Alaska                  648451
Minnesota               647738
Massachusetts           646275
Wyoming                 644191
Virginia                641280
Missouri                640576
Texas                   640285
Colorado                639173
Maine                   638479
North Dakota            638391
Alabama                 637863
Kansas                  637418
Louisiana               637414
Delaware                636906
South Carolina          636677
Oregon                  636332
Arkansas                636070
Nevada                  636034
New York                635912
Montana                 635181
South Dakota            633649
Illinois                632928
Hawaii                  632786
Washington              632722
M

In [17]:
# States assigned to the Northeast region
northeast = [
    'Maine',
    'New Hampshire',
    'Vermont',
    'Massachusetts',
    'Rhode Island',
    'Connecticut',
    'New York',
    'Pennsylvania',
    'New Jersey',
]

In [18]:
# States assigned to the Midwest region
midwest = [
    'Wisconsin',
    'Michigan',
    'Illinois',
    'Indiana',
    'Ohio',
    'North Dakota',
    'South Dakota',
    'Nebraska',
    'Kansas',
    'Minnesota',
    'Iowa',
    'Missouri',
]

In [19]:
# States (plus the District of Columbia) assigned to the South region
south = [
    'Delaware',
    'Maryland',
    'District of Columbia',
    'Virginia',
    'West Virginia',
    'North Carolina',
    'South Carolina',
    'Georgia',
    'Florida',
    'Kentucky',
    'Tennessee',
    'Mississippi',
    'Alabama',
    'Oklahoma',
    'Texas',
    'Arkansas',
    'Louisiana',
]

In [20]:
# States assigned to the West region
west = [
    'Idaho',
    'Montana',
    'Wyoming',
    'Nevada',
    'Utah',
    'Colorado',
    'Arizona',
    'New Mexico',
    'Alaska',
    'Washington',
    'Oregon',
    'California',
    'Hawaii',
]

In [22]:
# Label the rows whose state is in the Northeast list
# (the assignment creates the 'regions' column if it does not exist yet)
in_northeast = df_instacart['states'].isin(northeast)
df_instacart.loc[in_northeast, 'regions'] = 'Northeast'

In [23]:
# Label the rows whose state is in the Midwest list
in_midwest = df_instacart['states'].isin(midwest)
df_instacart.loc[in_midwest, 'regions'] = 'Midwest'

In [25]:
# Label the rows whose state is in the South list
in_south = df_instacart['states'].isin(south)
df_instacart.loc[in_south, 'regions'] = 'South'

In [26]:
# Label the rows whose state is in the West list
in_west = df_instacart['states'].isin(west)
df_instacart.loc[in_west, 'regions'] = 'West'

In [27]:
# Count the rows per region; dropna=False also shows rows left without a region (NaN)
df_instacart['regions'].value_counts(dropna=False)

regions
South        10790096
West          8291679
Midwest       7596065
Northeast     5721892
Name: count, dtype: int64

In [28]:
# Re-check the dimensions after the 'regions' column was added
df_instacart.shape

(32399732, 32)

In [30]:
# Count the rows per value in the 'spender_flag' column, keeping missing values in the tally
df_instacart['spender_flag'].value_counts(dropna=False)

spender_flag
Low spender     32280013
High spender      119719
Name: count, dtype: int64

In [34]:
# Cross-tabulate the spender flag (rows) against the region (columns)
spender_by_region = pd.crosstab(
    index=df_instacart['spender_flag'],
    columns=df_instacart['regions'],
    dropna=False,
)

In [35]:
# Display the spender-flag by region table
spender_by_region

regions,Midwest,Northeast,South,West
spender_flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
High spender,29265,18639,40577,31238
Low spender,7566800,5703253,10749519,8260441


In [38]:
# Cross-tabulate the region (rows) against the spender flag (columns),
# ordered from the most to the fewest 'High spender' rows
spender_by_region2 = (
    pd.crosstab(
        index=df_instacart['regions'],
        columns=df_instacart['spender_flag'],
        dropna=False,
    )
    .sort_values(by='High spender', ascending=False)
)

In [39]:
# Display the region by spender-flag table (sorted by 'High spender')
spender_by_region2

spender_flag,High spender,Low spender
regions,Unnamed: 1_level_1,Unnamed: 2_level_1
South,40577,10749519
West,31238,8260441
Midwest,29265,7566800
Northeast,18639,5703253


### Observations: More than 99% of the records belong to customers flagged as ‘low spenders’, which means that the average price of the products in their orders is lower than 10. This pattern holds both overall and within each of the four regions.

In [42]:
# Count the rows per value in the 'price_range_loc' column, keeping missing values in the tally
df_instacart['price_range_loc'].value_counts(dropna=False)

price_range_loc
Mid range product     21860860
Low range product     10126321
High range product      412551
Name: count, dtype: int64

In [43]:
# Cross-tabulate the region (rows) against the price range (columns),
# ordered from the most to the fewest 'High range product' rows
price_range_by_region = (
    pd.crosstab(
        index=df_instacart['regions'],
        columns=df_instacart['price_range_loc'],
        dropna=False,
    )
    .sort_values(by='High range product', ascending=False)
)

In [44]:
# Display the region by price-range table (sorted by 'High range product')
price_range_by_region

price_range_loc,High range product,Low range product,Mid range product
regions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
South,137601,3368338,7284157
West,105934,2592736,5593009
Midwest,96658,2372304,5127103
Northeast,72358,1792943,3856591


### Observations: About 67% of the purchased items are mid-range products, whose price per item is above 5 but lower than or equal to 15. About 31% are low-range products, with prices lower than or equal to 5. High-range products, priced above 15, are bought far less often (about 1% of the purchased items).

## Step 4. Creating an exclusion flag for low-activity customers 

In [47]:
# Count the rows per 'max_order' value, keeping missing values in the tally
df_instacart['max_order'].value_counts(dropna=False)

max_order
99    1171076
8      811706
6      811237
9      810057
7      803838
       ...   
97      44949
98      44585
96      40449
2           6
1           5
Name: count, Length: 99, dtype: int64

In [48]:
# Label the rows whose 'max_order' is below 5 as 'Low activity'
# (the assignment creates the 'activity_flag' column if it does not exist yet)
below_five_orders = df_instacart['max_order'].lt(5)
df_instacart.loc[below_five_orders, 'activity_flag'] = 'Low activity'

In [49]:
# Label the rows whose 'max_order' is 5 or more as 'High activity'
five_or_more_orders = df_instacart['max_order'].ge(5)
df_instacart.loc[five_or_more_orders, 'activity_flag'] = 'High activity'

In [50]:
# Count the rows per activity label; dropna=False also shows rows left unlabelled (NaN)
df_instacart['activity_flag'].value_counts(dropna=False)

activity_flag
High activity    30959687
Low activity      1440045
Name: count, dtype: int64

In [51]:
# Re-check the dimensions after the 'activity_flag' column was added
df_instacart.shape

(32399732, 33)

In [52]:
# Keep only the rows flagged 'Low activity'
is_low_activity = df_instacart['activity_flag'] == 'Low activity'
low_activity_customers = df_instacart.loc[is_low_activity]

In [53]:
# Show the dimensions (rows, columns) of the low-activity subset
low_activity_customers.shape

(1440045, 33)

In [56]:
# Summary statistics of 'max_order' in the low-activity subset (the maximum should stay below 5)
low_activity_customers['max_order'].describe()

count    1.440045e+06
mean     3.523177e+00
std      4.994919e-01
min      1.000000e+00
25%      3.000000e+00
50%      4.000000e+00
75%      4.000000e+00
max      4.000000e+00
Name: max_order, dtype: float64

In [57]:
# Keep only the rows flagged 'High activity'
is_high_activity = df_instacart['activity_flag'] == 'High activity'
high_activity_customers = df_instacart.loc[is_high_activity]

In [58]:
# Show the dimensions (rows, columns) of the high-activity subset
high_activity_customers.shape

(30959687, 33)

In [59]:
# Summary statistics of 'max_order' in the high-activity subset (the minimum should be at least 5)
high_activity_customers['max_order'].describe()

count    3.095969e+07
mean     3.442621e+01
std      2.489502e+01
min      5.000000e+00
25%      1.400000e+01
50%      2.800000e+01
75%      4.800000e+01
max      9.900000e+01
Name: max_order, dtype: float64

## Step 5. Customers profiling

In [62]:
# Summary statistics of the 'age' column
df_instacart['age'].describe()

count    3.239973e+07
mean     4.946528e+01
std      1.848558e+01
min      1.800000e+01
25%      3.300000e+01
50%      4.900000e+01
75%      6.500000e+01
max      8.100000e+01
Name: age, dtype: float64

In [63]:
# Summary statistics of the 'income' column
df_instacart['income'].describe()

count    3.239973e+07
mean     9.941685e+04
std      4.300669e+04
min      2.590300e+04
25%      6.699600e+04
50%      9.660800e+04
75%      1.278840e+05
max      5.939010e+05
Name: income, dtype: float64

In [64]:
# Count the rows per value in the 'number_dependants' column, keeping missing values in the tally
df_instacart['number_dependants'].value_counts(dropna=False)

number_dependants
3    8133827
0    8096275
2    8089749
1    8079881
Name: count, dtype: int64

In [65]:
# Count the rows per value in the 'gender' column, keeping missing values in the tally
df_instacart['gender'].value_counts(dropna=False)

gender
Male      16312444
Female    16087288
Name: count, dtype: int64

In [66]:
# Count the rows per value in the 'family_status' column, keeping missing values in the tally
df_instacart['family_status'].value_counts(dropna=False)

family_status
married                             22753055
single                               5324920
divorced/widowed                     2771355
living with parents and siblings     1550402
Name: count, dtype: int64