# 4.9: Visualization Part 1

## Contents:
1. Importing data
2. Wrangling data
3. Checking data
4. Combining data
5. Exporting data

### 1. Importing data

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [3]:
# creating path for data import
# The project root can be overridden per machine through the
# INSTACART_PROJECT_PATH environment variable; when that variable is not set,
# the original local folder is used, so existing runs behave exactly as before.

path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\scott\Desktop\Data Analysis\CF Immersive\Python\Jan-2022 Instacart Basket Analysis',
)

In [4]:
# reading the customers CSV from the 'Original Data' folder into df_cust

cust_csv_file = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
df_cust = pd.read_csv(cust_csv_file)

In [5]:
# reading the previously prepared orders/products pickle into df_ords_prods

ords_prods_pkl_file = os.path.join(
    path, '02 Data', 'Prepared Data', 'orders_products_agg_Jan_20_2022.pkl'
)
df_ords_prods = pd.read_pickle(ords_prods_pkl_file)

### 2. Wrangling data

In [5]:
# checking dimensions
# .shape returns a (number of rows, number of columns) tuple for df_cust

df_cust.shape

(206209, 10)

In [6]:
# checking data types
# .dtypes lists the pandas dtype of each df_cust column; text columns show up as 'object'

df_cust.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [5]:
# previewing the first five rows of the customers dataframe

df_cust.head(n = 5)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [6]:
# dropping names of customers
# ('Surnam' is spelled the way the column appears in the loaded data)
# errors = 'ignore' keeps this cell re-runnable: once the columns are gone,
# running the cell again is a no-op instead of raising a KeyError

df_cust = df_cust.drop(columns = ['First Name', 'Surnam'], errors = 'ignore')

In [9]:
# previewing the first five rows to confirm the name columns are gone

df_cust.head(n = 5)

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


### 3. Checking data

In [10]:
# checking numeric statistics
# .describe() summarises the numeric columns (count, mean, std, min, quartiles, max),
# which makes implausible minimum/maximum values easy to spot

df_cust.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [14]:
# counting how often each Gender value occurs (missing values are counted too)

df_cust.loc[:, 'Gender'].value_counts(dropna = False)

Male      104067
Female    102142
Name: Gender, dtype: int64

In [15]:
# counting how often each STATE value occurs (missing values are counted too)

df_cust.loc[:, 'STATE'].value_counts(dropna = False)

Florida                 4044
Colorado                4044
Illinois                4044
Alabama                 4044
District of Columbia    4044
Hawaii                  4044
Arizona                 4044
Connecticut             4044
California              4044
Indiana                 4044
Arkansas                4044
Alaska                  4044
Delaware                4044
Iowa                    4044
Idaho                   4044
Georgia                 4044
Wyoming                 4043
Mississippi             4043
Oklahoma                4043
Utah                    4043
New Hampshire           4043
Kentucky                4043
Maryland                4043
Rhode Island            4043
Massachusetts           4043
Michigan                4043
New Jersey              4043
Kansas                  4043
South Dakota            4043
Minnesota               4043
Tennessee               4043
New York                4043
Washington              4043
Louisiana               4043
Montana       

In [16]:
# counting how often each fam_status value occurs (missing values are counted too)

df_cust.loc[:, 'fam_status'].value_counts(dropna = False)

married                             144906
single                               33962
divorced/widowed                     17640
living with parents and siblings      9701
Name: fam_status, dtype: int64

In [17]:
# counting missing (NaN/None) entries per column

df_cust.isna().sum()

user_id         0
Gender          0
STATE           0
Age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64

#### No missing values

In [18]:
# checking for mixed-type data
# For every column, take the Python type of each value and print the column
# name when more than one distinct type is present.
# Series.map is used instead of DataFrame.applymap, which is deprecated in
# recent pandas releases; counting distinct types also avoids indexing the
# first row, so an empty dataframe no longer raises an IndexError.

for col in df_cust.columns.tolist():
    n_types = df_cust[col].map(type).nunique()
    if n_types > 1:
        print (col)

#### No mixed-type data

In [19]:
# flagging rows that exactly repeat an earlier row, then keeping only those rows

is_dup_row = df_cust.duplicated()
df_dups = df_cust.loc[is_dup_row]
df_dups

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,n_dependants,fam_status,income


#### No duplicates

### 4. Combining data

In [7]:
# dropping previous indicator columns
# errors = 'ignore' keeps this cell re-runnable: if '_merge' and 'exists'
# were already removed by an earlier run, the cell is a no-op instead of
# raising a KeyError


df_ords_prods = df_ords_prods.drop(columns = ['_merge', 'exists'], errors = 'ignore')

In [8]:
# merging df_cust with df_ords_prods on 'user_id'
# how = 'inner' spells out the default join type that was already in use;
# an inner join keeps only rows whose user_id is found in both frames, so the
# 'cust_merge' indicator can only ever read 'both' for the rows that remain.
# validate = 'many_to_one' makes pandas raise a MergeError if user_id is
# duplicated in df_cust, which would otherwise silently multiply order rows.

df_ords_prods_cust = df_ords_prods.merge(
    df_cust,
    how = 'inner',
    on = 'user_id',
    validate = 'many_to_one',
    indicator = 'cust_merge',
)

In [9]:
# counting the merge-indicator values written to 'cust_merge' (missing values are counted too)

df_ords_prods_cust.loc[:, 'cust_merge'].value_counts(dropna = False)

both          32404859
left_only            0
right_only           0
Name: cust_merge, dtype: int64

### 5. Exporting data

In [10]:
# writing the merged dataframe to the 'Prepared Data' folder as a pickle (.pkl) because of its large size

export_pkl_file = os.path.join(
    path, '02 Data', 'Prepared Data', 'orders_products_cust_Jan_21_2022.pkl'
)
df_ords_prods_cust.to_pickle(export_pkl_file)