## Notebook Content

In this notebook, we will clean the data collected from  [__Trader Joe's__](https://www.traderjoes.com/home)

## Import Libraries

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sqlite3 as sql

### Read Data

In [11]:
df = pd.read_csv('TraderJoes_df.csv')

### Explore Data

In [12]:
df.head()

Unnamed: 0.1,Unnamed: 0,URL,CALORY,TOTAL_FAT,SATURATED_FAT,TRANS_FAT,CHOLESTEROL,SODIUM,TOTAL_CARBOHYDRATE,DIETARY_FIBER,...,SUGARS_1G,SODUM,INCLUES,CALORIES_FROM_FAT,ADDED_SUGARSƗ,Ɨ_ONE__SERVING_ADDS_16G_OF_SUGAR_TO_YOUR_DIET_AND_REPRESENTS__32%_OF_THE_DAILY_VALUE_FOR_ADDED_SUGARS,SAT._FAT,TOTAL_CARB.,VIT._D,POTAS.
0,0,https://www.traderjoes.com/home/products/pdp/0...,160,5.0,0.0,0.0,20.0,370.0,24.0,0.0,...,,,,,,,,,,
1,0,https://www.traderjoes.com/home/products/pdp/0...,120,2.0,0.0,0.0,0.0,420.0,20.0,0.0,...,,,,,,,,,,
2,0,https://www.traderjoes.com/home/products/pdp/0...,70,4.5,2.5,0.0,10.0,600.0,8.0,0.0,...,,,,,,,,,,
3,0,https://www.traderjoes.com/home/products/pdp/0...,170,10.0,7.0,0.0,20.0,220.0,18.0,1.0,...,,,,,,,,,,
4,0,https://www.traderjoes.com/home/products/pdp/0...,120,8.0,4.5,0.0,25.0,150.0,11.0,0.0,...,,,,,,,,,,


In [13]:
df.shape

(415, 39)

In [14]:
# check about nulls 
df.isnull().sum().sort_values(ascending = False)

POTAS.                                                                                                   415
SUGARS_1G                                                                                                415
VIT._D                                                                                                   415
†_AS_PREPARED_WITH_UNSALTED_BUTTER_AND_SALT.                                                             415
Ɨ_ONE__SERVING_ADDS_16G_OF_SUGAR_TO_YOUR_DIET_AND_REPRESENTS__32%_OF_THE_DAILY_VALUE_FOR_ADDED_SUGARS    415
SODIM                                                                                                    414
SOLUBLE_FIBER                                                                                            414
INSOLUBLE_FIBER                                                                                          414
SODUM                                                                                                    414
INCLUES            

As we see, there is a lot of null values, so we must dealling with them 

#### Drop Columns

Due to the large numbers of null values in some columns, we will drop the columns with more than 400 null values

In [15]:
# Any column with > 400 null will be dropped
col_list_toDrop = ['†_AS_PREPARED_WITH_UNSALTED_BUTTER_AND_SALT.','POTAS.','VIT._D','SUGARS_1G',
                'Ɨ_ONE__SERVING_ADDS_16G_OF_SUGAR_TO_YOUR_DIET_AND_REPRESENTS__32%_OF_THE_DAILY_VALUE_FOR_ADDED_SUGARS',
                'INCLUES','SODIM','SOLUBLE_FIBER','INSOLUBLE_FIBER','SODUM','SAT._FAT',
                'TOTAL_CARB.','ADDED_SUGARSƗ','CALORIES','CALORIES_FROM_FAT',
                'VITAMIN_C','VITAMIN_A']

In [16]:
df.drop(col_list_toDrop, axis='columns', inplace=True)

In [17]:
#drop unnammed column 
df = df.loc[:,~df.columns.str.match("Unnamed:")]

### General Check About Data

In [18]:
df.head()

Unnamed: 0,URL,CALORY,TOTAL_FAT,SATURATED_FAT,TRANS_FAT,CHOLESTEROL,SODIUM,TOTAL_CARBOHYDRATE,DIETARY_FIBER,TOTAL_SUGARS,...,PROTEIN,VITAMIN_D,CALCIUM,IRON,POTASSIUM,FIBER,POLYUNSATURATED_FAT,MONOUNSATURATED_FAT,SUGARS,ADDED_SUGARS
0,https://www.traderjoes.com/home/products/pdp/0...,160,5.0,0.0,0.0,20.0,370.0,24.0,0.0,5.0,...,4.0,0.0,156.0,2.0,47.0,,,,,
1,https://www.traderjoes.com/home/products/pdp/0...,120,2.0,0.0,0.0,0.0,420.0,20.0,0.0,2.0,...,4.0,0.0,20.0,2.0,70.0,,,,,
2,https://www.traderjoes.com/home/products/pdp/0...,70,4.5,2.5,0.0,10.0,600.0,8.0,0.0,1.0,...,2.0,,,,,,,,,
3,https://www.traderjoes.com/home/products/pdp/0...,170,10.0,7.0,0.0,20.0,220.0,18.0,1.0,1.0,...,3.0,0.0,10.0,0.2,0.0,,,,,
4,https://www.traderjoes.com/home/products/pdp/0...,120,8.0,4.5,0.0,25.0,150.0,11.0,0.0,0.0,...,2.0,0.1,10.0,0.2,20.0,,,,,


In [19]:
df.tail()

Unnamed: 0,URL,CALORY,TOTAL_FAT,SATURATED_FAT,TRANS_FAT,CHOLESTEROL,SODIUM,TOTAL_CARBOHYDRATE,DIETARY_FIBER,TOTAL_SUGARS,...,PROTEIN,VITAMIN_D,CALCIUM,IRON,POTASSIUM,FIBER,POLYUNSATURATED_FAT,MONOUNSATURATED_FAT,SUGARS,ADDED_SUGARS
410,https://www.traderjoes.com/home/products/pdp/0...,80,0.0,0.0,0.0,0.0,220.0,17.0,1.0,1.0,...,1.0,0.0,10.0,0.1,60.0,,,,,
411,https://www.traderjoes.com/home/products/pdp/0...,170,17.0,11.0,0.0,0.0,520.0,6.0,1.0,1.0,...,0.0,0.0,20.0,0.7,140.0,,,,,
412,https://www.traderjoes.com/home/products/pdp/0...,500,28.0,8.0,0.0,125.0,670.0,22.0,2.0,26.0,...,24.0,0.0,63.0,0.0,141.0,,,,,
413,https://www.traderjoes.com/home/products/pdp/0...,120,7.0,0.5,0.0,0.0,25.0,11.0,1.0,5.0,...,3.0,0.0,20.0,2.4,210.0,,,,,
414,https://www.traderjoes.com/home/products/pdp/0...,25,1.5,1.0,0.0,0.0,220.0,1.0,1.0,0.0,...,1.0,0.0,40.0,1.4,40.0,,,,,0.0


In [20]:
df.sample(5)

Unnamed: 0,URL,CALORY,TOTAL_FAT,SATURATED_FAT,TRANS_FAT,CHOLESTEROL,SODIUM,TOTAL_CARBOHYDRATE,DIETARY_FIBER,TOTAL_SUGARS,...,PROTEIN,VITAMIN_D,CALCIUM,IRON,POTASSIUM,FIBER,POLYUNSATURATED_FAT,MONOUNSATURATED_FAT,SUGARS,ADDED_SUGARS
274,https://www.traderjoes.com/home/products/pdp/0...,100,2.0,0.0,0.0,0.0,790.0,18.0,3.0,8.0,...,2.0,0.0,90.0,1.2,510.0,,,,,0.0
104,https://www.traderjoes.com/home/products/pdp/0...,140,7.0,0.5,0.0,0.0,50.0,18.0,1.0,2.0,...,2.0,0.0,10.0,0.3,300.0,,,,,1.0
173,https://www.traderjoes.com/home/products/pdp/0...,150,8.0,4.0,0.0,15.0,340.0,16.0,3.0,5.0,...,4.0,0.0,40.0,1.6,30.0,,,,,
125,https://www.traderjoes.com/home/products/pdp/0...,210,9.0,0.5,0.0,0.0,125.0,30.0,1.0,17.0,...,2.0,,,,,,,,,
108,https://www.traderjoes.com/home/products/pdp/0...,80,6.0,4.5,0.0,25.0,80.0,1.0,0.0,1.0,...,5.0,,,,,,,,,


### Fill Nulls

In [21]:
#Check nulls to fill it
df.isnull().sum().sort_values(ascending = False)

SUGARS                 400
MONOUNSATURATED_FAT    396
POLYUNSATURATED_FAT    396
FIBER                  383
ADDED_SUGARS           317
INCLUDES               129
VITAMIN_D              102
IRON                    99
CALCIUM                 98
POTASSIUM               98
DIETARY_FIBER           49
TOTAL_SUGARS            31
CHOLESTEROL             18
SATURATED_FAT           16
TRANS_FAT               15
SODIUM                   1
TOTAL_CARBOHYDRATE       1
CALORY                   0
PROTEIN                  0
TOTAL_FAT                0
URL                      0
dtype: int64

In [22]:
#fill nulls with 0 
df.fillna(value=0,inplace = True)

In [16]:
#make sure about filling the nulls 
df.isnull().sum().sort_values(ascending = False)

URL                    0
PROTEIN                0
SUGARS                 0
MONOUNSATURATED_FAT    0
POLYUNSATURATED_FAT    0
FIBER                  0
POTASSIUM              0
IRON                   0
CALCIUM                0
VITAMIN_D              0
INCLUDES               0
CALORY                 0
TOTAL_SUGARS           0
DIETARY_FIBER          0
TOTAL_CARBOHYDRATE     0
SODIUM                 0
CHOLESTEROL            0
TRANS_FAT              0
SATURATED_FAT          0
TOTAL_FAT              0
ADDED_SUGARS           0
dtype: int64

### Add Nedded Columns

As we see above, there is __SUGARS__ , __ADDED_SUGARS__, and __INCLUDES__ columns which are all about the sugar. As a result, we can combine them into 1 column. Also, __FIBER__, and __DIETARY_FIBER__ columns meaning the same, so we will combine them into a single column.   

In [17]:
df['SUGARS'] = df['SUGARS'] + df['ADDED_SUGARS'] + df['TOTAL_SUGARS'] + df['INCLUDES']
df['FIBER'] = df['FIBER'] + df['DIETARY_FIBER']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SUGARS'] = df['SUGARS'] + df['ADDED_SUGARS'] + df['TOTAL_SUGARS'] + df['INCLUDES']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['FIBER'] = df['FIBER'] + df['DIETARY_FIBER']


In [18]:
#drop the combined columns 
df.drop(['ADDED_SUGARS','TOTAL_SUGARS','INCLUDES','DIETARY_FIBER'], axis='columns', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [20]:
df.shape

(415, 17)

In [21]:
df.head()

Unnamed: 0,URL,CALORY,TOTAL_FAT,SATURATED_FAT,TRANS_FAT,CHOLESTEROL,SODIUM,TOTAL_CARBOHYDRATE,PROTEIN,VITAMIN_D,CALCIUM,IRON,POTASSIUM,FIBER,POLYUNSATURATED_FAT,MONOUNSATURATED_FAT,SUGARS
0,https://www.traderjoes.com/home/products/pdp/0...,160,5.0,0.0,0.0,20.0,370.0,24.0,4.0,0.0,156.0,2.0,47.0,0.0,0.0,0.0,5.0
1,https://www.traderjoes.com/home/products/pdp/0...,120,2.0,0.0,0.0,0.0,420.0,20.0,4.0,0.0,20.0,2.0,70.0,0.0,0.0,0.0,2.0
2,https://www.traderjoes.com/home/products/pdp/0...,70,4.5,2.5,0.0,10.0,600.0,8.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,https://www.traderjoes.com/home/products/pdp/0...,170,10.0,7.0,0.0,20.0,220.0,18.0,3.0,0.0,10.0,0.2,0.0,1.0,0.0,0.0,1.0
4,https://www.traderjoes.com/home/products/pdp/0...,120,8.0,4.5,0.0,25.0,150.0,11.0,2.0,0.1,10.0,0.2,20.0,0.0,0.0,0.0,0.0


In [22]:
df.tail()

Unnamed: 0,URL,CALORY,TOTAL_FAT,SATURATED_FAT,TRANS_FAT,CHOLESTEROL,SODIUM,TOTAL_CARBOHYDRATE,PROTEIN,VITAMIN_D,CALCIUM,IRON,POTASSIUM,FIBER,POLYUNSATURATED_FAT,MONOUNSATURATED_FAT,SUGARS
410,https://www.traderjoes.com/home/products/pdp/0...,80,0.0,0.0,0.0,0.0,220.0,17.0,1.0,0.0,10.0,0.1,60.0,1.0,0.0,0.0,1.0
411,https://www.traderjoes.com/home/products/pdp/0...,170,17.0,11.0,0.0,0.0,520.0,6.0,0.0,0.0,20.0,0.7,140.0,1.0,0.0,0.0,1.0
412,https://www.traderjoes.com/home/products/pdp/0...,500,28.0,8.0,0.0,125.0,670.0,22.0,24.0,0.0,63.0,0.0,141.0,2.0,0.0,0.0,49.0
413,https://www.traderjoes.com/home/products/pdp/0...,120,7.0,0.5,0.0,0.0,25.0,11.0,3.0,0.0,20.0,2.4,210.0,1.0,0.0,0.0,8.0
414,https://www.traderjoes.com/home/products/pdp/0...,25,1.5,1.0,0.0,0.0,220.0,1.0,1.0,0.0,40.0,1.4,40.0,1.0,0.0,0.0,0.0


In [23]:
df.sample(5)

Unnamed: 0,URL,CALORY,TOTAL_FAT,SATURATED_FAT,TRANS_FAT,CHOLESTEROL,SODIUM,TOTAL_CARBOHYDRATE,PROTEIN,VITAMIN_D,CALCIUM,IRON,POTASSIUM,FIBER,POLYUNSATURATED_FAT,MONOUNSATURATED_FAT,SUGARS
381,https://www.traderjoes.com/home/products/pdp/0...,160,11.0,1.0,0.0,0.0,190.0,13.0,5.0,0.0,30.0,1.2,160.0,4.0,0.0,0.0,1.0
69,https://www.traderjoes.com/home/products/pdp/0...,390,27.0,8.0,0.0,25.0,790.0,35.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,45.0
292,https://www.traderjoes.com/home/products/pdp/0...,140,7.0,0.0,0.0,30.0,470.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
78,https://www.traderjoes.com/home/products/pdp/0...,70,2.5,0.0,0.0,0.0,40.0,10.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,14.0
110,https://www.traderjoes.com/home/products/pdp/0...,100,3.0,0.0,0.0,0.0,160.0,10.0,9.0,0.0,50.0,2.2,250.0,6.0,0.0,0.0,0.0


### Add Data to DataBase 

In [24]:
conn = sql.connect('Cleaned_TraderJoes_df.db')

In [25]:
df.to_sql('Cleaned_TraderJoes_df', conn)