<a href="https://colab.research.google.com/github/stefsoliveira/InteliCipes/blob/pre-processing/Pre_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Mounting Google Drive to access the dataset

In [None]:
# Mount Google Drive at /content/drive in the Colab runtime; the dataset is
# read from a path under this mount point further down.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Importing the tables

In [None]:
# Location of the raw recipes table on the mounted Drive.
RAW_RECIPES_PATH = '/content/drive/My Drive/InteliCipes/RAW_recipes.csv'

# Load the raw table; every later cell works on this frame.
data = pd.read_csv(RAW_RECIPES_PATH)

# Querying null values

In [None]:
# One boolean per column: True when the column holds at least one missing value.
has_missing = data.isna().any()
print(has_missing)

name               True
id                False
minutes           False
contributor_id    False
submitted         False
tags              False
nutrition         False
n_steps           False
steps             False
description        True
ingredients       False
n_ingredients     False
dtype: bool


# Removing null values

In [None]:
# Drop every row that has a missing value in any column.
# The result is rebound instead of using inplace=True: the statement then reads
# as an explicit "new frame from old frame" step and can be chained.
# Note: the original row labels are kept, so the index has gaps afterwards.
data = data.dropna(axis=0)

In [None]:
# Repeat the per-column missing-value check after the rows were dropped;
# every column is expected to report False now.
data.isna().any()

name              False
id                False
minutes           False
contributor_id    False
submitted         False
tags              False
nutrition         False
n_steps           False
steps             False
description       False
ingredients       False
n_ingredients     False
dtype: bool

# Removing columns that will not be used

In [None]:
# Discard the contributor id column.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
data = data.drop(columns=['contributor_id'], errors='ignore')
print(data.columns)

Index(['name', 'id', 'minutes', 'submitted', 'tags', 'nutrition', 'n_steps',
       'steps', 'description', 'ingredients', 'n_ingredients'],
      dtype='object')


# Rename column

In [None]:
# Rename the submission-date column.
# Rebinding the returned frame (instead of inplace=True) keeps the step explicit
# and chainable; rename ignores missing labels by default, so re-running the
# cell is harmless.
data = data.rename(columns={'submitted': 'submitted_recipe'})
print(data.columns)

Index(['name', 'id', 'minutes', 'submitted_recipe', 'tags', 'nutrition',
       'n_steps', 'steps', 'description', 'ingredients', 'n_ingredients'],
      dtype='object')


# Indexing

In [None]:
# Show the row index. The labels of rows removed by the earlier dropna are not
# renumbered, so the index has gaps (5 and 8 are absent in the recorded output).
data.index

Int64Index([     0,      1,      2,      3,      4,      6,      7,      9,
                10,     11,
            ...
            231627, 231628, 231629, 231630, 231631, 231632, 231633, 231634,
            231635, 231636],
           dtype='int64', length=226657)

# Remove trailing whitespace

In [None]:
# Strip trailing whitespace from the recipe names and store the result back on
# the frame; previously the stripped Series was only displayed and discarded,
# so `data` was never actually cleaned.
# Bracket access is used because attribute access (data.name) can collide with
# DataFrame attributes.
data['name'] = data['name'].str.rstrip()
data['name'].head()

0    arriba   baked winter squash mexican style
1              a bit different  breakfast pizza
2                     all in the kitchen  chili
3                            alouette  potatoes
4            amish  tomato ketchup  for canning
Name: name, dtype: object

In [None]:
# Strip trailing whitespace from the tags strings and store the result back on
# the frame; previously the stripped Series was only displayed and discarded.
data['tags'] = data['tags'].str.rstrip()
data['tags'].head()

0    ['60-minutes-or-less', 'time-to-make', 'course...
1    ['30-minutes-or-less', 'time-to-make', 'course...
2    ['time-to-make', 'course', 'preparation', 'mai...
3    ['60-minutes-or-less', 'time-to-make', 'course...
4    ['weeknight', 'time-to-make', 'course', 'main-...
Name: tags, dtype: object

In [None]:
# Strip trailing whitespace from the steps strings and store the result back on
# the frame; previously the stripped Series was only displayed and discarded.
data['steps'] = data['steps'].str.rstrip()
data['steps'].head()

0    ['make a choice and proceed with recipe', 'dep...
1    ['preheat oven to 425 degrees f', 'press dough...
2    ['brown ground beef in large pot', 'add choppe...
3    ['place potatoes in a large pot of lightly sal...
4    ['mix all ingredients& boil for 2 1 / 2 hours ...
Name: steps, dtype: object

In [None]:
# Strip trailing whitespace from the descriptions and store the result back on
# the frame; previously the stripped Series was only displayed and discarded.
data['description'] = data['description'].str.rstrip()
data['description'].head()

0    autumn is my favorite time of year to cook! th...
1    this recipe calls for the crust to be prebaked...
2    this modified version of 'mom's' chili was a h...
3    this is a super easy, great tasting, make ahea...
4    my dh's amish mother raised him on this recipe...
Name: description, dtype: object

# Remove parentheses and brackets

In [None]:
def remove_parenteses(item):
    """Return ``item`` with every '(' and ')' character removed.

    Args:
        item: The value to clean. Non-string values (for example NaN in a
            pandas object column) are returned unchanged.

    Returns:
        The cleaned string, or ``item`` itself when it is not a string.
    """
    if not isinstance(item, str):
        return item
    # Remove both characters unconditionally. Guarding on '(' alone left a
    # stray ')' in place whenever the text had no opening parenthesis.
    return item.replace('(', '').replace(')', '')
    
# Remove parentheses from every recipe name and keep the result on the frame.
# Previously the cleanup ran on the first five rows only and its result was
# discarded, so `data` was left unchanged.
data['name'] = data['name'].apply(remove_parenteses)
data['name'].head()

0    arriba   baked winter squash mexican style
1              a bit different  breakfast pizza
2                     all in the kitchen  chili
3                            alouette  potatoes
4            amish  tomato ketchup  for canning
Name: name, dtype: object

In [None]:
# Capture the text between the outer square brackets of each tags string.
# - The pattern is a raw string: '\[' in a normal string literal is an invalid
#   escape sequence and triggers a warning on current Python versions.
# - The capture is greedy so it runs to the LAST ']' of the string; the lazy
#   form stopped at the first ']' and would truncate a value containing one.
# - The former `data.tags.head().apply(remove_parenteses)` line was removed:
#   its result was neither displayed nor stored.
tags_inner = data['tags'].str.extract(r'\[(.*)\]', expand=False)
# Legacy name kept for compatibility; it is rebound by the steps extraction
# further down, so prefer `tags_inner` when the tags result is needed later.
data1 = tags_inner
tags_inner

0         '60-minutes-or-less', 'time-to-make', 'course'...
1         '30-minutes-or-less', 'time-to-make', 'course'...
2         'time-to-make', 'course', 'preparation', 'main...
3         '60-minutes-or-less', 'time-to-make', 'course'...
4         'weeknight', 'time-to-make', 'course', 'main-i...
                                ...                        
231632    'ham', '60-minutes-or-less', 'time-to-make', '...
231633    '15-minutes-or-less', 'time-to-make', 'course'...
231634    '60-minutes-or-less', 'time-to-make', 'course'...
231635    '30-minutes-or-less', 'time-to-make', 'course'...
231636    '30-minutes-or-less', 'time-to-make', 'course'...
Name: tags, Length: 226657, dtype: object

In [None]:
# Capture the text between the outer square brackets of each steps string.
# - The pattern is a raw string: '\[' in a normal string literal is an invalid
#   escape sequence and triggers a warning on current Python versions.
# - The capture is greedy so it runs to the LAST ']' of the string; the lazy
#   form stopped at the first ']' and would cut off any step text that itself
#   contains a closing bracket.
# - The former `data.steps.head().apply(remove_parenteses)` line was removed:
#   its result was neither displayed nor stored.
steps_inner = data['steps'].str.extract(r'\[(.*)\]', expand=False)
# Legacy name kept for compatibility with code that still reads `data1`.
data1 = steps_inner
steps_inner

0         'make a choice and proceed with recipe', 'depe...
1         'preheat oven to 425 degrees f', 'press dough ...
2         'brown ground beef in large pot', 'add chopped...
3         'place potatoes in a large pot of lightly salt...
4         'mix all ingredients& boil for 2 1 / 2 hours ,...
                                ...                        
231632    'heat oil in a 4-quart dutch oven', 'add celer...
231633            'mix all ingredients together thoroughly'
231634    'in a bowl , combine the mashed yolks and mayo...
231635    'place melted butter in a large mixing bowl an...
231636    'whip sugar and shortening in a large bowl , a...
Name: steps, Length: 226657, dtype: object

In [None]:
# Remove parentheses from every description and keep the result on the frame.
# Previously the cleanup ran on the first five rows only and its result was
# discarded, so `data` was left unchanged.
data['description'] = data['description'].apply(remove_parenteses)
data['description'].head()

0    autumn is my favorite time of year to cook! th...
1    this recipe calls for the crust to be prebaked...
2    this modified version of 'mom's' chili was a h...
3    this is a super easy, great tasting, make ahea...
4    my dh's amish mother raised him on this recipe...
Name: description, dtype: object

# Convert numeric columns to int32

In [None]:
# Cast the four integer columns to int32, one column at a time.
for int_col in ('id', 'minutes', 'n_steps', 'n_ingredients'):
    data[int_col] = data[int_col].astype('int32')

# General dataset information

In [None]:
# Preview the first five rows of the frame.
data.head()

Unnamed: 0,name,id,minutes,submitted_recipe,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [None]:
# Print the per-column dtypes and non-null counts plus the memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 226657 entries, 0 to 231636
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   name              226657 non-null  object
 1   id                226657 non-null  int32 
 2   minutes           226657 non-null  int32 
 3   submitted_recipe  226657 non-null  object
 4   tags              226657 non-null  object
 5   nutrition         226657 non-null  object
 6   n_steps           226657 non-null  int32 
 7   steps             226657 non-null  object
 8   description       226657 non-null  object
 9   ingredients       226657 non-null  object
 10  n_ingredients     226657 non-null  int32 
dtypes: int32(4), object(7)
memory usage: 17.3+ MB
