## Feature extraction for training the svm model to identify a recipe

In [1]:
import pandas as pd

df = pd.read_csv('database/posts.csv')  
df.head(100)

Unnamed: 0,Post ID,Post Text
0,1,"c(""Soak saffron in warm milk for 5 minutes and..."
1,2,"c(""Into a 1 quart Jar with tight fitting lid, ..."
2,3,"c(""Drain the tofu, carefully squeezing out exc..."
3,4,"c(""Mix everything together and bring to a boil..."
4,5,"c(""Graham Cracker Crust: In small bowl, combin..."
...,...,...
95,96,3 Things Everyone Knows About POLITICAL That ...
96,97,10 Funny SCIENCE Quotes
97,98,Best Make POLITICS You Will Read This Year (i...
98,99,5 Incredibly Useful MATHEMATICS Tips For Smal...


### 1. Data Cleaning

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a new DataFrame for the cleaned post descriptions
df_clean = df[['Post Text']].copy()  
df_clean['Post Text'] = df_clean['Post Text'].str.lower()  # convert to lower case
df_clean['Post Text'] = df_clean['Post Text'].str.replace('[^\w\s]', '')  # remove punctuation

df_clean.head(100)

  df_clean['Post Text'] = df_clean['Post Text'].str.replace('[^\w\s]', '')  # remove punctuation


Unnamed: 0,Post Text
0,csoak saffron in warm milk for 5 minutes and p...
1,cinto a 1 quart jar with tight fitting lid put...
2,cdrain the tofu carefully squeezing out excess...
3,cmix everything together and bring to a boil r...
4,cgraham cracker crust in small bowl combine gr...
...,...
95,3 things everyone knows about political that ...
96,10 funny science quotes
97,best make politics you will read this year in...
98,5 incredibly useful mathematics tips for smal...


### 2. Feature Extraction - Word Count

In [3]:
# Define your list of cooking-related words
words = ['ingredients', 'oven', 'bake', 'cook', 'cooking', 'mix', 'boiling', 'boil', 'fry', 'grill', 'roast', 'stir', 'blend', 'chop', 'slice', 'dice', 'mince', 'marinade', 'spice']  

# Initialize a CountVectorizer
vectorizer = CountVectorizer(binary=False, vocabulary=words)

# Apply the CountVectorizer to the cleaned post descriptions
X = vectorizer.fit_transform(df_clean['Post Text'])

# Create a new DataFrame for the word counts
features_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Add a new column to df_clean that contains the count of cooking-related words for each post
df_clean['Cooking Verbs'] = features_df.sum(axis=1)

df_clean.head(100)

Unnamed: 0,Post Text,Cooking Verbs
0,csoak saffron in warm milk for 5 minutes and p...,8
1,cinto a 1 quart jar with tight fitting lid put...,2
2,cdrain the tofu carefully squeezing out excess...,10
3,cmix everything together and bring to a boil r...,1
4,cgraham cracker crust in small bowl combine gr...,7
...,...,...
95,3 things everyone knows about political that ...,0
96,10 funny science quotes,0
97,best make politics you will read this year in...,0
98,5 incredibly useful mathematics tips for smal...,0


### 3. Feature Extraction - TF-IDF, Length, Number Usage, and Bigrams
   This section extracts several features from the text data:

   *TF-IDF scores for the cooking-related words

   *Length of each post
   
   *Presence of bigrams (pairs of consecutive words)


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re

# Extract TF-IDF features
vectorizer = TfidfVectorizer(vocabulary=words)
X = vectorizer.fit_transform(df_clean['Post Text'])
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df_clean = pd.concat([df_clean, tfidf_df.add_prefix('tfidf_')], axis=1)

# Extract length feature
df_clean['length'] = df_clean['Post Text'].apply(len)

# Extract number usage feature
df_clean['num_numbers'] = df_clean['Post Text'].str.count('\d')

# Extract n-gram features (bigrams)
vectorizer = CountVectorizer(binary=True, ngram_range=(2, 2))
X = vectorizer.fit_transform(df_clean['Post Text'])
ngram_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df_clean = pd.concat([df_clean, ngram_df.add_prefix('bigram_')], axis=1)

# Sum all TF-IDF features into one column
df_clean['tfidf_sum'] = tfidf_df.sum(axis=1)

# Sum all cooking verb features into one column
cooking_verbs_cols = [col for col in df_clean.columns if col in words]
if cooking_verbs_cols:
    df_clean['cooking_verbs_sum'] = df_clean[cooking_verbs_cols].sum(axis=1)

# Sum all bigram features into one column
df_clean['bigram_sum'] = ngram_df.sum(axis=1)
# Add 'Post Length' column
df_clean['Post Length'] = df_clean['Post Text'].apply(len)



In [None]:
df_clean.head()

Unnamed: 0,Post Text,Cooking Verbs,tfidf_sum,bigram_sum,tfidf_ingredients,tfidf_oven,tfidf_bake,tfidf_cook,tfidf_cooking,tfidf_mix,...,bigram_your zoology,bigram_zest add,bigram_zoology is,bigram_zoology to,bigram_zucchini 12,bigram_zucchini egg,bigram_zucchini into,bigram_zucchini then,bigram_zucchiniwith this,Post Length
0,csoak saffron in warm milk for 5 minutes and p...,8,2.394449,172,0.000000,0.0,0.0,0.507365,0.0,0.496544,...,0,0,0,0,0,0,0,0,0,1050
1,cinto a 1 quart jar with tight fitting lid put...,2,1.413113,68,0.000000,0.0,0.0,0.000000,0.0,0.734450,...,0,1,0,0,0,0,0,0,0,364
2,cdrain the tofu carefully squeezing out excess...,10,2.049659,213,0.277353,0.0,0.0,0.000000,0.0,0.000000,...,0,0,0,0,0,0,1,1,0,1414
3,cmix everything together and bring to a boil r...,1,1.000000,30,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,180
4,cgraham cracker crust in small bowl combine gr...,7,1.319044,181,0.000000,0.0,0.0,0.404511,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,1307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3 things everyone knows about political that ...,0,0.000000,7,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,54
96,10 funny science quotes,0,0.000000,3,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,24
97,best make politics you will read this year in...,0,0.000000,9,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,51
98,5 incredibly useful mathematics tips for smal...,0,0.000000,6,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,58


In [10]:
# Subset the dataframe to only include the desired columns
df_clean = df_clean[['Post Text', 'Cooking Verbs', 'tfidf_sum', 'bigram_sum','Post Length']]

df_clean.head(100)

Unnamed: 0,Post Text,Cooking Verbs,tfidf_sum,bigram_sum,Post Length
0,csoak saffron in warm milk for 5 minutes and p...,8,2.394449,172,1050
1,cinto a 1 quart jar with tight fitting lid put...,2,1.413113,68,364
2,cdrain the tofu carefully squeezing out excess...,10,2.049659,213,1414
3,cmix everything together and bring to a boil r...,1,1.000000,30,180
4,cgraham cracker crust in small bowl combine gr...,7,1.319044,181,1307
...,...,...,...,...,...
95,3 things everyone knows about political that ...,0,0.000000,7,54
96,10 funny science quotes,0,0.000000,3,24
97,best make politics you will read this year in...,0,0.000000,9,51
98,5 incredibly useful mathematics tips for smal...,0,0.000000,6,58


In [12]:
df_clean['is_recipe'] = [1]*50 + [0]*50
df_clean.head(100)

Unnamed: 0,Post Text,Cooking Verbs,tfidf_sum,bigram_sum,Post Length,is_recipe
0,csoak saffron in warm milk for 5 minutes and p...,8,2.394449,172,1050,1
1,cinto a 1 quart jar with tight fitting lid put...,2,1.413113,68,364,1
2,cdrain the tofu carefully squeezing out excess...,10,2.049659,213,1414,1
3,cmix everything together and bring to a boil r...,1,1.000000,30,180,1
4,cgraham cracker crust in small bowl combine gr...,7,1.319044,181,1307,1
...,...,...,...,...,...,...
95,3 things everyone knows about political that ...,0,0.000000,7,54,0
96,10 funny science quotes,0,0.000000,3,24,0
97,best make politics you will read this year in...,0,0.000000,9,51,0
98,5 incredibly useful mathematics tips for smal...,0,0.000000,6,58,0


In [13]:
# Save df_clean to a CSV file
df_clean.to_csv('cleaned_posts.csv', index=False)


: 