Here, we will be building a DANN.

In [91]:
#importing modules
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, Model
import time
import matplotlib.pyplot as plt
import numpy as np
import keras as K
from tensorflow import keras
from sklearn.model_selection import train_test_split
from keras.layers import Dense


In [92]:

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

Preprocessing Source Domain Datasets

In [93]:
#looking into data and its columns
import pandas as pd

# Replace 'your_file_path.csv' with the actual path to your CSV file
file_path = 'datasets/source/kaggle_food.csv'

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to check the data
print(df.head())


   Unnamed: 0                                              Title  \
0           0  Miso-Butter Roast Chicken With Acorn Squash Pa...   
1           1                    Crispy Salt and Pepper Potatoes   
2           2                        Thanksgiving Mac and Cheese   
3           3                 Italian Sausage and Bread Stuffing   
4           4                                       Newton's Law   

                                         Ingredients  \
0  ['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...   
1  ['2 large egg whites', '1 pound new potatoes (...   
2  ['1 cup evaporated milk', '1 cup whole milk', ...   
3  ['1 (¾- to 1-pound) round Italian loaf, cut in...   
4  ['1 teaspoon dark brown sugar', '1 teaspoon ho...   

                                        Instructions  \
0  Pat chicken dry with paper towels, season all ...   
1  Preheat oven to 400°F and line a rimmed baking...   
2  Place a rack in middle of oven; preheat to 400...   
3  Preheat oven to 350°F with 

In [94]:
#here we will only be using 'Title' and 'Ingredients' for our purpose
#so we will be dropping the remaining columns
df = df.drop(columns=['Unnamed: 0', 'Ingredients', 'Image_Name',
       'Cleaned_Ingredients'])

In [95]:
#here we are creating a fake labels for temporary test
#1/3 part of the labels to 'appetizers'. another 1/3 part to 'dinner' and the last 1/3 part to 'desserts'

total_size = len(df)
category_size = total_size // 3

df.loc[:category_size - 1, 'Title'] = 'Appetizers'
df.loc[category_size:2*category_size - 1, 'Title'] = 'Dinner'
df.loc[2*category_size:total_size - 1, 'Title'] = 'Desserts'

In [96]:
#shuffling the dataframe
df = df.sample(frac=1, random_state=42)

# Assuming df['Instructions'] is your text data
df['Instructions'].fillna('', inplace=True)  # Replace NaN values with an empty string


In [97]:
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

#defining the parameters
num_classes = 1
embedding_dim = 100

# Assuming df['Instructions'] is your text data, we  tokenize the input dataset into tokens for the CNN model
#Tokenizer Initialization and Fitting:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Instructions'])

#Vocabulary Size and Maximum Sequence Length Calculation:
vocab_size = len(tokenizer.word_index) + 1
max_sequence_length = max(df['Instructions'].apply(lambda x: len(x.split())))

#Texts to Sequences:
sequences = tokenizer.texts_to_sequences(df['Instructions'])

#Padding Sequences
data = pad_sequences(sequences, maxlen=max_sequence_length)


In [98]:
# Assuming 'text' is your input data and 'label' is your target variable
X = data   #data
y = df['Title'].values           #labels 

# Convert labels to numerical format using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train_source, X_test_source, y_train_source, y_test_source = train_test_split(X, y, test_size=0.2, random_state=42)


Preprocessing Target Domain Dataset

In [99]:
# Replace 'your_file_path.csv' with the actual path to your CSV file
file_path = 'datasets/target/recipes_serp_youtube_data.csv'

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to check the data
print(df.head())

             q                         queryTime  rank  \
0  dish recipe  2019-03-30 02:55:42.169989+00:00     1   
1  dish recipe  2019-03-30 02:55:42.169989+00:00     2   
2  dish recipe  2019-03-30 02:55:42.169989+00:00     3   
3  dish recipe  2019-03-30 02:55:42.169989+00:00     4   
4  dish recipe  2019-03-30 02:55:42.169989+00:00     5   

                                               title  \
0                             7 Easy Chicken Dinners   
1  KING of VEGETABLE Recipe | SAMBAR Recipe with ...   
2  ALOO Manchurian Recipe How to make aloo manchu...   
3  सूजी का इतना टेस्टी और आसान नाश्ता की आप रोज़ ...   
4  5 मिनट में बनाये कुरकुरे आलू स्नैक्स | Aloo Sn...   

                                         description  \
0  Customize & buy the Tasty Cookbook here: http:...   
1  Today we cooking one of the most popular veg r...   
2  ldli Manchurian Recipe https://youtu.be/wx7Mx9...   
3  Hello Friend's ….Aaj Main Aapko Quick & Easy B...   
4  If you liked the video give it 

In [100]:
df.columns

Index(['q', 'queryTime', 'rank', 'title', 'description', 'publishedAt',
       'channelTitle', 'totalResults', 'kind', 'channelId', 'default.height',
       'default.url', 'default.width', 'high.height', 'high.url', 'high.width',
       'liveBroadcastContent', 'medium.height', 'medium.url', 'medium.width',
       'nextPageToken', 'playlistId', 'resultsPerPage', 'thumbnails',
       'videoId', 'video.contentDetails', 'video.etag', 'video.id',
       'video.kind', 'video.localizations', 'video.player',
       'video.recordingDetails', 'video.snippet', 'video.statistics',
       'video.status', 'video.topicDetails', 'video.categoryId',
       'video.channelId', 'video.channelTitle', 'video.defaultAudioLanguage',
       'video.defaultLanguage', 'video.description',
       'video.liveBroadcastContent', 'video.localized', 'video.publishedAt',
       'video.tags', 'video.thumbnails', 'video.title',
       'video.relevantTopicIds', 'video.topicCategories', 'video.topicIds',
       'video.comme

In [101]:
#here we will only be using 'title' and 'description' for our purpose
#so we will be dropping the remaining columns
df = df.drop(columns=['q', 'queryTime', 'rank', 'publishedAt', 'channelTitle', 'totalResults', 'kind', 'channelId', 'default.height', 'default.url', 'default.width', 'high.height', 'high.url', 'high.width', 'liveBroadcastContent', 'medium.height', 'medium.url', 'medium.width', 'nextPageToken', 'playlistId', 'resultsPerPage', 'thumbnails', 'videoId', 'video.contentDetails', 'video.etag', 'video.id', 'video.kind', 'video.localizations', 'video.player', 'video.recordingDetails', 'video.snippet', 'video.statistics', 'video.status', 'video.topicDetails', 'video.categoryId', 'video.channelId', 'video.channelTitle', 'video.defaultAudioLanguage', 'video.defaultLanguage', 'video.description', 'video.liveBroadcastContent', 'video.localized', 'video.publishedAt','video.tags', 'video.thumbnails', 'video.title', 'video.relevantTopicIds', 'video.topicCategories', 'video.topicIds', 'video.commentCount', 'video.dislikeCount', 'video.favoriteCount', 'video.likeCount', 'video.viewCount', 'video.embeddable', 'video.license', 'video.privacyStatus', 'video.publicStatsViewable', 'video.uploadStatus', 'video.caption', 'video.definition', 'video.dimension', 'video.duration', 'video.licensedContent', 'video.projection', 'video.regionRestriction', 'video.liveStreamingDetails', 'video.contentRating', 'channel.contentDetails', 'channel.etag', 'channel.id', 'channel.kind', 'channel.snippet', 'channel.statistics', 'channel.country', 'channel.customUrl', 'channel.defaultLanguage', 'channel.description', 'channel.localized', 'channel.publishedAt', 'channel.thumbnails', 'channel.title', 'channel.commentCount', 'channel.hiddenSubscriberCount', 'channel.subscriberCount', 'channel.videoCount', 'channel.viewCount', 'channel.relatedPlaylists'])

In [102]:
df.columns

Index(['title', 'description'], dtype='object')

In [103]:
# Assuming df['Instructions'] is your text data
df['description'].fillna('', inplace=True)  # Replace NaN values with an empty string


In [104]:
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

#defining the parameters
num_classes = 1
embedding_dim = 100

# Assuming df['Instructions'] is your text data, we  tokenize the input dataset into tokens for the CNN model
#Tokenizer Initialization and Fitting:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['description'])

#Vocabulary Size and Maximum Sequence Length Calculation:
vocab_size = len(tokenizer.word_index) + 1
max_sequence_length = max(df['description'].apply(lambda x: len(x.split())))

#Texts to Sequences:
sequences = tokenizer.texts_to_sequences(df['description'])

#Padding Sequences
data = pad_sequences(sequences, maxlen=max_sequence_length)


In [105]:

# Assuming 'text' is your input data and 'label' is your target variable
X = data   #data
y = df['title'].values           #labels 

# Convert labels to numerical format using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train_target, X_test_target, y_train_target, y_test_target = train_test_split(X, y, test_size=0.2, random_state=42)


In [106]:
# Prepare domain labels
domain_label_source = np.zeros((X_train_source.shape[0], 1))  # Source domain label is 0
domain_label_target = np.ones((X_train_target.shape[0], 1))   # Target domain label is 1

In [107]:
# Concatenate source and target data
X_combined = np.concatenate([X_train_source, X_train_target], axis=0)
y_combined = np.concatenate([y_train_source, np.zeros((y_train_source.shape))], axis=0) # Labels for target domain can be ignored
domain_labels_combined = np.vstack((domain_label_source, domain_label_target))

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 2587 and the array at index 1 has size 49

In [None]:
# Define the Feature Extractor
def build_feature_extractor(vocab_size,embedding_dim,max_sequence_length):
    model = models.Sequential([
        layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length),
        layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
        layers.GlobalMaxPooling1D()
    ])
    return model

In [None]:
# Define the Label Predictor
def build_label_predictor(num_classes):
    model = models.Sequential([
        Dense(units=num_classes, activation='softmax',name='output')
    ])
    return model

In [None]:
# Define the Domain Predictor
def build_domain_predictor():
    model = models.Sequential([
        layers.Dense(64,activation='relu',name='dense_1'),
        layers.Dense(1,activation='sigmoid',name='dense_2')
    ])
    return model

In [None]:
# Build the complete DANN model
def build_dann(vocab_size,embedding_dim,max_sequence_length, num_classes):
    feature_extractor = build_feature_extractor(vocab_size,embedding_dim,max_sequence_length)
    label_predictor = build_label_predictor(num_classes)
    domain_predictor = build_domain_predictor()

    # Define inputs
    input_data = layers.Input(shape=(max_sequence_length,))
    label = layers.Input(shape=(num_classes,))
    domain_label = layers.Input(shape=(1,))
    
    # Feature extractor output
    feature_output = feature_extractor(input_data)
    
    # Label prediction branch
    label_output = label_predictor(feature_output)
    
    # Domain prediction branch
    domain_output = domain_predictor(feature_output)
    
    dann_model = models.Model(inputs=[input_data, label, domain_label], 
                              outputs=[label_output, domain_output])
    
    return dann_model

In [None]:
# Define loss functions
def label_loss(y_true, y_pred):
    return tf.keras.losses.categorical_crossentropy(y_true, y_pred)

def domain_loss(y_true, y_pred):
    return tf.keras.losses.binary_crossentropy(y_true, y_pred)

In [None]:
# Build and compile the DANN model
dann_model = build_dann(vocab_size,embedding_dim,max_sequence_length, num_classes)

dann_model.compile(optimizer='Adam',
                   loss=[label_loss, domain_loss],
                   loss_weights=[1.0,0.1])

In [None]:
# Train the DANN

start_time = time.time()

dann_model.fit([x_combined,y_combined,domain_labels_combined], 
               [x_combined,y_combined], 
               epochs=1)

end_time = time.time()

NameError: name 'x_combined' is not defined