## Load data from the MySQL database

In [1]:
# Dependencies for accessing MySQL database
from sqlalchemy import create_engine
import pymysql
from config import password

# Dependencies for data analyses and dataframe building
import pandas as pd
import numpy as np

# Dependencies for creating co-occurrence matrices
import itertools
import nltk
from nltk import bigrams

# Dependencies for visualising co-occurrence matrices
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Open the database connection
engine = create_engine(f"mysql+pymysql://root:{password}@localhost/dishes_db")
db_conn = engine.connect()

# Create a dataframe based on a query for the GSR embedding of the 
# dishes and occasions by state
df = pd.read_sql("select * from expert_elicitation limit 300", db_conn)

# Close the connection
db_conn.close()

In [3]:
# Print the dataframe
# Dish1 = original value from expert elicitation
# Dish2 = equivalent name (to reduce sparsity of dish name values)
df.head()

Unnamed: 0,Id,State,Occasion,Dish,Dish2
0,1,Odisha,Breakfast,Bara,Bara
1,2,Odisha,Breakfast,Bread Sandwich,Sandwich
2,3,Odisha,Breakfast,Chakuli,Chakuli
3,4,Odisha,Breakfast,Chatua,Chatua
4,5,Odisha,Breakfast,Chole/Guguni,Chole


## Explore the dataset

In [4]:
def dataframe(state):
    x = df.loc[df["State"] == state].drop_duplicates()
    return x

In [5]:
# Create individual dataframes by state (and put them in a list)
states = ["Odisha", "West Bengal"]
df_state = [dataframe(state) for state in states]  

# Print number of unique dishes (after reducing sparsity)
for x in range(0, len(states)):
    print(f"There are {len(df_state[x].Dish2.unique())} unique dishes in {states[x]}.")

There are 74 unique dishes in Odisha.
There are 81 unique dishes in West Bengal.


In [6]:
# Count how many times a dish is mentioned per state
dish_grouped = df.groupby(["State", "Dish2"])["Dish2"].count().to_frame("Count")\
               .reset_index()
dish_grouped.sort_values(["State","Count"], ascending = False).head()

Unnamed: 0,State,Dish2,Count
109,West Bengal,Fried vegetables,3
116,West Bengal,Khichdi,3
117,West Bengal,Luchi,3
138,West Bengal,Raita,3
143,West Bengal,Roti,3


In [7]:
# Count how many times a dish is mentioned in both states
dish_grouped2 = df.groupby("Dish2")["Dish2"].count().to_frame("Count").reset_index()
dish_grouped2.sort_values("Count", ascending = False).head()

Unnamed: 0,Dish2,Count
96,Roti,6
89,Raita,6
61,Khichdi,5
23,Chicken curry,5
52,Fried vegetables,5


In [8]:
# Common dishes between states (occasions differ)
dishes_common = pd.merge(df_state[0][["State", "Occasion", "Dish2"]],
                         df_state[1][["State", "Occasion", "Dish2"]],
                         on = "Dish2").drop_duplicates()
dishes_common.head()

Unnamed: 0,State_x,Occasion_x,Dish2,State_y,Occasion_y
0,Odisha,Breakfast,Sandwich,West Bengal,PM Snack
1,Odisha,Breakfast,Chakuli,West Bengal,PM Snack
2,Odisha,PM Snack,Chakuli,West Bengal,PM Snack
3,Odisha,Dinner,Chakuli,West Bengal,PM Snack
4,Odisha,Breakfast,Chatua,West Bengal,Breakfast


In [9]:
# What are the common dishes between states
dishes_common["Dish2"].unique()

array(['Sandwich', 'Chakuli', 'Chatua', 'Corn flakes', 'Idli', 'Paratha',
       'Madhi chakta', 'Puri', 'Roti', 'Suji halwa', 'Biscuit', 'Fruit',
       'Roasted groundnut', 'Chicken curry', 'Dal', 'Dalma', 'Egg curry',
       'Fish curry', 'Khichdi', 'Sanjula', 'Mutton curry', 'Pampad',
       'Paneer curry', 'Rice', 'Mashed potato', 'Prawn curry', 'Raita',
       'Fried vegetables', 'Panipuri', 'Moori masala', 'Mudhi khira',
       'Noodles', 'Chuda bhaja', 'Chutney', 'Rice kheer', 'Rasgola',
       'Guguni'], dtype=object)

## Create co-occurrence matrices

In [10]:
# Create a list of dictionaries of dish lists by occasion 
# 0 = Odisha; 1 = West Bengal
dicts_state = [df_state[x].groupby("Occasion")["Dish2"].apply(list).to_dict()\
              for x in range(0, len(df_state))]

# Create a list of dish lists
dish_lists = [[dicts_state[x][y] for y in dicts_state[x]] \
              for x in range(0, len(dicts_state))]

In [11]:
def coo_matrix(corpus):
    """ Create a co-occurrence matrix """
    vocab = set(corpus)
    vocab = list(vocab)
    vocab_index = {word: i for i, word in enumerate(vocab)}
    
    # Create bigrams
    bi_grams = list(bigrams(corpus))
    
    # Frequency distribution of bigrams ((word1, word2), num_occurrences)
    bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))
    
    # Initialise co-occurrence matrix
    # co_occurrence_matrix[current][previous]
    co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))
    
    # Loop through the bigrams taking the current and previous word,
    # and the number of occurrences of the bigram.
    for bigram in bigram_freq:
        current = bigram[0][1]
        previous = bigram[0][0]
        count = bigram[1]
        pos_current = vocab_index[current]
        pos_previous = vocab_index[previous]
        co_occurrence_matrix[pos_current][pos_previous] = count
    co_occurrence_matrix = np.matrix(co_occurrence_matrix)
 
    # return the matrix and the index
    return co_occurrence_matrix, vocab_index

In [12]:
# Create a list of co-occurrence matrices
coo_matrices = []
for x in range(0, len(states)):
    data = list(itertools.chain.from_iterable(dish_lists[x])) # flat list of dishes
    matrix, vocab_index = coo_matrix(data)
    data_matrix = pd.DataFrame(matrix, index=vocab_index, columns=vocab_index)
    
    # Save the co-occurrence matrix as csv
    data_matrix.to_csv(f"coo-matrix_{states[x]}.csv", header = True)
    
    # Append co-occurrence matrix to list   
    coo_matrices.append(data_matrix)

In [13]:
# Preview one of the co-occurrence matrices
coo_matrices[0]

Unnamed: 0,Sambar,Ghuguri,Paratha,Madhi chakta,Roasted groundnut,Fried rice,Aloo dum,Bhature,Vegetable pakoda,Mushroom curry,...,Crab curry,Chatua,Palak paneer,Pakhala,Chuda upma,Biscuit,Egg curry,Moori masala,Rice pitha,Ragi porridge
Sambar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ghuguri,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Paratha,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
Madhi chakta,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Roasted groundnut,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fried rice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Aloo dum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Bhature,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Vegetable pakoda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mushroom curry,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
