## *Importing required libraries*

In [1]:
import pandas as pd
import gensim

## *Merged 2 excel sheets datasets, in total we have used 329 asanas benefits*

In [2]:
df = pd.read_csv('sample_data/test.csv')

In [3]:
df

Unnamed: 0,Asana,Benefits
0,PADOTTHANASANA,This asana strengthens\nthe abdominal muscles ...
1,PARVATASANA,This pose strengthens the nerves and muscles i...
2,ARDHA TITALI ASANA,This is an excellent \npreparatory practice fo...
3,GATYATMAK MERU \nVAKRASANA,This asana removes stiffness \nof the back and...
4,SIDEWAYS VIEWING,Sideways viewing relaxes the \ntension of the ...
...,...,...
321,kapilasana,"1. Helps in stretching legs, hamstrings, arms,..."
322,omkarasana,It relieves the pain of hands and feet( It is ...
323,kashyapawsana,1. This asana provides a deep muscular massage...
324,bhunamanasana,1. Bhunamanasana stretches and improves the fl...


In [4]:
#shape of the data frame
df.shape

(326, 2)

## *Removing Stop words at first and converting it into lower cases so that all stop words can be remove which were in Upper cases.*


In [5]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
#  1st step for removing stop words is to use the library and remove stop words, this will make
# sure that some basic stop words and numerical values is been removed from the Benefits column.

# 2nd step is that there might be stop words present in upper case for example "This" is a stop words
# which was not removed in 1st step so I converted the Benefits column to lower case. Moreover we can only
# lowercase any sentence or entire column when that column is free of any numerical value.

# 3rd then further removing stopwords, in this way we get the whole benefits column free from stop words


from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_multiple_whitespaces
df['Benefits'] = df['Benefits'].apply(str.lower)
df['Benefits']=df.Benefits.apply(remove_stopwords)
df['Benefits']=df.Benefits.apply(strip_non_alphanum)
df['Benefits']=df.Benefits.apply(strip_numeric)
df['Benefits']=df.Benefits.apply(strip_multiple_whitespaces)
df['Asana']=df.Asana.apply(strip_multiple_whitespaces)
df['Asana'] = df['Asana'].apply(str.lower)
df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Asana,Benefits
0,padotthanasana,asana strengthens abdominal muscles massages o...
1,parvatasana,pose strengthens nerves muscles limbs back hel...
2,ardha titali asana,excellent preparatory practice loosening knee ...
3,gatyatmak meru vakrasana,asana removes stiffness increases flexibility ...
4,sideways viewing,sideways viewing relaxes tension muscles strai...
...,...,...
321,kapilasana,helps stretching legs hamstrings arms chest b...
322,omkarasana,relieves pain hands feet helpful rheumatism gi...
323,kashyapawsana,asana provides deep muscular massage abdomen ...
324,bhunamanasana,bhunamanasana stretches improves flexibility ...


## *We have further removed special characters and tokenized each row of benefits.*

In [6]:
# this will create the list of each benefit rows
benefits =df['Benefits'].apply(gensim.utils.simple_preprocess)
print(benefits)

0      [asana, strengthens, abdominal, muscles, massa...
1      [pose, strengthens, nerves, muscles, limbs, ba...
2      [excellent, preparatory, practice, loosening, ...
3      [asana, removes, stiffness, increases, flexibi...
4      [sideways, viewing, relaxes, tension, muscles,...
                             ...                        
321    [helps, stretching, legs, hamstrings, arms, ch...
322    [relieves, pain, hands, feet, helpful, rheumat...
323    [asana, provides, deep, muscular, massage, abd...
324    [bhunamanasana, stretches, improves, flexibili...
325    [stretches, strengthens, lengthens, mandalasan...
Name: Benefits, Length: 326, dtype: object


In [7]:
# prints the words present in the benefit row 1
benefits[0]

['asana',
 'strengthens',
 'abdominal',
 'muscles',
 'massages',
 'organs',
 'strengthens',
 'digestive',
 'system',
 'lower',
 'back',
 'pelvic',
 'perineal',
 'muscles',
 'helps',
 'correct',
 'prolapse']

## *Implemented Word2Vec with following features:*
* window size = 5
* minimum word cound for which the model can input row wise data = 2
* required cpu  threads to train the model = 4
* size of the required vector embedding = 50

In [10]:
model = gensim.models.Word2Vec(
    window=5,
    min_count=2,
    workers=4,
    vector_size=50,
)


## *Building vocabulary of unique words present in the entire benefit column*

In [11]:

model.build_vocab(benefits, progress_per=5)
# vocab_len = len(model.wv)
# print(vocab_len)

In [12]:
model.train(benefits, total_examples=model.corpus_count, epochs=2000)

(22225305, 29294000)

## *We can test our model for any words suppose say sciatica we will get the similar words in benefits*

In [13]:
#when we find the similar words for a disease in
#Benefits section we also see that we get the names of asanas also.
#like for this word we get similar words like chakrasana, marjariasana, shalabhasana which indicates
#that since we have taken these words from benefits section, it means that for this particular diseaes
#most prpbable asanas can be chakrasana, marjariasana, shalabhasana etc.  to cure it.
model.wv.most_similar("sciatica", topn= 100)

#print(type(model.wv.most_similar("sciatica", topn= 100)))

[('stiff', 0.43644189834594727),
 ('mild', 0.4100874364376068),
 ('pains', 0.3962039649486542),
 ('spondylitis', 0.38906794786453247),
 ('time', 0.37931740283966064),
 ('backache', 0.37880444526672363),
 ('benefit', 0.3751801550388336),
 ('rid', 0.36199551820755005),
 ('sitting', 0.340221643447876),
 ('treatment', 0.3363928198814392),
 ('problem', 0.3257470726966858),
 ('slipped', 0.32412511110305786),
 ('stress', 0.31095775961875916),
 ('on', 0.30969634652137756),
 ('marjariasana', 0.3091569244861603),
 ('relief', 0.3007253408432007),
 ('squat', 0.2997783422470093),
 ('advantage', 0.29869696497917175),
 ('sciatic', 0.2981896996498108),
 ('preparation', 0.2941749095916748),
 ('utkatasana', 0.29357436299324036),
 ('vajra', 0.29219236969947815),
 ('spinal', 0.2903304696083069),
 ('region', 0.28609660267829895),
 ('easily', 0.27754706144332886),
 ('poorna', 0.27749988436698914),
 ('disc', 0.27151963114738464),
 ('gently', 0.26491379737854004),
 ('lumbago', 0.2642703056335449),
 ('inside',

## *Printing one of the benefits column word's vector*

In [14]:
# suppose we want to know the vector embedding of a word 'pain' so we have printed the vector embedding of size 50
import numpy as np
print((model.wv.get_vector('pain')))

[-11.364991     3.1790798    2.8793757    2.7907574   -1.887808
   3.6024482   -2.335147    -4.433374     8.858542   -10.346805
   1.3788034   -4.434423     2.136297    -3.044669   -10.289217
  -2.6439059    7.0858006   -5.6713257    0.02115754  -7.4610877
  -7.259774    -2.5884888   -7.0183964   -1.4791648   -0.13604563
  -0.1646127    5.648745    -2.157266     4.09302     -0.51077527
  -4.9723096    3.2448938   -4.2145514   -4.772297    -0.45920232
   4.762087     2.8480294    6.9159274    7.2459917    7.5298457
   5.600863     1.4329433    3.730052    -0.4152324    0.41674584
   3.874206    -3.5041971   -1.4424531   -1.776113    -3.1058195 ]


In [15]:
# counts total number of rows in datasets being trained
model.corpus_count

326

In [17]:
from gensim.models import Word2Vec
# created list of unique words from the column Benefits.
words = list(model.wv.key_to_index.keys())


In [18]:
# printing unique words from the vocabulary list.
print(words)

['muscles', 'body', 'pose', 'helps', 'organs', 'asana', 'spine', 'improves', 'abdominal', 'hips', 'shoulders', 'strengthens', 'blood', 'practice', 'lower', 'balance', 'stretches', 'stretch', 'tones', 'chest', 'legs', 'flexibility', 'neck', 'good', 'arms', 'abdomen', 'posture', 'mind', 'nervous', 'back', 'flow', 'yoga', 'circulation', 'reproductive', 'pelvic', 'system', 'leg', 'hip', 'entire', 'stimulates', 'awareness', 'nerves', 'spinal', 'core', 'digestive', 'deep', 'improving', 'gives', 'heart', 'joints', 'pressure', 'digestion', 'chakra', 'benefits', 'increases', 'great', 'concentration', 'strength', 'upper', 'energy', 'constipation', 'sense', 'thighs', 'like', 'functioning', 'related', 'glands', 'region', 'toning', 'breathing', 'stretching', 'knees', 'especially', 'poses', 'toned', 'breath', 'ankles', 'hamstrings', 'area', 'massages', 'liver', 'internal', 'strong', 'focus', 'help', 'better', 'the', 'improve', 'pain', 'disorders', 'alignment', 'brain', 'stability', 'feet', 'flexible

In [19]:
# here we have the length of unique words

print(len(words))

1330


## *Created an empty dictionary at first to store the unique words as key words  along with its vector embeddings.*

In [20]:
dict_of_word_embeddings = dict({})
for i in words:
    dict_of_word_embeddings[i] = model.wv[i]

In [21]:
# on printing the dictionary, we get the following result.
print(dict_of_word_embeddings)

{'muscles': array([-2.5789182 ,  2.1639256 ,  0.28219488,  3.0015676 ,  0.0945513 ,
       -3.8629088 , -0.61405414,  0.26608396, -2.1694064 ,  0.5845034 ,
        1.8079447 ,  3.4841342 ,  0.00791073, -1.6008021 , -1.9591068 ,
        1.3356478 ,  3.171194  , -1.6937765 , -1.1308097 ,  1.8560693 ,
       -1.1408123 , -3.062013  ,  1.3373936 , -2.8183753 , -0.10887559,
        0.3465332 , -1.2267175 ,  1.8738321 ,  0.93297446,  0.6567295 ,
        0.8179274 , -1.5668067 , -1.7023661 ,  1.026951  , -1.6156228 ,
        2.555431  , -1.2150404 , -0.60990375, -0.5655717 , -0.5450255 ,
       -0.15024161, -1.3806183 , -1.1431798 , -0.56688815,  1.2075454 ,
        0.00492786,  1.8275678 , -2.0103154 ,  1.7549001 ,  1.360821  ],
      dtype=float32), 'body': array([-1.2291402 ,  1.3039935 , -1.4131582 ,  2.2873492 ,  2.3666086 ,
       -1.3320062 ,  1.5078304 ,  0.21080814,  2.4968584 ,  2.4263153 ,
        2.3427143 ,  0.14485592, -1.2415527 , -0.39936253, -0.67893124,
        0.5419228 ,  

## *Exporting dictionary data into separate excel sheet/ csv file*.

In [23]:
Unique_words = dict_of_word_embeddings.keys()
word_vectors  = dict_of_word_embeddings.values()
# print(asanas, word_vectors)
d = {'Unique_words' : Unique_words , 'Word_Vectors' : word_vectors}
dataframe = pd.DataFrame(data = d)
dataframe

Unnamed: 0,Unique_words,Word_Vectors
0,muscles,"[-2.5789182, 2.1639256, 0.28219488, 3.0015676,..."
1,body,"[-1.2291402, 1.3039935, -1.4131582, 2.2873492,..."
2,pose,"[-1.3233534, -0.13491178, -0.56944686, 1.32691..."
3,helps,"[0.79891104, -2.0545766, -0.23233588, 1.353395..."
4,organs,"[-2.9140625, -1.8864996, -2.3440092, 3.5934563..."
...,...,...
1325,butterfly,"[2.6314912, -0.2144185, 5.6519585, 0.6529799, ..."
1326,titli,"[-1.6047258, -0.7224632, 3.4799392, 2.44357, 4..."
1327,tolasana,"[3.7111309, -1.6483557, -6.518002, 2.800643, 0..."
1328,improvement,"[5.0909944, -0.22365838, 2.4058275, -2.2519302..."


## *We have made list of unique asanas as after merging excel sheets there were repeated asanas*

In [24]:
asanas = list(df['Asana'])
# total asanas present ( with repetition)
print(len(asanas))
asana = []


for x in asanas:
  if x not in asana:
    asana.append(x)
# total number of unique asanas
print(len(asana))
# list of unique asanas
print(asana)

326
293
['padotthanasana', 'parvatasana', 'ardha titali asana', 'gatyatmak meru vakrasana', 'sideways viewing', 'makarasana', 'padmasana', 'vajrasana', 'ardha chandrasana', 'yogamudrasana', 'bhujangasana', 'saithalyasana', 'bhu namanasana', 'sarvangasana', 'natarajasana', 'poorna bhujangasana', 'koormasana', 'poorna shalabhasana', 'poorna dhanurasana', 'bandha hasta utthanasana ', 'shava udarakarshanasana ', 'chakki chalanasana ', 'kashtha takshanasana ', 'vayu nishkasana', 'ushtrasana', 'samakonasana ', 'matsyasana', 'kandharasana', ' setu asana ', 'paschimottanasana', 'meru akarshanasana', 'pada hastasana', 'seetkari pranayama', 'jalandhara bandha', 'tadagi mudra', 'maha vedha mudra', 'shashankasana', 'janu chakra', 'poorna titali asana', 'manibandha chakra', 'skandha chakra', 'greeva sanchalana', 'padachakrasana', 'pada sanchalanasana', 'supta pawanmuktasana', 'jhulana lurhakanasana', 'supta udarakarshanasana', 'naukasana', 'rajju karshanasana', 'nauka sanchalanasana', 'namaskarasan

## *On-hot Encodded the asanas words and created a dictionary to store the asana word with its word embedding*

In [26]:
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(asana)
#print(integer_encoded)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)

### One hot encoding
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

# onehot_encoded

asan_dict={}

for i in range(len(asana)):
  asan_dict[asana[i]] = onehot_encoded[i]

print(asan_dict)


{'padotthanasana': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
      

In [27]:
from tensorflow import keras
from tensorflow.keras.layers import Dense

In [28]:
# printing the benefit words of one row.
print(benefits[0])

['asana', 'strengthens', 'abdominal', 'muscles', 'massages', 'organs', 'strengthens', 'digestive', 'system', 'lower', 'back', 'pelvic', 'perineal', 'muscles', 'helps', 'correct', 'prolapse']


## *Created a Tupple associating the benefits words of each row with the respective asana. In order to bring the associativity between benefits words and asana words*

In [29]:
pair=[]

i=0
a=len(asana)
for x in benefits:
  if(i<a):
    target=asana[i]
    for y in x:
      if(y not in words):
        continue
      pair.append((y,target))
  i+=1
print(pair)




[('asana', 'padotthanasana'), ('strengthens', 'padotthanasana'), ('abdominal', 'padotthanasana'), ('muscles', 'padotthanasana'), ('massages', 'padotthanasana'), ('organs', 'padotthanasana'), ('strengthens', 'padotthanasana'), ('digestive', 'padotthanasana'), ('system', 'padotthanasana'), ('lower', 'padotthanasana'), ('back', 'padotthanasana'), ('pelvic', 'padotthanasana'), ('muscles', 'padotthanasana'), ('helps', 'padotthanasana'), ('correct', 'padotthanasana'), ('prolapse', 'padotthanasana'), ('pose', 'parvatasana'), ('strengthens', 'parvatasana'), ('nerves', 'parvatasana'), ('muscles', 'parvatasana'), ('limbs', 'parvatasana'), ('back', 'parvatasana'), ('helps', 'parvatasana'), ('increase', 'parvatasana'), ('height', 'parvatasana'), ('stretching', 'parvatasana'), ('muscles', 'parvatasana'), ('ligaments', 'parvatasana'), ('enabling', 'parvatasana'), ('growing', 'parvatasana'), ('bones', 'parvatasana'), ('grow', 'parvatasana'), ('longer', 'parvatasana'), ('circulation', 'parvatasana'), 

## *Making a 2-D array of context words(benefit words) and Target words(asana words) by numpy stack*

In [30]:
contexts=[dict_of_word_embeddings[context] for context,target in pair]
contexts=np.vstack(contexts)
# shape of the context words matrix
contexts.shape


(12085, 50)

In [31]:
targets=[asan_dict[target] for context,target in pair]
targets=np.vstack(targets)
# shape of the target words matrix

targets.shape

(12085, 293)

## *Implemented **Artificial Neural Network**. with the help of tensorfow's keras functional API with the following features:*

* Number of input layer = 1
* Size of input layer = 50 unit
* Number of dense layer = 1
* size of dense layer = 1000 units
* Number of output layer = 1
* size of output unit = 293
* activation function = softmax
* loss = categorical_crossentropy
* optimizer = adam
* number of epotchs = 100

In [33]:
from tensorflow import keras
from tensorflow.keras.layers import Dense

network_input = keras.Input(shape=(contexts.shape[1],), name='input_layer')
                                                                                    # Create a hidden layer for the network; store under 'hidden_layer'
hidden_layer1 = Dense(units=1000, activation='sigmoid', name='hidden_layer1')(network_input)

                                                                                            # Create an output layer for the network; store under 'output_layer'
output_layer = Dense(units=targets.shape[1], activation='softmax', name='output_layer')(hidden_layer1)

                                                                          # Create a Keras Model; store under 'embedding_model'
embedding_model = keras.Model(inputs=network_input, outputs=output_layer)

                                                          # Compile the model for training; define loss function
embedding_model.compile(loss='categorical_crossentropy',optimizer = 'adam', metrics = ['accuracy'])

                                                          # Print out a summary of the model
embedding_model.summary()

In [34]:
# Fit a model to the data
embedding_model.fit(x=contexts,   # inputs
                    y=targets,   # outputs
                    batch_size=1024,  # how many pairs of words processed simultaneously
                    epochs=100,   # how many times we loop through the whole data
                    verbose=1   # do not print training status
                   )



Epoch 1/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 76ms/step - accuracy: 0.0124 - loss: 5.5624
Epoch 2/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0328 - loss: 5.1751  
Epoch 3/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0577 - loss: 4.9633 
Epoch 4/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0709 - loss: 4.8017 
Epoch 5/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0885 - loss: 4.6532 
Epoch 6/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0922 - loss: 4.5442 
Epoch 7/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1002 - loss: 4.4542 
Epoch 8/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1059 - loss: 4.3721 
Epoch 9/100
[1m12/12[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x792d15c05410>

## *Function which inputs the user details and suggests user with the most recommended asanas*

In [35]:
from collections import Counter
from IPython.display import clear_output

def magic():
  predicted_asanas = []
  user_input_words= []
  final_predicted_asanas = []
  number_in_words = ['first','second', 'third', 'fourth']
  for i in range(4):
    user_input_words.append(input(f"Enter {number_in_words[i]} benefit word:  "))
  for i in user_input_words:
    if i in dict_of_word_embeddings:

      input_array = np.expand_dims(dict_of_word_embeddings[i], axis=0)
      prediction = embedding_model.predict(input_array)
      flatten_pred = prediction.flatten()
      result_indices = flatten_pred.argsort()[-10:][::-1]

      for result in result_indices:
        predicted_asanas.append(asana[result])


  counter_found = Counter(predicted_asanas)
  final_predicted_asanas_with_freq = counter_found.most_common(7)
  # print(final_predicted_asanas_with_freq)

  for yoga, freq in final_predicted_asanas_with_freq:
    final_predicted_asanas.append(yoga)

  print(final_predicted_asanas)
  choice=input("Clear output: Y/N ")
  if choice=='Y':
   clear_output()


magic()

Enter first benefit word:  sciatica
Enter second benefit word:  pain
Enter third benefit word:  back
Enter fourth benefit word:  strengthen
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 256ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
['parivritti janu sirshasana', ' kapali asana', 'trataka', 'mandalasana', 'tiryak tadasana', 'tolangulasana', 'parivritta parsvakonasana']


KeyboardInterrupt: Interrupted by user