## *Importing required libraries*

In [19]:
import pandas as pd
import gensim

## *Merged 2 excel sheets datasets, in total we have used 329 asanas benefits*

In [20]:
df = pd.read_csv('test.csv')

In [21]:
df

Unnamed: 0,Asana,Benefits
0,PADOTTHANASANA,This asana strengthens\nthe abdominal muscles ...
1,PARVATASANA,This pose strengthens the nerves and muscles i...
2,ARDHA TITALI ASANA,This is an excellent \npreparatory practice fo...
3,GATYATMAK MERU \nVAKRASANA,This asana removes stiffness \nof the back and...
4,SIDEWAYS VIEWING,Sideways viewing relaxes the \ntension of the ...
...,...,...
321,kapilasana,"1. Helps in stretching legs, hamstrings, arms,..."
322,omkarasana,It relieves the pain of hands and feet( It is ...
323,kashyapawsana,1. This asana provides a deep muscular massage...
324,bhunamanasana,1. Bhunamanasana stretches and improves the fl...


In [22]:
#shape of the data frame
df.shape

(326, 2)

## *Removing Stop words at first and converting it into lower cases so that all stop words can be remove which were in Upper cases.*


In [23]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
#  1st step for removing stop words is to use the library and remove stop words, this will make 
# sure that some basic stop words and numerical values is been removed from the Benefits column.

# 2nd step is that there might be stop words present in upper case for example "This" is a stop words 
# which was not removed in 1st step so I converted the Benefits column to lower case. Moreover we can only  
# lowercase any sentence or entire column when that column is free of any numerical value.

# 3rd then further removing stopwords, in this way we get the whole benefits column free from stop words


from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_multiple_whitespaces
df['Benefits'] = df['Benefits'].apply(str.lower)
df['Benefits']=df.Benefits.apply(remove_stopwords)
df['Benefits']=df.Benefits.apply(strip_non_alphanum)
df['Benefits']=df.Benefits.apply(strip_numeric)
df['Benefits']=df.Benefits.apply(strip_multiple_whitespaces)
df['Asana']=df.Asana.apply(strip_multiple_whitespaces)
df['Asana'] = df['Asana'].apply(str.lower)
df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Asana,Benefits
0,padotthanasana,asana strengthens abdominal muscles massages o...
1,parvatasana,pose strengthens nerves muscles limbs back hel...
2,ardha titali asana,excellent preparatory practice loosening knee ...
3,gatyatmak meru vakrasana,asana removes stiffness increases flexibility ...
4,sideways viewing,sideways viewing relaxes tension muscles strai...
...,...,...
321,kapilasana,helps stretching legs hamstrings arms chest b...
322,omkarasana,relieves pain hands feet helpful rheumatism gi...
323,kashyapawsana,asana provides deep muscular massage abdomen ...
324,bhunamanasana,bhunamanasana stretches improves flexibility ...


## *We have further removed special characters and tokenized each row of benefits.* 

In [24]:
# this will create the list of each benefit rows
benefits =df['Benefits'].apply(gensim.utils.simple_preprocess)
print(benefits)

0      [asana, strengthens, abdominal, muscles, massa...
1      [pose, strengthens, nerves, muscles, limbs, ba...
2      [excellent, preparatory, practice, loosening, ...
3      [asana, removes, stiffness, increases, flexibi...
4      [sideways, viewing, relaxes, tension, muscles,...
                             ...                        
321    [helps, stretching, legs, hamstrings, arms, ch...
322    [relieves, pain, hands, feet, helpful, rheumat...
323    [asana, provides, deep, muscular, massage, abd...
324    [bhunamanasana, stretches, improves, flexibili...
325    [stretches, strengthens, lengthens, mandalasan...
Name: Benefits, Length: 326, dtype: object


In [25]:
# prints the words present in the benefit row 1
benefits[0]

['asana',
 'strengthens',
 'abdominal',
 'muscles',
 'massages',
 'organs',
 'strengthens',
 'digestive',
 'system',
 'lower',
 'back',
 'pelvic',
 'perineal',
 'muscles',
 'helps',
 'correct',
 'prolapse']

## *Implemented Word2Vec with following features:*
* window size = 5
* minimum word cound for which the model can input row wise data = 2
* required cpu  threads to train the model = 4
* size of the required vector embedding = 50

In [26]:

model = gensim.models.Word2Vec(
    window=5,
    min_count=2,
    workers=4,
    size = 50,
)


## *Building vocabulary of unique words present in the entire benefit column*

In [27]:

model.build_vocab(benefits, progress_per=5)
# vocab_len = len(model.wv)
# print(vocab_len)

In [28]:
model.train(benefits, total_examples=model.corpus_count, epochs=2000)

(22225305, 29294000)

## *We can test our model for any words suppose say sciatica we will get the similar words in benefits* 

In [29]:
#when we find the similar words for a disease in 
#Benefits section we also see that we get the names of asanas also.
#like for this word we get similar words like chakrasana, marjariasana, shalabhasana which indicates
#that since we have taken these words from benefits section, it means that for this particular diseaes
#most prpbable asanas can be chakrasana, marjariasana, shalabhasana etc.  to cure it.
model.wv.most_similar("sciatica", topn= 100)

#print(type(model.wv.most_similar("sciatica", topn= 100)))

[('mild', 0.509691596031189),
 ('sciatic', 0.42016956210136414),
 ('stiff', 0.4091465175151825),
 ('therapeutically', 0.40894514322280884),
 ('time', 0.3828990161418915),
 ('lung', 0.37667182087898254),
 ('insomnia', 0.3739498257637024),
 ('lumbago', 0.37372151017189026),
 ('treatment', 0.35769927501678467),
 ('spinal', 0.35520556569099426),
 ('vajra', 0.35468417406082153),
 ('backache', 0.3535345196723938),
 ('sleep', 0.3492906093597412),
 ('discs', 0.348525732755661),
 ('passive', 0.3420049548149109),
 ('relaxes', 0.3387145400047302),
 ('disc', 0.3372121751308441),
 ('calves', 0.3275812268257141),
 ('spondylitis', 0.3229600787162781),
 ('asana', 0.31548911333084106),
 ('eyeball', 0.3148905336856842),
 ('intensity', 0.30996865034103394),
 ('corrects', 0.30729517340660095),
 ('heals', 0.3072410821914673),
 ('heels', 0.3067161440849304),
 ('relaxing', 0.2990413308143616),
 ('certain', 0.29722124338150024),
 ('viewing', 0.2960418462753296),
 ('rounded', 0.2866533696651459),
 ('mobility',

## *Printing one of the benefits column word's vector*

In [30]:
# suppose we want to know the vector embedding of a word 'pain' so we have printed the vector embedding of size 50
import numpy as np
print((model.wv.get_vector('pain')))

[-2.657207   -5.3306184  -2.5291793   2.3216712   3.2551565  -1.2878358
  1.8020716   4.6101656   6.881413    0.41921583 -2.7873347  -4.961429
 -4.731342   -4.070579   -0.19011754 -0.68716663 13.895351    0.26979095
  0.5524215  -2.9804509   6.483268    1.2062806  -6.5132995   6.8649564
  3.766424   -5.769363   -4.0692024   5.9204144   5.7346525  -1.0251939
 -0.8622066   1.5009724   6.0800653   0.13556302  1.368434   -2.0390646
 -3.8611789   8.715522    7.542811    0.88340485 -4.65341     1.5053214
  0.17831945  1.2804779   3.5438156  -4.523641   -3.736478   10.50533
 -4.4181476   9.127459  ]


In [31]:
# counts total number of rows in datasets being trained
model.corpus_count

326

In [32]:
from gensim.models import Word2Vec
# created list of unique words from the column Benefits. 
words = list(w for w in model.wv.vocab)


In [33]:
# printing unique words from the vocabulary list.
print(words)

['asana', 'strengthens', 'abdominal', 'muscles', 'massages', 'organs', 'digestive', 'system', 'lower', 'back', 'pelvic', 'helps', 'correct', 'prolapse', 'pose', 'nerves', 'limbs', 'increase', 'height', 'stretching', 'ligaments', 'enabling', 'growing', 'bones', 'grow', 'longer', 'circulation', 'stimulated', 'especially', 'upper', 'spine', 'shoulder', 'blades', 'excellent', 'preparatory', 'practice', 'loosening', 'knee', 'hip', 'joints', 'meditative', 'poses', 'people', 'sit', 'legged', 'practise', 'ardha', 'daily', 'morning', 'removes', 'stiffness', 'increases', 'flexibility', 'sideways', 'viewing', 'relaxes', 'tension', 'close', 'work', 'prevents', 'corrects', 'effective', 'suffering', 'slipped', 'disc', 'sciatica', 'certain', 'pain', 'remain', 'extended', 'periods', 'time', 'encourages', 'column', 'resume', 'normal', 'shape', 'releases', 'compression', 'spinal', 'lung', 'ailments', 'simple', 'regularly', 'breath', 'awareness', 'allows', 'air', 'lungs', 'padmasana', 'body', 'held', 'co

In [34]:
# here we have the length of unique words

print(len(words))

1330


## *Created an empty dictionary at first to store the unique words as key words  along with its vector embeddings.*

In [35]:
dict_of_word_embeddings = dict({})
for i in words:
    dict_of_word_embeddings[i] = model.wv[i]

In [36]:
# on printing the dictionary, we get the following result.
print(dict_of_word_embeddings)

{'asana': array([-2.163333  ,  0.5466037 , -2.1510348 , -3.4011986 , -0.5919765 ,
        0.39208984, -2.0570285 , -1.3823998 ,  0.16649637, -0.7450882 ,
        3.4214804 , -1.9632583 ,  2.7653549 ,  0.16144814, -2.1197274 ,
        0.6843712 ,  1.0596869 , -0.7324595 ,  0.36118045,  0.98641247,
       -0.9341465 , -1.2980409 ,  0.00600418, -1.7086535 , -1.7443091 ,
        1.1769068 , -0.17124102,  1.3828862 , -2.129091  , -3.464459  ,
       -0.593108  ,  0.9324136 ,  3.1409686 , -0.11192907, -1.1271609 ,
        0.24242683,  1.1180729 ,  1.0981237 ,  2.9780755 , -4.8163133 ,
        0.4407236 ,  1.2232382 , -0.8504027 ,  0.171244  , -2.4682841 ,
        2.077756  ,  1.4232829 ,  1.6479242 , -1.2355232 , -0.12048768],
      dtype=float32), 'strengthens': array([-2.9988804 , -2.6252797 , -3.5853152 , -3.6666937 , -0.69188714,
       -1.5764053 , -1.7690184 , -1.4073198 , -0.9547447 ,  1.6433693 ,
        3.126042  , -0.8285296 , -0.21244632,  0.8360189 ,  0.02579051,
       -0.521805

## *Exporting dictionary data into separate excel sheet/ csv file*.

In [37]:
Unique_words = dict_of_word_embeddings.keys()
word_vectors  = dict_of_word_embeddings.values()
# print(asanas, word_vectors)
d = {'Unique_words' : Unique_words , 'Word_Vectors' : word_vectors}
dataframe = pd.DataFrame(data = d)
dataframe

Unnamed: 0,Unique_words,Word_Vectors
0,asana,"[-2.163333, 0.5466037, -2.1510348, -3.4011986,..."
1,strengthens,"[-2.9988804, -2.6252797, -3.5853152, -3.666693..."
2,abdominal,"[-5.5133376, -1.605097, -1.8853691, 0.8624429,..."
3,muscles,"[-2.566046, -1.2490188, -4.498116, -0.69691247..."
4,massages,"[-3.048516, 2.793287, -8.5561695, -3.36328, -5..."
...,...,...
1325,nectar,"[4.956446, -3.4838462, 2.2951546, 2.3540275, -..."
1326,trataka,"[-4.147821, 7.664687, 0.48258972, -4.7921586, ..."
1327,buttock,"[0.78801036, 0.9004252, 3.6735587, 2.4885817, ..."
1328,mandalasana,"[4.750417, 0.34923574, 5.1676593, -3.0388665, ..."


## *We have made list of unique asanas as after merging excel sheets there were repeated asanas*

In [39]:
asanas = list(df['Asana'])
# total asanas present ( with repetition)
print(len(asanas))
asana = []
      

for x in asanas:
  if x not in asana:
    asana.append(x)
# total number of unique asanas
print(len(asana))    
# list of unique asanas
print(asana)

326
293
['padotthanasana', 'parvatasana', 'ardha titali asana', 'gatyatmak meru vakrasana', 'sideways viewing', 'makarasana', 'padmasana', 'vajrasana', 'ardha chandrasana', 'yogamudrasana', 'bhujangasana', 'saithalyasana', 'bhu namanasana', 'sarvangasana', 'natarajasana', 'poorna bhujangasana', 'koormasana', 'poorna shalabhasana', 'poorna dhanurasana', 'bandha hasta utthanasana ', 'shava udarakarshanasana ', 'chakki chalanasana ', 'kashtha takshanasana ', 'vayu nishkasana', 'ushtrasana', 'samakonasana ', 'matsyasana', 'kandharasana', ' setu asana ', 'paschimottanasana', 'meru akarshanasana', 'pada hastasana', 'seetkari pranayama', 'jalandhara bandha', 'tadagi mudra', 'maha vedha mudra', 'shashankasana', 'janu chakra', 'poorna titali asana', 'manibandha chakra', 'skandha chakra', 'greeva sanchalana', 'padachakrasana', 'pada sanchalanasana', 'supta pawanmuktasana', 'jhulana lurhakanasana', 'supta udarakarshanasana', 'naukasana', 'rajju karshanasana', 'nauka sanchalanasana', 'namaskarasan

## *On-hot Encodded the asanas words and created a dictionary to store the asana word with its word embedding*

In [40]:
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(asana)
#print(integer_encoded)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)

### One hot encoding
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

# onehot_encoded

asan_dict={}

for i in range(len(asana)):
  asan_dict[asana[i]] = onehot_encoded[i]

print(asan_dict) 


{'padotthanasana': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
      

In [41]:
from tensorflow import keras
from tensorflow.keras.layers import Dense

In [42]:
# printing the benefit words of one row.
print(benefits[0])

['asana', 'strengthens', 'abdominal', 'muscles', 'massages', 'organs', 'strengthens', 'digestive', 'system', 'lower', 'back', 'pelvic', 'perineal', 'muscles', 'helps', 'correct', 'prolapse']


## *Created a Tupple associating the benefits words of each row with the respective asana. In order to bring the associativity between benefits words and asana words*

In [43]:
pair=[]

i=0
a=len(asana)
for x in benefits:
  if(i<a):
    target=asana[i]
    for y in x:
      if(y not in words):
        continue
      pair.append((y,target))
  i+=1  
print(pair)




[('asana', 'padotthanasana'), ('strengthens', 'padotthanasana'), ('abdominal', 'padotthanasana'), ('muscles', 'padotthanasana'), ('massages', 'padotthanasana'), ('organs', 'padotthanasana'), ('strengthens', 'padotthanasana'), ('digestive', 'padotthanasana'), ('system', 'padotthanasana'), ('lower', 'padotthanasana'), ('back', 'padotthanasana'), ('pelvic', 'padotthanasana'), ('muscles', 'padotthanasana'), ('helps', 'padotthanasana'), ('correct', 'padotthanasana'), ('prolapse', 'padotthanasana'), ('pose', 'parvatasana'), ('strengthens', 'parvatasana'), ('nerves', 'parvatasana'), ('muscles', 'parvatasana'), ('limbs', 'parvatasana'), ('back', 'parvatasana'), ('helps', 'parvatasana'), ('increase', 'parvatasana'), ('height', 'parvatasana'), ('stretching', 'parvatasana'), ('muscles', 'parvatasana'), ('ligaments', 'parvatasana'), ('enabling', 'parvatasana'), ('growing', 'parvatasana'), ('bones', 'parvatasana'), ('grow', 'parvatasana'), ('longer', 'parvatasana'), ('circulation', 'parvatasana'), 

## *Making a 2-D array of context words(benefit words) and Target words(asana words) by numpy stack*

In [46]:
contexts=[dict_of_word_embeddings[context] for context,target in pair]
contexts=np.vstack(contexts)
# shape of the context words matrix
contexts.shape


(12085, 50)

In [47]:
targets=[asan_dict[target] for context,target in pair]
targets=np.vstack(targets)
# shape of the target words matrix

targets.shape

(12085, 293)

## *Implemented **Artificial Neural Network**. with the help of tensorfow's keras functional API with the following features:*

* Number of input layer = 1
* Size of input layer = 50 unit
* Number of dense layer = 1
* size of dense layer = 1000 units
* Number of output layer = 1
* size of output unit = 293
* activation function = softmax
* loss = categorical_crossentropy
* optimizer = adam
* number of epotchs = 100

In [48]:
from tensorflow import keras
from tensorflow.keras.layers import Dense

network_input = keras.Input(shape=contexts.shape[1], name='input_layer')
                                                                                    # Create a hidden layer for the network; store under 'hidden_layer'
hidden_layer1 = Dense(units=1000, activation='sigmoid', name='hidden_layer1')(network_input)

                                                                                            # Create an output layer for the network; store under 'output_layer'
output_layer = Dense(units=targets.shape[1], activation='softmax', name='output_layer')(hidden_layer1)

                                                                          # Create a Keras Model; store under 'embedding_model'
embedding_model = keras.Model(inputs=network_input, outputs=output_layer)

                                                          # Compile the model for training; define loss function
embedding_model.compile(loss='categorical_crossentropy',optimizer = 'adam', metrics = ['accuracy'])

                                                          # Print out a summary of the model
embedding_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 50)]              0         
                                                                 
 hidden_layer1 (Dense)       (None, 1000)              51000     
                                                                 
 output_layer (Dense)        (None, 293)               293293    
                                                                 
Total params: 344,293
Trainable params: 344,293
Non-trainable params: 0
_________________________________________________________________


In [49]:
# Fit a model to the data
embedding_model.fit(x=contexts,   # inputs
                    y=targets,   # outputs
                    batch_size=1024,  # how many pairs of words processed simultaneously
                    epochs=100,   # how many times we loop through the whole data
                    verbose=1   # do not print training status
                   )



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f39f03d6090>

## *Function which inputs the user details and suggests user with the most recommended asanas*

In [50]:
from collections import Counter
from IPython.display import clear_output

def magic():
  predicted_asanas = []
  user_input_words= []
  final_predicted_asanas = []
  number_in_words = ['first','second', 'third', 'fourth']
  for i in range(4):
    user_input_words.append(input(f"Enter {number_in_words[i]} benefit word:  "))
  for i in user_input_words:
    if i in dict_of_word_embeddings:

      input_array = np.expand_dims(dict_of_word_embeddings[i], axis=0)
      prediction = embedding_model.predict(input_array)
      flatten_pred = prediction.flatten()
      result_indices = flatten_pred.argsort()[-10:][::-1]
    
      for result in result_indices:
        predicted_asanas.append(asana[result])
    
    
  counter_found = Counter(predicted_asanas)
  final_predicted_asanas_with_freq = counter_found.most_common(7)
  # print(final_predicted_asanas_with_freq)

  for yoga, freq in final_predicted_asanas_with_freq:
    final_predicted_asanas.append(yoga)
  
  print(final_predicted_asanas)
  choice=input("Clear output: Y/N ")
  if choice=='Y':
   clear_output()
  
  
magic()

Enter first benefit word:  sciatica
Enter second benefit word:  pain
Enter third benefit word:  back
Enter fourth benefit word:  strengthen
['mandalasana', 'parivritti janu sirshasana', ' kapali asana', 'kati chakrasana', ' moolabandhasana', 'dwi hasta bhujasana', 'utthita lolasana']
Clear output: Y/N N
