### **Mounting Google Drive to upload Datasets**
___


In [0]:
from google.colab import drive
drive.mount('/content/drive')

### **Importing the required libraries**
___

In [0]:
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
import nltk
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import one_hot

### **Loading the Dataset**
___

In [0]:
df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Mylo/train.csv", encoding = 'unicode_escape')

In [0]:
df = df.drop('post_id', axis = 1)

In [0]:
df.head()

Unnamed: 0,question,user_stage,tag
0,"Alhamdulilh, blessed with beautiful baby girl...",pregnant,Announcements&Celebrations
1,Breastfeeding mother ko chai pini chaiye usse ...,mother,General Baby
2,Hii all as I am preganent with twins on coming...,pregnant,General Pregnancy
3,Mujhy mild cramp sa feeling Hai lower main aur...,pregnant,General Pregnancy
4,Can I drink lion dates syrup,pregnant,Diet&Nutrition


### **Creating input and output feature Columns**

In [0]:
corpus = df.iloc[:,0].values
y = df.iloc[:,2].values

In [0]:
labelencoder = LabelEncoder()
onehotencoder = OneHotEncoder()
y = labelencoder.fit_transform(y)
y = y.reshape(-1,1)
y = onehotencoder.fit_transform(y).toarray()

**Using Word Embedding Method**

In [0]:
nltk.download('punkt')
all_words = []
for sent in corpus:
    tokenize_word = word_tokenize(sent)
    for word in tokenize_word:
        all_words.append(word)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
unique_words = set(all_words)
print(len(unique_words))

1216


In [0]:
vocab_length = len(unique_words)

In [0]:
embedded_sentences = [one_hot(sent, vocab_length) for sent in corpus]
print(embedded_sentences )

[[192, 433, 671, 646, 198, 501, 267], [1175, 1049, 4, 806, 248, 871, 843, 1197, 4, 877, 773], [1098, 625, 499, 756, 906, 822, 671, 972, 264, 785, 107, 456, 167, 454, 58, 574, 1145, 772, 870, 499, 263, 177, 705, 471, 574, 1039, 797, 371, 345, 574, 1145, 648, 499, 756, 1154, 219, 736, 582, 245, 175, 1065, 794, 116, 797, 1094, 882, 648, 167, 657, 1189, 278, 1175, 166, 574, 706, 736, 262, 278, 454, 58, 574, 706, 1024, 938, 1075, 538, 720, 54, 454, 398, 198, 1092, 1096, 643, 574, 805, 966, 948, 206, 756, 1199, 54, 197, 906, 16], [894, 729, 665, 857, 703, 774, 1194, 111, 1191, 614, 459, 165, 757, 813, 572, 46, 464, 564, 1070, 614, 95, 703, 459, 996, 167, 167, 837, 22, 986, 774], [622, 756, 84, 771, 667, 387], [266, 622, 6, 741, 1088, 463, 1130, 38, 756, 906, 831, 574, 1173, 279, 463, 885, 1130, 581, 221, 868, 398], [622, 756, 977, 77, 192], [756, 906, 264, 34, 1075, 299, 876, 46, 506, 265, 877, 299, 1017, 705, 574, 877, 284, 797, 962, 582, 794, 754, 930, 664, 611, 206], [756, 58, 1047, 1100,

In [0]:
word_count = lambda sentence: len(word_tokenize(sentence))
longest_sentence = max(corpus, key=word_count)
length_long_sentence = len(word_tokenize(longest_sentence))

In [0]:
padded_sentences = pad_sequences(embedded_sentences, length_long_sentence, padding='post')
print(padded_sentences)

[[ 192  433  671 ...    0    0    0]
 [1175 1049    4 ...    0    0    0]
 [1098  625  499 ...    0    0    0]
 ...
 [1140  293  103 ...    0    0    0]
 [  22 1017   44 ...    0    0    0]
 [ 756  165  687 ...    0    0    0]]


### **Adding Column *'user_stage'* in the input features *'padded_sentences'***

In [0]:
df_temp=pd.DataFrame(data=padded_sentences[0:,0:], index=[i for i in range(padded_sentences.shape[0])], columns=['f'+str(i) for i in range(padded_sentences.shape[1])])

In [0]:
df_temp['f' + str(padded_sentences.shape[1])] = df['user_stage']

In [0]:
df_temp.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,...,f120,f121,f122,f123,f124,f125,f126,f127,f128,f129,f130,f131,f132,f133,f134,f135,f136,f137,f138,f139,f140,f141,f142,f143,f144,f145,f146,f147,f148,f149,f150,f151,f152,f153,f154,f155,f156,f157,f158,f159
0,192,433,671,646,198,501,267,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,pregnant
1,1175,1049,4,806,248,871,843,1197,4,877,773,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,mother
2,1098,625,499,756,906,822,671,972,264,785,107,456,167,454,58,574,1145,772,870,499,263,177,705,471,574,1039,797,371,345,574,1145,648,499,756,1154,219,736,582,245,175,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,pregnant
3,894,729,665,857,703,774,1194,111,1191,614,459,165,757,813,572,46,464,564,1070,614,95,703,459,996,167,167,837,22,986,774,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,pregnant
4,622,756,84,771,667,387,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,pregnant


In [0]:
X = df_temp.iloc[:, :].values

In [0]:
le1 = LabelEncoder()
X[:, 159] = le1.fit_transform(X[:, 159])

In [0]:
new_length_long_sentence = length_long_sentence + 1 

### **Building Model**
___


In [0]:
model = Sequential()
model.add(Embedding(vocab_length, 50, input_length=new_length_long_sentence))
model.add(Flatten())
model.add(Dense(7, activation='softmax'))






In [0]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(model.summary())



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 160, 50)           60800     
_________________________________________________________________
flatten_1 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 56007     
Total params: 116,807
Trainable params: 116,807
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
model.fit(X, y, batch_size =  10, nb_epoch = 100)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



  """Entry point for launching an IPython kernel.




Epoch 1/100





Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
E

<keras.callbacks.History at 0x7f6915aefbe0>

In [0]:
loss, accuracy = model.evaluate(X, y)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 100.000000


### **Predicting Results for "test dataset"**
___


In [0]:
df_test = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Mylo/test.csv", encoding = 'unicode_escape')
df_test = df_test.drop('post_id', axis = 1)

In [0]:
corpus_test = df_test.iloc[:,0].values
nltk.download('punkt')
all_words_test = []
for sent in corpus_test:
    tokenize_word_test = word_tokenize(sent)
    for word_test in tokenize_word_test:
        all_words_test.append(word_test)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
embedded_sentences_test = [one_hot(sent, vocab_length) for sent in corpus_test]
print(embedded_sentences_test)

[[709, 574, 1183, 794, 1027, 1114], [756, 251, 1154, 736, 70, 1130, 764, 741, 889, 1065, 139, 756, 767, 574, 786, 948, 1158, 442, 387, 837, 697, 737, 134, 1159, 680, 400, 359, 464, 582, 461, 774, 95, 442, 163, 288, 498, 1151, 227, 526, 23, 989, 1078, 3, 621, 574, 40, 461, 774, 948, 688, 359, 288, 582, 461, 774, 797, 371, 279, 198, 564, 288, 852, 558, 510, 93, 998, 191, 442, 44, 1006, 333, 288, 116, 240, 774, 1097, 998, 1078, 574, 498, 213, 294], [326, 756, 906, 219, 1100, 407, 496, 1065, 756, 906, 462, 198, 664, 77, 1065, 284, 797, 200, 870, 756, 1154, 338, 600, 139, 204, 98, 574, 167, 326], [658, 1008, 1050, 937, 930, 703, 860, 165, 198], [313, 837, 262, 797, 767, 574, 43, 1047, 1211, 930, 353, 582, 851, 1192, 289, 154, 864, 116, 804, 99, 59, 253, 876, 341, 717], [216, 884, 156, 4, 390, 1118, 774, 398, 1101, 4, 44, 670, 279, 611, 1083, 502], [1147, 590, 70, 326, 235, 756, 706, 502, 1088, 398, 299, 326, 62, 387], [1140, 625, 946, 6, 1191, 574, 209, 1189, 794, 198, 410, 739, 524, 1130, 

In [0]:
padded_sentences_test = pad_sequences(embedded_sentences_test, length_long_sentence, padding='post')
print(padded_sentences_test)

[[ 709  574 1183 ...    0    0    0]
 [ 756  251 1154 ...    0    0    0]
 [ 326  756  906 ...    0    0    0]
 ...
 [ 830  797  219 ...    0    0    0]
 [ 122  671  198 ...    0    0    0]
 [ 709  574 1183 ...    0    0    0]]


In [0]:
df_temp_test=pd.DataFrame(data=padded_sentences_test[0:,0:], index=[i for i in range(padded_sentences_test.shape[0])], columns=['f'+str(i) for i in range(padded_sentences_test.shape[1])])
df_temp_test['f' + str(padded_sentences_test.shape[1])] = df_test['user_stage']
X_test = df_temp_test.iloc[:, :].values
X_test[:, 159] = le1.fit_transform(X_test[:, 159])

In [0]:
y_pred = model.predict(X_test)
y_pred_final = np.argmax(y_pred, axis=1)

In [0]:
y_pred_final

array([6, 3, 3, 4, 2, 6, 1, 2, 2, 6, 3, 6, 3, 6, 3, 2, 0, 5, 0, 3, 6, 6,
       2, 2, 2, 6, 0, 6, 4, 6, 6, 6, 0, 6])

In [0]:
y_result = labelencoder.inverse_transform(y_pred_final)
y_result=pd.DataFrame(data = y_result)
# Loading the dataset
df1 = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Mylo/test.csv", encoding = 'unicode_escape')
df1['tag'] = y_result

In [0]:
df1.head()

Unnamed: 0,post_id,question,user_stage,tag
0,75,Unable to upload my profile pic,pregnant,MyloSupport
1,90,I dont have any problm in morning whole day bu...,pregnant,General Pregnancy
2,94,Hi I am not conformed the pregnancy but I am e...,pregnant,General Pregnancy
3,102,Best fresh fruit juice for six months old baby,mother,Gossip
4,108,Which storage bag is good to store breast milk...,mother,General Baby


In [0]:
df1.to_csv("jain.shubham102.csv")