# DL model 

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,Flatten,Activation
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
import pickle
features=pickle.load(open("projectfeatures.pkl","rb"))
labels=pickle.load(open("projectlabels.pkl","rb"))

In [3]:
def Randomize(feature,label):
  global new_features,new_labels
  feature=np.array(feature)
  label=np.array(label)
  np.random.seed(42)
  new_index=np.random.permutation(len(feature))
  new_features=feature[new_index]
  new_labels=label[new_index]

Randomize(features,labels)

In [4]:
data=pd.DataFrame({"sentences":new_features,"class":new_labels})
data.head()

Unnamed: 0,sentences,class
0,list of streams in vishwatam,VWG-1
1,indala civil seats,IDC-4
2,dj sangvhi list of all courses,DJS-1
3,viva college BE electronics cutoff this year,VIVA-6
4,list of streams in rizwi,RW-1


In [5]:
import nltk
from nltk.stem import WordNetLemmatizer
import re 

Lm=WordNetLemmatizer()

corpus=[]
for i in range(len(data["sentences"])):
  sentence=re.sub("[^a-zA-Z]"," ",data["sentences"][i])
  sentence=sentence.lower()
  sentence=sentence.split()
  sentence=[Lm.lemmatize(words) for words in sentence]
  sentence=" ".join(sentence)
  corpus.append(sentence)

In [6]:
class_dic={code:i for i,code in enumerate(data["class"].unique())}
class_dic

{'VWG-1': 0,
 'IDC-4': 1,
 'DJS-1': 2,
 'VIVA-6': 3,
 'RW-1': 4,
 'FRC-4': 5,
 'GVA-4': 6,
 'WTM-4': 7,
 'VL-3': 8,
 'STJN-3': 9,
 'SMT-3': 10,
 'BLM-2': 11,
 'TSG-2': 12,
 'SP-6': 13,
 'VSHW-5': 14,
 'KGC-6': 15,
 'TERNA-2': 16,
 'TERNA-5': 17,
 'GMV-3': 18,
 'STW-6': 19,
 'STW-7': 20,
 'TERNA-4': 21,
 'MHSS-4': 22,
 'SARASKHAR-5': 23,
 'RJSH-5': 24,
 'VPM-4': 25,
 'THN-5': 26,
 'VES-6': 27,
 'TSG-3': 28,
 'SSJ-1': 29,
 'SP-8': 30,
 'VL-5': 31,
 'VJTI-4': 32,
 'KC-1': 33,
 'DBS-1': 34,
 'DM-1': 35,
 'SSJ-5': 36,
 'DBS-4': 37,
 'LRT-3': 38,
 'DJS-8': 39,
 'VIDYAWARDHINI-7': 40,
 'MGM-8': 41,
 'VWG-2': 42,
 'RJM-6': 43,
 'VPM-3': 44,
 'ARMT-4': 45,
 'TC-4': 46,
 'MHSS-6': 47,
 'TC-8': 48,
 'LLW-2': 49,
 'SP-2': 50,
 'PC-2': 51,
 'GMV-2': 52,
 'U-2': 53,
 'PRA-5': 54,
 'FNL-7': 55,
 'IDC-1': 56,
 'TC-5': 57,
 'MHSS-7': 58,
 'BLM-1': 59,
 'FRC-1': 60,
 'VIVA-1': 61,
 'RG-5': 62,
 'THEM-3': 63,
 'VES-1': 64,
 'THEM-4': 65,
 'MGM-6': 66,
 'JND-7': 67,
 'Automobile': 68,
 'DLP-6': 69,
 'RJM-

In [7]:
data["class_code"]=data["class"].map(class_dic)
data.head()

Unnamed: 0,sentences,class,class_code
0,list of streams in vishwatam,VWG-1,0
1,indala civil seats,IDC-4,1
2,dj sangvhi list of all courses,DJS-1,2
3,viva college BE electronics cutoff this year,VIVA-6,3
4,list of streams in rizwi,RW-1,4


In [35]:
data["class_code"].value_counts().sort()

105     2
204     3
306     5
114     5
293     5
       ..
64     12
126    12
118    12
258    12
80     16
Name: class_code, Length: 394, dtype: int64

In [23]:
vocab_size=5000

one_hot_object=[one_hot(words,vocab_size) for words in corpus]
one_hot_object

[[550, 1449, 4274, 2738, 2156],
 [4738, 3113, 2257],
 [2194, 3834, 550, 1449, 2137, 3039],
 [4150, 2635, 4841, 1267, 3482, 4371, 4704],
 [550, 1449, 4274, 2738, 4219],
 [3186, 2257, 2738, 2256],
 [940, 3624, 4841, 3113, 3482, 4371, 4704],
 [2185, 2635, 2526, 2257],
 [4787, 2635, 4841, 4089, 3482, 4371, 4704],
 [1714, 1085, 2635, 2526, 2257],
 [3236, 2257, 2738, 529, 1957, 2498],
 [176, 4431, 4254, 1449, 2046],
 [4738, 2635, 3113, 2257],
 [3545, 2635, 4841, 4089, 3482, 4371, 4704],
 [4953, 1291, 2635, 1267, 2257],
 [2526, 2257, 2738, 2591],
 [4089, 4431, 4254, 1449, 705, 392],
 [176, 1293, 2738, 3866, 2635],
 [3866, 4841, 1736, 3482, 4371, 4704],
 [4089, 2257, 2738, 379, 2664],
 [1714, 3634, 4841, 1736, 3482, 4371, 4704],
 [1714, 3634, 2526, 2257],
 [3866, 2526, 2257],
 [4089, 2257, 2738, 4298, 3627],
 [4361, 2635, 2526, 2257],
 [3935, 1380, 2635, 4841, 1267, 3482, 4371, 4704],
 [3113, 2257, 2738, 276],
 [1236, 4841, 2526, 3482, 4371, 4704],
 [1869, 1736, 2257],
 [176, 1293, 2738, 3545,

In [24]:
sent_length=12
embedded_words=pad_sequences(
    sequences=one_hot_object,
    maxlen=sent_length,
    padding="pre"
)
print(len(embedded_words))

3813


In [25]:
embedding_vector_features=40
model=Sequential()

model.add(Embedding(
      input_length=sent_length,
      input_dim=vocab_size,
      output_dim=embedding_vector_features
  ))

model.add(LSTM(
      units=222
  ))
  
model.add(Flatten())
  
model.add(Dense(
      units=204
  ))
  
model.add(Dense(
      units=394,
      activation="softmax"
  ))

model.compile(
      optimizer=tf.keras.optimizers.Adam(0.001),
      loss="sparse_categorical_crossentropy",
      metrics=["accuracy"]
  )

In [51]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 12, 40)            200000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 222)               233544    
_________________________________________________________________
flatten_1 (Flatten)          (None, 222)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 204)               45492     
_________________________________________________________________
dense_3 (Dense)              (None, 394)               80770     
Total params: 559,806
Trainable params: 559,806
Non-trainable params: 0
_________________________________________________________________


In [None]:
from imblearn.over_sampling import RandomOverSampler

oversample=RandomOverSampler()
embedded_words_res,Y_data_res=oversample.fit_resample(embedded_words,Y_data,strategy="all")

In [27]:

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(embedded_words_res,Y_data_res,test_size=0.2,random_state=42)

In [28]:
model.fit(
    X_train,
    Y_train,
    validation_split=0.15,
    epochs=30
)

Train on 4286 samples, validate on 757 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x2b47a080>

In [29]:
results=model.evaluate(X_test,Y_test,batch_size=30)
print("test accuracy",results)



test accuracy [0.11584561618600184, 0.9563838]


In [30]:
results=model.evaluate(X_train,Y_train,batch_size=35)
print("train accuracy",results)



train accuracy [0.052467694990039566, 0.9726353]


In [53]:
model.save("newprojectmodel123.h5")

In [46]:

def Process(sent):
        sent_length=12
        vocab_size1=5000
        review1=re.sub("[^a-zA-Z]"," ",sent)
        review1=[review1.lower()]
        print(review1)
        reviews1=[Lm.lemmatize(words) for words in review1]
        print(reviews1)
        one_hot_words1=[one_hot(words,vocab_size1) for words in reviews1]
        embedded_sent1=pad_sequences(one_hot_words1,padding="pre",maxlen=sent_length)
        print(embedded_sent1)
        return embedded_sent1


if __name__=="__main__":
    model_test=modelload_model("newprojectmodel123.h5")
    while True:
        sent=input("type:")
        if sent=="1":
            break
        else:            
            n=Process(sent)
            y=model_test.predict(n)
            print(np.argmax(y))
            print(list(class_dic)[np.argmax(y)])

type:which courses are there in universal college
['which courses are there in universal college']
['which courses are there in universal college']
[[   0    0    0    0    0 3505 3796 3095 2566 2738  627 2635]]
131
U-1
type:1


In [21]:
filename="process.pkl"
pickle.dump(Process,open(filename,"wb"))