# ML model

In [32]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

In [33]:
import pickle
features=pickle.load(open("projectfeatures.pkl","rb"))
labels=pickle.load(open("projectlabels.pkl","rb"))

In [34]:
def Randomize(feature,label):
  global new_features,new_labels
  feature=np.array(feature)
  label=np.array(label)
  np.random.seed(42)
  new_index=np.random.permutation(len(feature))
  new_features=feature[new_index]
  new_labels=label[new_index]

Randomize(features,labels)

In [36]:
data=pd.DataFrame({"sentences":new_features,"class":new_labels})
data.head()

Unnamed: 0,sentences,class
0,list of streams in vishwatam,VWG-1
1,indala civil seats,IDC-4
2,dj sangvhi list of all courses,DJS-1
3,viva college BE electronics cutoff this year,VIVA-6
4,list of streams in rizwi,RW-1


In [61]:
from nltk.stem import WordNetLemmatizer
import re 

Lm=WordNetLemmatizer()

corpus=[]
for i in range(len(data["sentences"])):
  sentence=re.sub("[^a-zA-Z]"," ",data["sentences"][i])
  sentence=sentence.lower()
  sentence=sentence.split()
  sentence=[Lm.lemmatize(words) for words in sentence]
  sentence=" ".join(sentence)
  corpus.append(sentence)
corpus=np.array(corpus)
data["corpus"]=corpus

In [87]:
class_dic={code:i for i,code in enumerate(data["class"].unique())}
data["class"]=data["class"].map(class_dic)
data.head()

Unnamed: 0,sentences,class,corpus
0,list of streams in vishwatam,0,list of stream in vishwatam
1,indala civil seats,1,indala civil seat
2,dj sangvhi list of all courses,2,dj sangvhi list of all course
3,viva college BE electronics cutoff this year,3,viva college be electronics cutoff this year
4,list of streams in rizwi,4,list of stream in rizwi


In [164]:
from imblearn.over_sampling import RandomOverSampler

oversample=RandomOverSampler(sampling_strategy="all")
embedded_words_res,Y_data_res=oversample.fit_resample(data["corpus"].values.reshape(-1,1),data["class"].values.reshape(-1,1))

In [201]:
df1=pd.DataFrame(embedded_words_res)
df2=pd.DataFrame(Y_data_res)
frames=[df1,df2]
df0=pd.concat(frames,axis=1)
df0.columns=["features","labels"]
df0

Unnamed: 0,features,labels
0,list of stream in vishwatam,0
1,indala civil seat,1
2,dj sangvhi list of all course,2
3,viva college be electronics cutoff this year,3
4,list of stream in rizwi,4
...,...,...
6299,vidyavardhini civil seat,393
6300,civil seat in vidyavardhini,393
6301,vidyavardhini civil seat,393
6302,vidyavardhini be civil cutoff this year,393


In [202]:
x_train,x_test,y_train,y_test=train_test_split(df0["features"],df0["labels"], test_size=0.25, random_state=7, shuffle=True)
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.75)

In [203]:
vec_train=tfidf_vectorizer.fit_transform(x_train.values.astype('U'))
vec_test=tfidf_vectorizer.transform(x_test.values.astype('U'))

In [231]:
model=MultinomialNB(alpha=0.0002)
model.fit(vec_train,y_train)
score=cross_val_score(model,vec_train,y_train,cv=10,n_jobs=-1,scoring='accuracy',verbose=3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.6s finished


In [236]:
print("accuracy_croos_val",score.mean())

accuracy_croos_val 0.9509325617228651


In [237]:
y=model.predict(vec_test)
from sklearn.metrics import accuracy_score
print(f"accuracy: {accuracy_score(y_test,y)}")

accuracy: 0.9416243654822335


In [238]:
while True:
    sent=input("type anything: ")
    if sent=="1":
        break
    else:
        sent=[sent]
        sent=np.array(sent).astype('U')
        v=tfidf_vectorizer.transform(sent)
        print(v)
        print(v.shape)
        y=model.predict(v)
        print(y)
        for i in y:
            print(list(class_dic)[i])

type anything: hi can you help me plzz
  (0, 48)	1.0
(1, 128)
[336]
greeting
type anything: show me courses offered in saboo siddik college
  (0, 98)	0.5752697681994885
  (0, 89)	0.5752697681994885
  (0, 74)	0.547274753138529
  (0, 16)	0.1965190376746622
(1, 128)
[261]
MHSS-1
type anything: what are the courses in tasgaonkar collge
  (0, 103)	1.0
(1, 128)
[234]
TSG-6
type anything: what are courses offered in tasgaonkar college
  (0, 103)	0.7140093903267133
  (0, 74)	0.6589408624990379
  (0, 16)	0.23661684271052885
(1, 128)
[238]
TSG-1
type anything: instrumental cut off of smt indira gandhi
  (0, 100)	0.4999979763940827
  (0, 52)	0.4806858776285225
  (0, 51)	0.4999979763940827
  (0, 39)	0.43726233086664074
  (0, 20)	0.27883111063368493
(1, 128)
[10]
SMT-3
type anything: sies Btech production cutoff this year
  (0, 127)	0.2916187112302949
  (0, 99)	0.5182143653065817
  (0, 80)	0.523096912159069
  (0, 21)	0.2916187112302949
  (0, 13)	0.536414528699874
(1, 128)
[389]
SIES-3
type anything

In [239]:
filename="projecttfidf.pkl"
pickle.dump(tfidf_vectorizer,open(filename,"wb"))

filename="projectmodel.pkl"
pickle.dump(model,open(filename,"wb"))