In [41]:
# https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# import rb
import pickle

In [42]:
np.random.seed(5000)

In [43]:
Corpus = pd.read_csv(r"data/student_evaluation.csv",encoding='latin-1')

In [44]:
# Step - a : Remove blank rows if any.
Corpus['comment'].dropna(inplace=True)

# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus['comment'] = [entry.lower() for entry in Corpus['comment']]

# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['comment']= [word_tokenize(entry) for entry in Corpus['comment']]

# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(Corpus['comment']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)

In [46]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['suggestion'],test_size=0.2)

In [47]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
# Test_X.describe

In [7]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [8]:
print(Train_X_Tfidf)

  (0, 665)	0.2598159672510975
  (0, 410)	0.33826869738274634
  (0, 365)	0.44949699116306413
  (0, 347)	0.5007560409949183
  (0, 346)	0.5007560409949183
  (0, 320)	0.2849834611612777
  (0, 141)	0.1824653614975694
  (1, 686)	0.8229147482846391
  (1, 300)	0.5681648678470266
  (2, 564)	0.44553047205843427
  (2, 562)	0.4999686015300664
  (2, 312)	0.4773747364087423
  (2, 251)	0.36849847746547804
  (2, 173)	0.43342384463221095
  (3, 457)	0.47735078787639595
  (3, 360)	0.6643331455672428
  (3, 358)	0.3499830438696258
  (3, 303)	0.45640942805585016
  (4, 174)	1.0
  (5, 637)	0.356646343420371
  (5, 632)	0.45848546668000967
  (5, 520)	0.36462127949202117
  (5, 251)	0.3176891858980269
  (5, 98)	0.2476733967525925
  (5, 58)	0.39644461360622546
  :	:
  (507, 431)	0.3701355070971858
  (507, 264)	0.4375505189777045
  (507, 240)	0.2840297653024115
  (507, 233)	0.34824733237071803
  (507, 184)	0.3793134760378173
  (508, 763)	0.24473795192504946
  (508, 710)	0.24075436751885299
  (508, 693)	0.3388473045

In [9]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy


print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)


# predictions_NB
Test_Y

Naive Bayes Accuracy Score ->  82.17054263565892


array([0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [10]:

# fit the training dataset on the NB classifier
BNaive = naive_bayes.BernoulliNB()
BNaive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
Bpredictions_NB = BNaive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(Bpredictions_NB, Test_Y)*100)



Naive Bayes Accuracy Score ->  79.84496124031007


In [11]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)



SVM Accuracy Score ->  86.04651162790698


In [12]:
# save the model
# https://stackabuse.com/text-classification-with-python-and-scikit-learn/
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(SVM,picklefile)

# to load the model use the following
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

In [13]:
# to predic
# y_pred2 = model.predict(X_test)
predictions_SVM = model.predict(Test_X_Tfidf)

print(confusion_matrix(Test_Y, predictions_SVM))
print(classification_report(Test_Y, predictions_SVM))
print(accuracy_score(Test_Y, predictions_SVM)) 

[[86  8]
 [10 25]]
             precision    recall  f1-score   support

          0       0.90      0.91      0.91        94
          1       0.76      0.71      0.74        35

avg / total       0.86      0.86      0.86       129

0.8604651162790697


In [14]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  72.86821705426357


In [15]:
# # https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/
xgb = XGBClassifier()
xgb.fit(Train_X_Tfidf,Train_Y)
# make predictions for test data
y_pred = xgb.predict(Test_X_Tfidf)
prediction_xgboost = [round(value) for value in y_pred]
# evaluate predictions
print("xgboost Accuracy Score -> ",accuracy_score(prediction_xgboost, Test_Y)*100)
# accuracy = accuracy_score(y_test, predictions)
# print("Accuracy: %.2f%%" % (accuracy * 100.0))

xgboost Accuracy Score ->  77.51937984496125


  if diff:


In [16]:
from keras.models import Sequential
from keras import layers

input_dim = Train_X_Tfidf.shape[1]  # Number of features

model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

# xgb.fit(Train_X_Tfidf,Train_Y)
history = model.fit(Train_X_Tfidf,Train_Y,
                    epochs=100,
                    verbose=False,
                    validation_data=(Test_X_Tfidf, Test_Y),
                    batch_size=10)

loss, accuracy = model.evaluate(Train_X_Tfidf,Train_Y, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(Test_X_Tfidf, Test_Y, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))


Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 10)                7680      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 7,691
Trainable params: 7,691
Non-trainable params: 0
_________________________________________________________________
Training Accuracy: 0.9980
Testing Accuracy:  0.8217


In [18]:

# # # First XGBoost model for Pima Indians dataset
# # from numpy import loadtxt
# # from xgboost import XGBClassifier
# # from sklearn.model_selection import train_test_split
# # from sklearn.metrics import accuracy_score
# # # load data
# # dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# # # split data into X and y
# # X = dataset[:,0:8]
# # Y = dataset[:,8]
# # # split data into train and test sets
# # seed = 7
# # test_size = 0.33
# # X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# # # fit model no training data
# model = XGBClassifier()
# model.fit(X_train, y_train)
# # make predictions for test data
# y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
# # evaluate predictions
# accuracy = accuracy_score(y_test, predictions)
# print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# %matplotlib inline

# import seaborn as sns
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12,5))
# sns.countplot(x=bbc_text_df.category, color='green')
# plt.title('BBC text class distribution', fontsize=16)
# plt.ylabel('Class Counts', fontsize=16)
# plt.xlabel('Class Label', fontsize=16)
# plt.xticks(rotation='vertical');

In [45]:
# Step - a : Remove blank rows if any.
Corpus['comment'].dropna(inplace=True)

# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus['comment'] = [entry.lower() for entry in Corpus['comment']]

# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['comment']= [word_tokenize(entry) for entry in Corpus['comment']]

# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(Corpus['comment']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)
Final_words

AttributeError: 'list' object has no attribute 'lower'

In [65]:
from tkinter import *
 
window = Tk()
 
window.title("Welcome to LikeGeeks app")
 
window.geometry('350x200')
 
lbl = Label(window, text="Hello")
 
lbl.grid(column=0, row=0)
 
txt = Entry(window,width=10)
# txt.pack()
# txt.focus_set()
# insertedtext = txt.get()

txt.grid(column=1, row=0)
 
def clicked():
    text = txt.get()
    lower = text.lower()
    words= []
    remove_num = re.sub(r'\d+', '', lower)
# punch = remove_num.translate(string.maketrans("",""), string.punctuation)
    space = remove_num.strip()
    tokens = word_tokenize(space)
    stop_words = set(stopwords.words('english'))
    result = [i for i in tokens if not i in stop_words]
    lemmatizer=WordNetLemmatizer()
#     l=lemmatizer.lemmatize([word for word in result])
    for word in result:
        words.append(lemmatizer.lemmatize(word))
    #Define class functions
    
    Test_X_Tfidf = Tfidf_vect.transform(words)
#      = txt.get()
    print(Test_X_Tfidf)
    out = SVM.predict(Test_X_Tfidf)
#     print(out[0])
#     if out[0] == 0:
#         lbl.configure(text="Suggestion is a NO")
#     else:
#         lbl.configure(text="Suggestion is a YES")
    
# btn = Button(window, text="Click Me", command=clicked)

# btn.grid(column=2, row=0)
# # c
# window.mainloop()