In [1]:
# https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
# import rb
import pickle

In [2]:
np.random.seed(5000)

# Load the data

In [3]:
Corpus = pd.read_csv(r"data/student_evaluation.csv",encoding='latin-1')

# Pre processing

In [6]:
# Step - a : Remove blank rows if any.
Corpus['comment'].dropna(inplace=True)

# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus['comment'] = [entry.lower() for entry in Corpus['comment']]

# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['comment']= [word_tokenize(entry) for entry in Corpus['comment']]

# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(Corpus['comment']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)

AttributeError: 'list' object has no attribute 'lower'

In [5]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['suggestion'],test_size=0.2)

In [None]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
# Test_X.describe

## term frequency calulation

In [None]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
# type(Test_X_Tfidf)

In [None]:
# print(Train_X_Tfidf)

##  accuracy using pipeline Multinomial Naive Bayes

In [None]:
# pipeline model

text_clf = Pipeline([('vect', TfidfVectorizer()), 
                      ('clf', MultinomialNB()) ])
# train the model
text_clf.fit(Train_X,Train_Y)

# Predict the test cases
predicted = text_clf.predict(Test_X)

from sklearn import metrics
from sklearn.metrics import accuracy_score
import numpy as np

print('Accuracy achieved is ' + str(np.mean(predicted == Test_Y)))
# print(metrics.classification_report(Test_X.comment, predicted, target_names=Test_Y.suggestion)),
# metrics.confusion_matrix(Test_X.comment, predicted)

## predefined input Multinomial Naive Bayes

In [None]:
# pipeline shortens the steps 
# https://www.youtube.com/watch?v=0kPRaYSgblM
# https://github.com/codewrestling/TextClassification/blob/master/Text%20Classification.py

text_clf = Pipeline([('vect', TfidfVectorizer()), 
                      ('clf', MultinomialNB()) ])
# train the model
text_clf.fit(Train_X,Train_Y)

# Predict the test cases
test_x = ['Few more application specific contents is needed. ','Good course.', 'Please include Field trips.','I think an in class mini-project should be considered to be done step by step in during lectures.']
# test_x = [input()]
predicted = text_clf.predict(test_x)
for x in predicted:
    print(x)
    


## Compile time Input Multinomial Naive Bayes

In [None]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)


# docs = ['Also the project helped us implement real world applications that is further helpful if we plan to do research in this field.']
# docs = ['Few more application specific contents is needed. ','Good course.', 'Please include Field trips.','I think an in class mini-project should be considered to be done step by step in during lectures.']
docs = [input()]
# docs = input()
x_new_tfidf = Tfidf_vect.transform(docs)
predictions = Naive.predict(x_new_tfidf)

for x in predictions:
    print(x)
    
    

In [None]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
  
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)


# predictions_NB
# Test_X_Tfidf


In [None]:
confusion_matrix_NB = confusion_matrix(Test_Y, predictions_NB)

classes = ["No", "Yes"]

plt.figure()
plt.imshow(confusion_matrix_NB, interpolation='nearest', cmap=plt.cm.Blues)
plt.title("Confusion Matrix - Text Clasiffication")
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

text_format = 'd'
thresh = confusion_matrix_NB.max() / 2.
for row, column in itertools.product(range(confusion_matrix_NB.shape[0]), range(confusion_matrix_NB.shape[1])):
    plt.text(column, row, format(confusion_matrix_NB[row, column], text_format),
             horizontalalignment="center",
             color="white" if confusion_matrix_NB[row, column] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()

plt.show()

In [None]:
len(Train_Y)
len(Test_Y)


## Bernoulli Naive Bayes

In [None]:

# fit the training dataset on the NB classifier
BNaive = naive_bayes.BernoulliNB()
BNaive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
Bpredictions_NB = BNaive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(Bpredictions_NB, Test_Y)*100)



# SVM classification


In [None]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)



In [None]:
text_clf_SVM= Pipeline([('vect', TfidfVectorizer()), 
                      ('clf', SVC(C=1.0, kernel='linear', degree=3, gamma='auto')) ])
# train the model
text_clf_SVM.fit(Train_X,Train_Y)

# Predict the test cases
# test_x = ['Few more application specific contents is needed. ','Good course.', 'Please include Field trips.','I think an in class mini-project should be considered to be done step by step in during lectures.']
test_x = [input()]

predicted_svm = text_clf_SVM.predict(test_x)
for x in predicted_svm:
    print(x)

# print("SVM Accuracy Score -> ",accuracy_score(predicted_svm, Test_Y)*100)


# save the model and load the model

In [None]:
# save the model
# https://stackabuse.com/text-classification-with-python-and-scikit-learn/
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(SVM,picklefile)

# to load the model use the following
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

In [None]:
# # to predic
# # y_pred2 = model.predict(X_test)
# predictions_SVM = model.predict(Test_X_Tfidf)

# print(confusion_matrix(Test_Y, predictions_SVM))
# print(classification_report(Test_Y, predictions_SVM))
# print(accuracy_score(Test_Y, predictions_SVM)) 

In [None]:
# to predic
# y_pred2 = model.predict(X_test)
# predictions_SVM = model.predict(Test_X_Tfidf)
predictions_SVM = SVM.predict(Test_X_Tfidf)
confusion_matrix_result = confusion_matrix(Test_Y, predictions_SVM)
print(confusion_matrix_result)
print(classification_report(Test_Y, predictions_SVM))
print(accuracy_score(Test_Y, predictions_SVM)) 

In [None]:
classes = ["No", "Yes"]

plt.figure()
plt.imshow(confusion_matrix_result, interpolation='nearest', cmap=plt.cm.Blues)
plt.title("Confusion Matrix - Text Clasiffication")
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

text_format = 'd'
thresh = confusion_matrix_result.max() / 2.
for row, column in itertools.product(range(confusion_matrix_result.shape[0]), range(confusion_matrix_result.shape[1])):
    plt.text(column, row, format(confusion_matrix_result[row, column], text_format),
             horizontalalignment="center",
             color="white" if confusion_matrix_result[row, column] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()

plt.show()

In [None]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

# XGBoost

In [None]:
# # https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/
xgb = XGBClassifier()
xgb.fit(Train_X_Tfidf,Train_Y)
# make predictions for test data
y_pred = xgb.predict(Test_X_Tfidf)
prediction_xgboost = [round(value) for value in y_pred]
# evaluate predictions
print("xgboost Accuracy Score -> ",accuracy_score(prediction_xgboost, Test_Y)*100)
# accuracy = accuracy_score(y_test, predictions)
# print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Deep Learning

In [None]:
from keras.models import Sequential
from keras import layers

input_dim = Train_X_Tfidf.shape[1]  # Number of features

model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

# xgb.fit(Train_X_Tfidf,Train_Y)
history = model.fit(Train_X_Tfidf,Train_Y,
                    epochs=100,
                    verbose=False,
                    validation_data=(Test_X_Tfidf, Test_Y),
                    batch_size=10)

loss, accuracy = model.evaluate(Train_X_Tfidf,Train_Y, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(Test_X_Tfidf, Test_Y, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))


In [None]:

# # # First XGBoost model for Pima Indians dataset
# # from numpy import loadtxt
# # from xgboost import XGBClassifier
# # from sklearn.model_selection import train_test_split
# # from sklearn.metrics import accuracy_score
# # # load data
# # dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# # # split data into X and y
# # X = dataset[:,0:8]
# # Y = dataset[:,8]
# # # split data into train and test sets
# # seed = 7
# # test_size = 0.33
# # X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# # # fit model no training data
# model = XGBClassifier()
# model.fit(X_train, y_train)
# # make predictions for test data
# y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
# # evaluate predictions
# accuracy = accuracy_score(y_test, predictions)
# print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# %matplotlib inline

# import seaborn as sns
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12,5))
# sns.countplot(x=bbc_text_df.category, color='green')
# plt.title('BBC text class distribution', fontsize=16)
# plt.ylabel('Class Counts', fontsize=16)
# plt.xlabel('Class Label', fontsize=16)
# plt.xticks(rotation='vertical');

In [None]:
# # Step - a : Remove blank rows if any.
# Corpus['comment'].dropna(inplace=True)

# # Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
# Corpus['comment'] = [entry.lower() for entry in Corpus['comment']]

# # Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
# Corpus['comment']= [word_tokenize(entry) for entry in Corpus['comment']]

# # Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# # WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
# tag_map = defaultdict(lambda : wn.NOUN)
# tag_map['J'] = wn.ADJ
# tag_map['V'] = wn.VERB
# tag_map['R'] = wn.ADV
# for index,entry in enumerate(Corpus['comment']):
#     # Declaring Empty List to store the words that follow the rules for this step
#     Final_words = []
#     # Initializing WordNetLemmatizer()
#     word_Lemmatized = WordNetLemmatizer()
#     # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
#     for word, tag in pos_tag(entry):
#         # Below condition is to check for Stop words and consider only alphabets
#         if word not in stopwords.words('english') and word.isalpha():
#             word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
#             Final_words.append(word_Final)
#     # The final processed set of words for each iteration will be stored in 'text_final'
#     Corpus.loc[index,'text_final'] = str(Final_words)
# Final_words

In [None]:
from tkinter import *
 
window = Tk()
 
window.title("Welcome to LikeGeeks app")
 
window.geometry('350x200')
 
lbl = Label(window, text="Hello")
 
lbl.grid(column=0, row=0)
 
txt = Entry(window,width=10)
# txt.pack()
# txt.focus_set()
# insertedtext = txt.get()

txt.grid(column=1, row=0)
 
def clicked():
    text = txt.get()
    lower = text.lower()
    words= []
    remove_num = re.sub(r'\d+', '', lower)
    print(remove_num,' \n\n')
# punch = remove_num.translate(string.maketrans("",""), string.punctuation)
    space = remove_num.strip()
    print(space,' \n\n')
    tokens = word_tokenize(space)
    print(tokens,' \n\n')
    stop_words = set(stopwords.words('english'))
    print(stop_words,' \n\n')
    result = [i for i in tokens if not i in stop_words]
    print(result,' \n\n')
    lemmatizer=WordNetLemmatizer()
#     l=lemmatizer.lemmatize([word for word in result])
    for word in result:
        words.append(lemmatizer.lemmatize(word))
    #Define class functions
    print(words,' \n\n')
    Test_X_Tfidf = Tfidf_vect.transform(words)
#      = txt.get()
    
    out = model.predict(Test_X_Tfidflol)
    print(Test_X_Tfidflol)
#     print(out[1])
    
    if out[0] == 0:
        lbl.configure(text="Suggestion is a NO")
    else:
        lbl.configure(text="Suggestion is a YES")
    
btn = Button(window, text="Click Me", command=clicked)

btn.grid(column=2, row=0)
# c
window.mainloop()