In [1]:
###### Import fundamentals
import numpy as np
import pandas as pd
import seaborn as sns
import re

# Import nltk and download punkt, wordnet
import nltk

# Import word_tokenize and stopwords from nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer 
from nltk.tag import pos_tag


# Sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.model_selection import cross_val_score

# I will keep the resulting plots
%matplotlib inline

# Enable Jupyter Notebook's intellisense
%config IPCompleter.greedy=True

# We want to see whole content (non-truncated)
pd.set_option('display.max_colwidth', -1)

In [2]:
# Load the student Feeback On Services LMS
lms = pd.read_csv("studentFeebackOnServicesLMS_wlabel.csv")

# Load the student Feedback Courses
course = pd.read_csv("studentFeedbackCourses_wlabel.csv")

# Print the first five rows
display(lms.head())
display(course.head())

Unnamed: 0.1,Unnamed: 0,Content,Processed,Polarity,Subjectivity,Label
0,0,Display is excellent and camera is as good as any from that year.,"['display', 'excellent', 'camera', 'good', 'year']",0.85,0.8,Positive
1,1,Battery life is also great!,"['battery', 'life', 'also', 'great']",0.8,0.75,Positive
2,2,Protects the phone on all sides.,"['protects', 'phone', 'sides']",0.0,0.0,Neutral
3,3,"Clear Skype Calls, Long Battery Life, Long Range.","['clear', 'skype', 'calls', 'long', 'battery', 'life', 'long', 'range']",4.625929e-18,0.394444,Positive
4,4,Great Hands Free Device.,"['great', 'hands', 'free', 'device']",0.6,0.775,Positive


Unnamed: 0.1,Unnamed: 0,Content,Processed,Polarity,Subjectivity,Label
0,0,good and interesting,"['good', 'interesting']",0.6,0.55,Positive
1,1,"This class is very helpful to me. Currently, I'm still learning this class which makes up a lot of basic music knowledge.","['class', 'helpful', 'currently', 'still', 'learning', 'class', 'makes', 'lot', 'basic', 'music', 'knowledge']",0.0,0.2625,Neutral
2,2,like!Prof and TAs are helpful and the discussion among students are quite active. Very rewarding learning experience!,"['like', 'prof', 'tas', 'helpful', 'discussion', 'among', 'students', 'quite', 'active', 'rewarding', 'learning', 'experience']",0.183333,0.8,Positive
3,3,Easy to follow and includes a lot basic and important techniques to use sketchup.,"['easy', 'follow', 'includes', 'lot', 'basic', 'important', 'techniques', 'use', 'sketchup']",0.277778,0.652778,Positive
4,4,Really nice teacher!I could got the point eazliy but the v,"['really', 'nice', 'teacher', 'could', 'got', 'point', 'eazliy']",0.6,1.0,Positive


In [3]:
# Lemmatization & Stemming according to POS tagging
def NormalizeWithPOS(text):

    word_list = word_tokenize(text)
    rev = []
    lemmatizer = WordNetLemmatizer() 
    stemmer = PorterStemmer() 
    for word, tag in pos_tag(word_list):
        if tag.startswith('J'):
            w = lemmatizer.lemmatize(word, pos='a')
        elif tag.startswith('V'):
            w = lemmatizer.lemmatize(word, pos='v')
        elif tag.startswith('N'):
            w = lemmatizer.lemmatize(word, pos='n')
        elif tag.startswith('R'):
            w = lemmatizer.lemmatize(word, pos='r')
        else:
            w = word
        #w = stemmer.stem(w)
        rev.append(w)
    feedback = ' '.join(rev)
    return feedback

# Call the function NormalizeWithPOS to Lemmatize & Stem the feedbacks
lms["Processed"] = lms["Processed"].apply(NormalizeWithPOS)
course["Processed"] = course["Processed"].apply(NormalizeWithPOS)

display(lms[["Processed"]].head(15))
display(course[["Processed"]].head(15))

Unnamed: 0,Processed
0,"[ 'display ' , 'excellent ' , 'camera ' , 'good ' , 'year ' ]"
1,"[ 'battery ' , 'life ' , 'also ' , 'great ' ]"
2,"[ 'protects ' , 'phone ' , 'sides ' ]"
3,"[ 'clear ' , 'skype ' , 'calls ' , 'long ' , 'battery ' , 'life ' , 'long ' , 'range ' ]"
4,"[ 'great ' , 'hands ' , 'free ' , 'device ' ]"
5,"[ 'even ' , 'take ' , 'self ' , 'portraits ' , 'outside ' , 'exterior ' , 'display ' , 'cool ' ]"
6,"[ 'trying ' , 'many ' , 'many ' , 'handsfree ' , 'gadgets ' , 'one ' , 'finally ' , 'works ' , 'well ' ]"
7,"[ 'magical ' , 'help ' ]"
8,"[ 'best ' , 'phone ' , 'market ' ]"
9,"[ 'worked ' , 'well ' ]"


Unnamed: 0,Processed
0,"[ 'good ' , 'interesting ' ]"
1,"[ 'class ' , 'helpful ' , 'currently ' , 'still ' , 'learning ' , 'class ' , 'makes ' , 'lot ' , 'basic ' , 'music ' , 'knowledge ' ]"
2,"[ 'like ' , 'prof ' , 'tas ' , 'helpful ' , 'discussion ' , 'among ' , 'students ' , 'quite ' , 'active ' , 'rewarding ' , 'learning ' , 'experience ' ]"
3,"[ 'easy ' , 'follow ' , 'includes ' , 'lot ' , 'basic ' , 'important ' , 'techniques ' , 'use ' , 'sketchup ' ]"
4,"[ 'really ' , 'nice ' , 'teacher ' , 'could ' , 'got ' , 'point ' , 'eazliy ' ]"
5,"[ 'great ' , 'course ' , 'recommend ' , 'especially ' , 'business ' , 'managers ' ]"
6,"[ 'one ' , 'useful ' , 'course ' , 'management ' ]"
7,"[ 'disappointed ' , 'name ' , 'misleading ' , 'course ' , 'provides ' , 'good ' , 'introduction ' , 'overview ' , 'responsibilities ' , 'cto ' , 'little ' , 'specifically ' , 'digital ' , 'content ' , 'deals ' , 'single ' , 'short ' , 'lecture ' , 'course ' , 'treatment ' , 'superficial ' , 'easy ' , 'find ' , 'material ' , 'freely ' , 'available ' , 'mckinsey ' , 'website ' , 'example ' ]"
8,"[ 'super ' , 'content ' , 'definitely ' , 'course ' ]"
9,"[ 'one ' , 'excellent ' , 'courses ' , 'coursera ' , 'information ' , 'technology ' , 'bosses ' , 'managers ' ]"


In [4]:
#english stop words
enstopwords = set(stopwords.words('english'))

# Initialize a Tf-idf Vectorizer
vectorizer = TfidfVectorizer(max_features=5000, max_df=0.90, min_df=2, stop_words=enstopwords)
vectorizer1 = TfidfVectorizer(max_features=5000, max_df=0.90, min_df=2, stop_words=enstopwords)

# Fit and transform the vectorizer corpus = [str (item) for item in corpus]
tfidf_matrix_lms= vectorizer.fit_transform(str (item) for item in lms["Processed"])
tfidf_matrix_course= vectorizer1.fit_transform(str (item) for item in course["Processed"])

# Let's see what we have
tfidf_matrix_lms
tfidf_matrix_course

# Create a DataFrame for tf-idf vectors and display the first five rows
tfidf_df_lms = pd.DataFrame(tfidf_matrix_lms.toarray(), columns= vectorizer.get_feature_names())
tfidf_df_course = pd.DataFrame(tfidf_matrix_course.toarray(), columns= vectorizer1.get_feature_names())

display(tfidf_df_lms.head())
display(tfidf_df_course.head())



Unnamed: 0,ability,able,abroad,absolutely,abt,abysmal,accept,access,accessable,accitivties,...,written,wrong,yeah,year,years,yes,yet,young,yummy,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.483805,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,abandoned,abilities,ability,able,ableton,absence,absolute,absolutely,absorb,absorbed,...,years,yes,yet,yonsei,young,youtube,zelikow,zero,zombies,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Select the features and the target

#for student Feeback On Services LMS
X_lms = tfidf_matrix_lms
y_lms = lms["Label"]

#for student Feedback Courses
X_course = tfidf_matrix_course
y_course = course["Label"]

In [6]:
#split the data into training and testing

#for student Feeback On Services LMS
X_lms_train, X_lms_test, y_lms_train, y_lms_test = train_test_split(X_lms, y_lms, random_state=42, test_size = .20)


#for student Feedback Courses
X_course_train, X_course_test, y_course_train, y_course_test = train_test_split(X_course, y_course, random_state=42, test_size = .20)

In [7]:
#function for confusion matrix
def confussionMatrix(cl,X_test,y_test):
    # Predict the labels
    y_pred = cl.predict(X_test)
    
    # Print the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix\n")
    print(cm)
    
    # Print the Classification Report
    cr = classification_report(y_test, y_pred)
    print("\n\nClassification Report\n")
    print(cr)
    
    return

In [8]:
#1 Multinomial Naive Bayes
mnb_lms = MultinomialNB()
mnb_course = MultinomialNB()

#train our algorithm
mnb_lms.fit(X_lms_train, y_lms_train)
mnb_course.fit(X_course_train, y_course_train)

#Test the trained classifier
predicted_class_lms = mnb_lms.predict(X_lms_test)
predicted_class_course = mnb_course.predict(X_course_test)

#call the function confussionMatrix to compute the accuracy of the model
#confussionMatrix(mnb_lms,X_lms_test,y_lms_test)
#confussionMatrix(mnb_course,X_course_test,y_course_test)

print('Accuracy of Gaussian Naive Bayes for this dataset: %3.2f' %  accuracy_score(y_lms_test, predicted_class_lms))
print('Accuracy of Gaussian Naive Bayes for this dataset: %3.2f' %  accuracy_score(y_course_test, predicted_class_course))

Accuracy of Gaussian Naive Bayes for this dataset: 0.74
Accuracy of Gaussian Naive Bayes for this dataset: 0.88


In [9]:
#2 Complement Naive Bayes
cnb_lms = ComplementNB()
cnb_course = ComplementNB()

#train our algorithm
cnb_lms.fit(X_lms_train, y_lms_train)
cnb_course.fit(X_course_train, y_course_train)

#Test the trained classifier
predicted_class_lms = cnb_lms.predict(X_lms_test)
predicted_class_course = cnb_course.predict(X_course_test)

#call the function confussionMatrix to compute the accuracy of the model
confussionMatrix(cnb_lms,X_lms_test,y_lms_test)
confussionMatrix(cnb_course,X_course_test,y_course_test)

Confusion Matrix

[[123  16   7]
 [ 26 103  43]
 [ 28  39 338]]


Classification Report

              precision    recall  f1-score   support

    Negative       0.69      0.84      0.76       146
     Neutral       0.65      0.60      0.62       172
    Positive       0.87      0.83      0.85       405

    accuracy                           0.78       723
   macro avg       0.74      0.76      0.75       723
weighted avg       0.78      0.78      0.78       723

Confusion Matrix

[[  788    67    91]
 [  257   873   373]
 [ 1336   586 15636]]


Classification Report

              precision    recall  f1-score   support

    Negative       0.33      0.83      0.47       946
     Neutral       0.57      0.58      0.58      1503
    Positive       0.97      0.89      0.93     17558

    accuracy                           0.86     20007
   macro avg       0.62      0.77      0.66     20007
weighted avg       0.91      0.86      0.88     20007



In [10]:
#3 Bernoulli Naive Bayes classifier
nb_lms = BernoulliNB()
nb_course = BernoulliNB()

#train our algorithm
nb_lms.fit(X_lms_train, y_lms_train)
nb_course.fit(X_course_train, y_course_train)

#Test the trained classifier
predicted_class_lms = nb_lms.predict(X_lms_test)
predicted_class_course = nb_course.predict(X_course_test)

#call the function confussionMatrix to compute the accuracy of the model
confussionMatrix(nb_lms,X_lms_test,y_lms_test)
confussionMatrix(nb_course,X_course_test,y_course_test)

Confusion Matrix

[[ 92  24  30]
 [  9  88  75]
 [ 19  26 360]]


Classification Report

              precision    recall  f1-score   support

    Negative       0.77      0.63      0.69       146
     Neutral       0.64      0.51      0.57       172
    Positive       0.77      0.89      0.83       405

    accuracy                           0.75       723
   macro avg       0.73      0.68      0.70       723
weighted avg       0.74      0.75      0.74       723

Confusion Matrix

[[  392   133   421]
 [   31  1196   276]
 [ 1042   603 15913]]


Classification Report

              precision    recall  f1-score   support

    Negative       0.27      0.41      0.33       946
     Neutral       0.62      0.80      0.70      1503
    Positive       0.96      0.91      0.93     17558

    accuracy                           0.87     20007
   macro avg       0.61      0.71      0.65     20007
weighted avg       0.90      0.87      0.89     20007



In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# logistic regression works with binary class by default, so multi_class='multinomial' must be used for multiclass
lg_lms = LogisticRegression(multi_class='multinomial', solver='newton-cg')
lg_course = LogisticRegression(multi_class='multinomial', solver='newton-cg')
    
#train our algorithm
lg_lms.fit(X_lms_train, y_lms_train)
lg_course.fit(X_course_train, y_course_train)

#Test the trained classifier
predicted_class_lms = lg_lms.predict(X_lms_test)
predicted_class_course = lg_course.predict(X_course_test)

print('Accuracy of LR for this dataset: %3.2f' %  accuracy_score(y_lms_test, predicted_class_lms))
print('Accuracy of LR for this dataset: %3.2f' %  accuracy_score(y_course_test, predicted_class_course))

Accuracy of LR for this dataset: 0.84
Accuracy of LR for this dataset: 0.97


In [14]:
import pickle
pickle.dump(mnb_lms, open("MNB_lms_model.pkl", 'wb'))
pickle.dump(cnb_lms, open("CNB_lms_model.pkl", 'wb'))
pickle.dump(nb_lms, open("BNB_lms_model.pkl", 'wb'))
pickle.dump(mnb_course, open("MNB_course_model.pkl", 'wb'))
pickle.dump(cnb_course, open("CNB_course_model.pkl", 'wb'))
pickle.dump(nb_course, open("BNB_course_model.pkl", 'wb'))
pickle.dump(lg_lms, open("lg_lms_model.pkl", 'wb'))
pickle.dump(lg_course, open("lg_course_model.pkl", 'wb'))
pickle.dump(vectorizer, open("vectorizer_lms.pkl", 'wb'))
pickle.dump(vectorizer1, open("vectorizer_course.pkl", 'wb'))