In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import model_selection, naive_bayes, svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import itertools
# import catboost
# from catboost import CatBoostClassifier
# import rb

import pickle

In [2]:
np.random.seed(5000)

In [21]:
Corpus = pd.read_csv(r"data/student_evaluation.csv",encoding='latin-1')
Corpus.head()
Corpus.Sentiment.unique()

array(['positive', 'neutral'], dtype=object)

In [4]:
test = pd.read_csv(r"data/test.csv",encoding='latin-1')
test.head()
# Corpus.Sentiment.unique()

Unnamed: 0,comment,Sentiment,Actual
0,Great.,,1
1,I do not know how this course is going to help...,,0
2,all is Perfect.,,1
3,could be more interactive and practical with l...,,0
4,More examples required to get a better grip on...,,0


In [5]:
# Step - a : Remove blank rows if any.
test['comment'].dropna(inplace=True)

# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
test['comment'] = [entry.lower() for entry in test['comment']]

# Step - c : Tokenization : In this each entry in the test will be broken into set of words
test['comment']= [word_tokenize(entry) for entry in test['comment']]

# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(test['comment']):
    # Declaring Empty List to store the words that follow the rules for this step
    final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    test.loc[index,'test_final'] = str(final_words)

# print(Final_words)

In [6]:
# Step - a : Remove blank rows if any.
Corpus['comment'].dropna(inplace=True)

# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus['comment'] = [entry.lower() for entry in Corpus['comment']]

# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['comment']= [word_tokenize(entry) for entry in Corpus['comment']]

# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(Corpus['comment']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)

# print(Final_words)

In [7]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['Sentiment'],test_size=0.2)
X_Test= test['test_final']
Corpus['Sentiment'].value_counts()
# Corpus['text_final']
# X_Test.shape()
# Train_X.describe()

neutral     335
positive    306
Name: Sentiment, dtype: int64

In [8]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
# print(Train_X,Train_Y)

In [9]:
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
# type(Test_X_Tfidf)
# print(Train_X_Tfidf)
# vectorizer = CountVectorizer(ngram_range=(2,2))
# Tfidf_vect.fit_transform(Corpus['text_final'])

# print(Tfidf_vect.get_feature_names())

# vectorizer = TfidfVectorizer(ngram_range=(2,2)) # You can still specify n-grams here.
# X = vectorizer.fit_transform(arr)

In [10]:
# print(Train_X_Tfidf)

# Decision Tree Pirpeline model

In [11]:
# fit the training dataset on the classifier
DT = DecisionTreeClassifier(random_state=0)
DT.fit(Train_X_Tfidf,Train_Y)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [17]:

text_clf_DT = Pipeline([('vect', TfidfVectorizer(ngram_range=(1,3))), 
                      ('clf', DecisionTreeClassifier()) ])
text_clf_DT.fit(Train_X,Train_Y)

predictions_DT = text_clf_DT.predict(Test_X)

print("\n Decision Tree accuracy score -- > ",accuracy_score(Test_Y, predictions_DT)*100) 
# print(text_clf_DT)


 Decision Tree accuracy score -- >  71.31782945736434


# Classifcation report

In [18]:
# confusion_matrix_result = confusion_matrix(Test_Y, predictions_DT)
# print("Confusion matrix \n",confusion_matrix_result)

print("\n Classification report \n",classification_report(Test_Y, predictions_DT))
print("\n Decision Tree accuracy score -- > ",accuracy_score(Test_Y, predictions_DT)*100)  

print("\n Decision Tree precision score -- > ",precision_score(Test_Y, predictions_DT, average = 'weighted')*100)


 Classification report 
              precision    recall  f1-score   support

          0       0.75      0.61      0.67        62
          1       0.69      0.81      0.74        67

avg / total       0.72      0.71      0.71       129


 Decision Tree accuracy score -- >  71.31782945736434

 Decision Tree precision score -- >  71.76797970231623


# Saving result in CSV file

In [14]:

predicted_dt_save = text_clf_DT.predict(X_Test)

test['Sentiment'] = predicted_dt_save
submission = test[["comment","Sentiment","Actual"]]
submission.to_csv("data/results/DT_submission.csv", index = False)