In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import warnings 
import re 
import sys
import string 
import math
warnings.filterwarnings('ignore')
np.set_printoptions(threshold=sys.maxsize)


data=pd.read_csv(r"C:\Users\utkarsh.sharma\Desktop\Bloom's taxonomy\data1.1.csv")
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   txt       141 non-null    object
 1   category  141 non-null    object
dtypes: object(2)
memory usage: 2.3+ KB


Unnamed: 0,txt,category
0,Interpret the graph and state how many trees w...,Application
1,Justify the concept of inheritance and give th...,Evaluation
2,Create several different strategies to solve a...,Synthesis
3,What is the definition of the following terms...,Knowledge
4,Can you justify the decisions you have made ?,Evaluation


In [2]:
print(data.describe())
data['category']=data['category'].astype('category')
print(data.columns)

                                                  txt   category
count                                             141        141
unique                                            141          6
top     What distinctions can be made about...and...?  Synthesis
freq                                                1         30
Index(['txt', 'category'], dtype='object')


In [3]:
#Pre-processing 

#Expanding contractions is not requirement here, but would have to be added eventually 

#Case Conversion 
data['txt']=data['txt'].str.lower()
#Punctuation removal 
data['txt'] = data['txt'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '' , x))
#Removing objects in the given list that have numbers and digits
data['txt'] = data['txt'].str.replace('\d+', '')

#Selected stop word removal
#converting number to text 

In [4]:
#Lemmatization

#lemmatization would have to be done along with POS tags at a latter stage
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
data['txt'] = data['txt'].apply(lambda text: lemmatize_words(text))

#Removing non-english words

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus =data['txt']

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
print(X)

['about', 'aboutand', 'above', 'abstract', 'accept', 'according', 'action', 'add', 'address', 'adopt', 'advertising', 'advice', 'advising', 'affect', 'after', 'against', 'air', 'all', 'along', 'alpha', 'alphaint', 'alphaj', 'alternative', 'an', 'analysis', 'analyze', 'and', 'animal', 'another', 'answer', 'apply', 'appropriate', 'argument', 'array', 'arraylist', 'art', 'ass', 'astrology', 'atomic', 'attribute', 'author', 'bad', 'based', 'be', 'been', 'below', 'best', 'between', 'book', 'both', 'briefly', 'building', 'but', 'calculate', 'campaign', 'can', 'case', 'categorize', 'central', 'change', 'character', 'choose', 'cite', 'city', 'class', 'classify', 'climax', 'code', 'collect', 'color', 'company', 'compare', 'component', 'con', 'concept', 'concern', 'configuration', 'construct', 'consumer', 'content', 'contrast', 'correct', 'could', 'course', 'cover', 'create', 'creates', 'criterion', 'criticism', 'current', 'cut', 'data', 'datatype', 'deal', 'decision', 'declare', 'defend', 'defe

In [6]:
#X and Y for input and outcome 
y=data['category'].values

In [7]:
#splitting dataset into training and testing 

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [8]:
#model implementation and metrics 

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score
from sklearn.metrics import confusion_matrix

mb=MultinomialNB()
mb.fit(X_train,y_train)

y_setpredict=mb.predict(X_test)

accuracyScore = accuracy_score(y_test,y_setpredict)*100
print(accuracyScore)
print(f1_score(y_test,y_setpredict,average='weighted'))
print(precision_score(y_test,y_setpredict,average='weighted'))
print(recall_score(y_test,y_setpredict,average='weighted'))

75.86206896551724
0.7239811912225704
0.7417972831765935
0.7586206896551724


In [9]:
#Logistic regression 
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

log_params={'penalty':['l1','l2'],'C':[0.0001,0.001,0.01,0.1,1,10,100],'solver':['liblinear','saga']}
log_model=GridSearchCV(LogisticRegression(),log_params,cv=5)
log_model.fit(X_train,y_train)
log_predict=log_model.predict(X_test)
log_cm=confusion_matrix(y_test,log_predict)
log_score=log_model.best_score_
print(log_cm)
print(log_score)

[[3 0 0 0 0 0]
 [0 1 0 0 0 1]
 [1 0 3 0 1 1]
 [0 1 1 3 0 0]
 [1 0 0 0 3 0]
 [0 0 0 1 0 8]]
0.6782608695652173


In [18]:
#KNN

from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

knn_params={'n_neighbors':[5,7,9,12],'weights':['uniform','distance'],'algorithm':['ball_tree','kd_tree','brute'],'metric':['minkowski','manhattan','euclidean']}
knn_model=GridSearchCV(KNeighborsClassifier(),knn_params,cv=5)
knn_model.fit(X_train,y_train)
knn_predict=log_model.predict(X_test)
knn_cm=confusion_matrix(y_test,log_predict)
knn_score=log_model.best_score_
print(knn_cm)
print(knn_score)

[[3 0 0 0 0 0]
 [0 1 0 0 0 1]
 [1 0 3 0 1 1]
 [0 1 1 3 0 0]
 [1 0 0 0 3 0]
 [0 0 0 1 0 8]]
0.6782608695652173
