In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import nltk

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
data = pd.read_csv("dataset_s.csv", header = None)

The attributes are as follows: <br>
1. Data Collection<br>
2. Cookies or data utilisation <br>
3. Usage of location information <br>
4. Sharing of data <br>
5. Contact info 


In [4]:
data.columns = ['raw_text', 'category']

In [5]:
data.head()

Unnamed: 0,raw_text,category
0,1. ABOUT OUR PRODUCTS 1.1 Our products offer a...,0
1,2. THE INFORMATION WE COLLECT The information ...,1
2,"2.2 In addition, we store certain information ...",2
3,(c) to remember your preferences and registrat...,0
4,(d) to present and help measure and research t...,0


In [6]:
data.describe()

Unnamed: 0,category
count,2142.0
mean,0.943978
std,1.520724
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,5.0


## Tokenize, lemmatize and stopword removal 

In [7]:
stop_words = set(stopwords.words("english"))

In [8]:
q = '2'
q.isnumeric()

True

In [9]:
import re

In [10]:
compound_words = re.compile("[a-z]+-[a-z]+")

alph_word = re.compile("[a-z]{2,}")

def preprocess(x):
    lst = []
    ss = WordNetLemmatizer()
    x = word_tokenize(x)
    for word in x:
        if word not in stop_words and (alph_word.match(word) or compound_words.match(word)):
            lst.append(ss.lemmatize(word.lower()))
    return lst

In [11]:
cleaned_text = data['raw_text'].apply(lambda x: preprocess(x))

In [12]:
data['category'].value_counts()

0    1351
1     261
2     215
4     184
5     101
3      30
Name: category, dtype: int64

In [13]:
data['cleaned_text'] = cleaned_text

## Tf-idf 

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
tfidf = TfidfVectorizer(min_df=5, ngram_range=(1,2), stop_words='english', sublinear_tf=True)

In [16]:
x = tfidf.fit_transform(data['cleaned_text'].astype(str))

In [17]:
x.toarray().shape

(2142, 2525)

## Dimensionality reduction 

In [18]:
from sklearn.decomposition import TruncatedSVD

In [19]:
dr = TruncatedSVD(n_components=100, n_iter = 10)

In [20]:
reduced_x = dr.fit_transform(x)

In [21]:
y = data['category']

## Split 

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
from keras.utils import to_categorical

categorical_labels = to_categorical(y, num_classes=None)

y = categorical_labels

Using TensorFlow backend.


In [24]:
X_train, X_test, y_train, y_test = train_test_split(reduced_x, y)

## ANN 

In [25]:
from tensorflow import keras

In [26]:
from keras.models import Sequential
from keras.layers import Dense

In [27]:
import tensorflow as tf
import os
import datetime

In [87]:
model = keras.Sequential([
    keras.layers.Dense(64, activation = 'relu', input_dim = 100),
    keras.layers.Dense(32,activation=keras.layers.LeakyReLU(alpha=0.3)),
    keras.layers.Dense(16, activation =keras.layers.LeakyReLU(alpha=0.3)),
    keras.layers.Dense(8, activation =keras.layers.LeakyReLU(alpha=0.3)),
    keras.layers.Dense(6, activation='softmax')
])


In [88]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])

In [89]:
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

In [90]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [91]:
y_train = y_train.reshape(-1,6)

In [92]:
model.fit(X_train, y_train, epochs = 20, batch_size=32,validation_data=(X_test, y_test), callbacks=[tensorboard_callback])

Train on 1606 samples, validate on 536 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x162143630>

In [34]:
preds = model.predict(X_test)

In [35]:
preds

array([[9.9973530e-01, 7.5619901e-06, 1.8003471e-06, 1.1140250e-05,
        4.3580538e-05, 2.0073737e-04],
       [9.9991930e-01, 2.8864881e-06, 3.0712343e-07, 3.1583850e-06,
        1.4060405e-06, 7.2896502e-05],
       [9.1795081e-01, 2.6875023e-02, 5.6566815e-03, 1.6119304e-03,
        1.5435812e-02, 3.2469768e-02],
       ...,
       [9.9926215e-01, 3.3248466e-04, 1.8305474e-04, 2.0024970e-06,
        7.2611940e-05, 1.4759955e-04],
       [9.8311090e-01, 4.6112710e-03, 1.2084924e-03, 5.5818359e-04,
        2.4696307e-03, 8.0414135e-03],
       [1.3150888e-02, 9.1281646e-01, 5.2526403e-02, 8.8036541e-05,
        2.9721402e-03, 1.8446054e-02]], dtype=float32)

In [94]:
results= []
for vec in preds:
    vec = list(vec)
    results.append(vec.index(max(vec)))

In [95]:
from sklearn.metrics import accuracy_score, classification_report

In [96]:
y_true= []
for vec in y_test:
    vec = list(vec)
    y_true.append(vec.index(max(vec)))

In [97]:
print(accuracy_score(y_true, results))

0.6940298507462687


In [40]:
print(classification_report(y_true, results))

              precision    recall  f1-score   support

           0       0.80      0.81      0.80       342
           1       0.52      0.33      0.41        72
           2       0.46      0.56      0.51        45
           3       0.67      0.29      0.40         7
           4       0.51      0.69      0.58        45
           5       0.52      0.48      0.50        25

    accuracy                           0.69       536
   macro avg       0.58      0.53      0.53       536
weighted avg       0.69      0.69      0.69       536



In [41]:
results

[0,
 0,
 0,
 0,
 0,
 4,
 0,
 0,
 1,
 4,
 0,
 0,
 0,
 4,
 0,
 0,
 2,
 0,
 0,
 0,
 5,
 0,
 0,
 0,
 1,
 0,
 2,
 0,
 2,
 0,
 5,
 0,
 0,
 0,
 4,
 0,
 0,
 1,
 4,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 4,
 1,
 0,
 2,
 0,
 2,
 1,
 0,
 2,
 2,
 0,
 0,
 0,
 4,
 4,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 5,
 4,
 4,
 0,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 0,
 1,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 5,
 0,
 0,
 0,
 2,
 1,
 0,
 0,
 0,
 0,
 2,
 5,
 0,
 0,
 1,
 2,
 1,
 0,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 0,
 4,
 1,
 4,
 4,
 0,
 0,
 0,
 2,
 2,
 0,
 1,
 0,
 0,
 0,
 2,
 0,
 4,
 0,
 0,
 0,
 0,
 2,
 0,
 1,
 0,
 4,
 5,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 2,
 5,
 2,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 5,
 1,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 4,
 0,
 0,
 2,
 4,
 0,
 4,
 4,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 1,
 4,
 2,
 0,
 0,
 0,
 1,
 0,
 4,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
