In [1]:
# run this code when running the code on Google Colab
from google.colab import drive
drive.mount('/content/drive')
import sys
sys.path.insert(0,'/content/drive/MyDrive/Applied_ML_Project/')

Mounted at /content/drive


In [2]:
# importing general packages
import numpy as np
import pandas as pd

# importing packages for text pre-porcessing
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from bs4 import BeautifulSoup # package used for web scrapping to remove the HTML tags from the text ( not needed here)
import re # A package dealing with regular experession to remove punctuation and numbers
from nltk.stem import PorterStemmer
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet

# importing packages for vectorization of text data
from sklearn.feature_extraction.text import CountVectorizer # bag of words vectorization
from sklearn.feature_extraction.text import TfidfVectorizer # Tf - idf vectorization
from sklearn.preprocessing import LabelEncoder

#importing packages for model-building
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,StratifiedKFold,LeaveOneOut
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import itertools
import matplotlib.pyplot as plt

# importing packages for sentiment analysis
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#hiding all warnings
import warnings
warnings.filterwarnings('ignore')

import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [3]:
train_data_path = "/content/drive/MyDrive/Applied_ML_Project/data/train.csv"
val_data_path = "/content/drive/MyDrive/Applied_ML_Project/data/test.csv"
val_label_path = "/content/drive/MyDrive/Applied_ML_Project/data/test_labels.csv"
train_policy_data = pd.read_csv(train_data_path)
train_policy_data.head()

Unnamed: 0,index,Policy_text,Category_Data Retention,Category_Data Security,Category_Do Not Track,Category_First Party Collection/Use,Category_International and Specific Audiences,Category_Other,Category_Policy Change,Category_Third Party Sharing/Collection,"Category_User Access, Edit and Deletion",Category_User Choice/Control
0,14651,<strong> 1. What is the personal information t...,0,0,0,1,0,0,0,0,0,0
1,22227,Appointment Information to schedule an appoint...,0,0,0,1,0,0,0,0,0,0
2,176,Business Transitions <br> <br> In the event Re...,0,0,0,0,0,0,0,1,0,0
3,16818,b. Cookies. <br> <br> (i) Hearst (or third par...,0,0,0,0,0,0,0,1,0,0
4,20169,II. Sharing of information. <br> <br> We may s...,0,0,0,0,0,0,0,1,0,0


In [4]:
val_policy_data = pd.read_csv(val_data_path)
val_policy_data.head()

Unnamed: 0,index,Policy_text,Category_Data Retention,Category_Data Security,Category_Do Not Track,Category_First Party Collection/Use,Category_International and Specific Audiences,Category_Other,Category_Policy Change,Category_Third Party Sharing/Collection,"Category_User Access, Edit and Deletion",Category_User Choice/Control
0,4291,<ul> <li> <strong> Opening an Account or Using...,0,0,0,0,0,0,0,1,0,0
1,15427,We will not sell or rent the personally identi...,0,0,0,0,0,0,0,1,0,0
2,23013,<ul> <li> further our business or operational ...,0,0,0,1,0,0,0,0,0,0
3,18471,<strong> Collection of Information From Other ...,0,0,0,1,0,0,0,0,0,0
4,13879,<ul> <li> If you have created an account on on...,0,0,0,0,0,0,0,0,1,0


In [5]:
val_label_data = pd.read_csv(val_label_path)
val_label_data.head()

Unnamed: 0,ID,Category_Data Retention,Category_Data Security,Category_Do Not Track,Category_First Party Collection/Use,Category_International and Specific Audiences,Category_Other,Category_Policy Change,Category_Third Party Sharing/Collection,"Category_User Access, Edit and Deletion",Category_User Choice/Control
0,25593,0,1,0,0,0,0,0,0,0,0
1,3040,0,0,0,1,0,0,0,0,0,0
2,17764,0,0,0,0,0,0,0,0,1,0
3,24170,0,0,0,1,0,0,0,0,0,0
4,13562,0,0,0,1,0,0,0,0,0,0


Text processing

In [6]:
# function to clean data

# initializing stemming algorithm
ps = PorterStemmer() # porter stemmer
ss = SnowballStemmer('english') # snowball stemmer

# initializing lemmatizing algorithm
wnl = WordNetLemmatizer() # lemmatization
def text_clean_preprocess(raw_text : str):
    """
    This function will clean the data and perform certain preprocessing step of stemming/lemmatizing the words in 
    the tweet. Finally the function will remove stopwords and will only consider with words with a character length
    ranging from 2 to 10
    """
    vowel = ['a','e','i','o','u']
    text = BeautifulSoup(raw_text) # using the BeautifulSoup library to scrape the HTML tags from the text
    text = re.sub("[^a-zA-Z0-9]"," ",text.get_text()) # removing all the punctions except the expressions ":)" and ":("
    text = text.lower() # converting all words to lowercase
    text = text.strip() # striping leading and trailing white spaces
    words = text.split()
    meaningful_words = [wnl.lemmatize(w) for w in words if w not in stopwords.words("english")and w[:4] != "http"]
    return " ".join(meaningful_words)  

Text Processing on Train Data

In [7]:
clean_policy_text = []
for text in train_policy_data['Policy_text']:
  cleaned_text = text_clean_preprocess(text)
  clean_policy_text.append(cleaned_text)
train_policy_data['Cleaned_Policy_text'] = clean_policy_text

Text Processing on Test Data

In [8]:
val_clean_policy_text = []
for val_text in val_policy_data['Policy_text']:
  val_cleaned_text = text_clean_preprocess(val_text)
  val_clean_policy_text.append(val_cleaned_text)
val_policy_data['Cleaned_Policy_text'] = val_clean_policy_text

In [9]:
train_policy_data.head()

Unnamed: 0,index,Policy_text,Category_Data Retention,Category_Data Security,Category_Do Not Track,Category_First Party Collection/Use,Category_International and Specific Audiences,Category_Other,Category_Policy Change,Category_Third Party Sharing/Collection,"Category_User Access, Edit and Deletion",Category_User Choice/Control,Cleaned_Policy_text
0,14651,<strong> 1. What is the personal information t...,0,0,0,1,0,0,0,0,0,0,1 personal information collect depending choos...
1,22227,Appointment Information to schedule an appoint...,0,0,0,1,0,0,0,0,0,0,appointment information schedule appointment c...
2,176,Business Transitions <br> <br> In the event Re...,0,0,0,0,0,0,0,1,0,0,business transition event redorbit inc go busi...
3,16818,b. Cookies. <br> <br> (i) Hearst (or third par...,0,0,0,0,0,0,0,1,0,0,b cooky hearst third party service provider be...
4,20169,II. Sharing of information. <br> <br> We may s...,0,0,0,0,0,0,0,1,0,0,ii sharing information may share pii non perso...


In [10]:
val_policy_data.head()

Unnamed: 0,index,Policy_text,Category_Data Retention,Category_Data Security,Category_Do Not Track,Category_First Party Collection/Use,Category_International and Specific Audiences,Category_Other,Category_Policy Change,Category_Third Party Sharing/Collection,"Category_User Access, Edit and Deletion",Category_User Choice/Control,Cleaned_Policy_text
0,4291,<ul> <li> <strong> Opening an Account or Using...,0,0,0,0,0,0,0,1,0,0,opening account using chase paymentech service...
1,15427,We will not sell or rent the personally identi...,0,0,0,0,0,0,0,1,0,0,sell rent personally identifiable information ...
2,23013,<ul> <li> further our business or operational ...,0,0,0,1,0,0,0,0,0,0,business operational purpose example data anal...
3,18471,<strong> Collection of Information From Other ...,0,0,0,1,0,0,0,0,0,0,collection information third party source may ...
4,13879,<ul> <li> If you have created an account on on...,0,0,0,0,0,0,0,0,1,0,created account one website log account able e...


In [11]:
# Initializing countvectorizer
corpus = clean_policy_text
cv = CountVectorizer(max_features=5000)
vectorizer = TfidfVectorizer(max_features=5000)

#converting text from the train data and test data into vectors
# train_vector = cv.fit_transform(corpus)
data_vector = vectorizer.fit_transform(corpus)
print(data_vector.toarray())

# Performing vectorization on validation data
validation_vector = vectorizer.transform(val_clean_policy_text)
print(validation_vector.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.14482622 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


Model Training

In [13]:
# Generating the training dataframe
train_X = pd.DataFrame(data_vector.toarray(), columns = vectorizer.get_feature_names())
print(train_X.head())

# Generating validation dataframe
validation_X = pd.DataFrame(validation_vector.toarray(), columns = vectorizer.get_feature_names())
print(validation_X.head())

AttributeError: ignored

In [None]:
train_Y = train_policy_data[['Category_Data Retention',
       'Category_Data Security', 'Category_Do Not Track',
       'Category_First Party Collection/Use',
       'Category_International and Specific Audiences', 'Category_Other',
       'Category_Policy Change', 'Category_Third Party Sharing/Collection',
       'Category_User Access, Edit and Deletion',
       'Category_User Choice/Control']]

test_Y = val_label_data[['Category_Data Retention',
       'Category_Data Security', 'Category_Do Not Track',
       'Category_First Party Collection/Use',
       'Category_International and Specific Audiences', 'Category_Other',
       'Category_Policy Change', 'Category_Third Party Sharing/Collection',
       'Category_User Access, Edit and Deletion',
       'Category_User Choice/Control']]

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
clf = OneVsRestClassifier(SVC()).fit(train_X, train_Y)

In [None]:
y_pred = clf.predict(validation_X)

In [None]:
from sklearn.metrics import f1_score
f1_micro = f1_score(test_Y, y_pred, average='micro')
f1_macro = f1_score(test_Y, y_pred, average='macro')

In [None]:
f1_micro

In [None]:
f1_macro