In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import csv
import re

In [2]:
# concatenate all data
# Each ad category lives in its own CSV; read them in a fixed order
# (popup, sticky, acceptable) and stack the rows into a single frame.
source_paths = (
    '/Users/SeoyeonHong/Desktop/annoying_ad_classifier/data_collection/data/popup_ad_code.csv',
    '/Users/SeoyeonHong/Desktop/annoying_ad_classifier/data_collection/data/sticky_ad_code.csv',
    '/Users/SeoyeonHong/Desktop/annoying_ad_classifier/data_collection/data/acceptable_ad_code.csv',
)
popup, sticky, acceptable = (pd.read_csv(csv_path) for csv_path in source_paths)

# Row order of df follows the order of the list passed to concat.
df = pd.concat([popup, sticky, acceptable])

In [3]:
# retrieve divs, classes, styles
# Pull the n_divs, n_classes and n_style columns out of df as plain Python lists.
divs, classes, styles = (
    df[column].to_list() for column in ('n_divs', 'n_classes', 'n_style')
)

In [4]:
# put together all divs, classes, styles for creating final dataset
# One text document per row: the string form of the [div, class, style] list.
data = [str(list(row_parts)) for row_parts in zip(divs, classes, styles)]
df["data"] = data

In [5]:
# remove noise from data
# Every call states `regex=` explicitly: pandas 2.0 changed the default of
# `Series.str.replace` to regex=False, which would silently turn the regex
# patterns below into literal strings that never match.
# Substring clean-ups go through `.str.replace`; `Series.replace` without
# regex=True only swaps cells whose ENTIRE value equals the key, so it cannot
# remove 'xa0', ' t ' or ' n ' from inside a longer document.
df["data"] = (
    df["data"]
    # Anything that is neither a word character nor whitespace becomes a space.
    # This already covers backslashes, so no separate backslash pass is needed.
    .str.replace(r'[^\w\s]', ' ', regex=True)
    # Residue left once the backslash has been stripped from an escape sequence.
    # NOTE(review): presumably these come from \xa0, \t and \n in the
    # stringified lists -- confirm against the source CSVs.
    .str.replace('xa0', ' ', regex=False)
    .str.replace(' t ', ' ', regex=False)
    .str.replace(' n ', ' ', regex=False)
    # Drop stand-alone single-character tokens.
    .str.replace(r'\b\w\b', ' ', regex=True)
    # Collapse every run of whitespace into one space.
    .str.replace(r'\s+', ' ', regex=True)
)

In [6]:
# set stopwords
# Corpus-specific junk tokens come first, followed by NLTK's English stop words.
from nltk.corpus import stopwords

eng_stop = stopwords.words('english')
stopword = ["u200c", "u200c6", 'u200c60', '𐌰𐌽𐌰𐌵𐌹𐍃𐍃']
stopword.extend(eng_stop)

In [7]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# creating index for dataset
# One "URL<n>" label (1-based) per row of df. Sizing the list from df keeps it
# in step with the matrix that is later built from df['data']; a fixed count
# would make the DataFrame constructor fail as soon as the CSVs gain or lose rows.
index = [f"URL{row_number}" for row_number in range(1, len(df) + 1)]

In [9]:
# using CountVectorizer to transform data for machine learning input
# Each entry of df['data'] is treated as one document; the result is a
# document-term count matrix with one column per vocabulary token.
corpora = df['data']

vec = CountVectorizer(encoding='utf-8', decode_error= 'strict', strip_accents = 'unicode', stop_words = stopword)
wm = vec.fit_transform(corpora)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer the replacement and fall back only on installs that predate it.
if hasattr(vec, "get_feature_names_out"):
    feat_names = vec.get_feature_names_out()
else:
    feat_names = vec.get_feature_names()

index_names = index
# toarray() densifies the sparse matrix so it can back a regular DataFrame.
cvec = pd.DataFrame(data = wm.toarray(), index = index_names, columns = feat_names)

In [10]:
# Preview the count matrix: rows are the URL labels, columns are vocabulary tokens.
cvec

Unnamed: 0,00,000,000000,000000e,0000043px,000004px,00001px,001em,0023,0034px,...,zlotych,zonedmodule,zoom,zopim,zrmzoje,zte,zu4egz,zuqda9b,zvlbjap,zxx8xmr
URL1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
URL2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
URL3,0,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
URL4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
URL5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
URL448,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
URL449,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
URL450,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
URL451,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# inserting Ad types (dependent variable) into cvec
# The TYPE values from df become the first column of cvec (modified in place).
labels = df['TYPE'].to_list()
cvec.insert(loc=0, column='TYPE', value=labels, allow_duplicates=False)

In [12]:
cvec['TYPE'] # acceptable ads are also referred to as "better" ads, which is the label value that appears in TYPE

URL1       popup
URL2       popup
URL3       popup
URL4       popup
URL5       popup
           ...  
URL448    better
URL449    better
URL450    better
URL451    better
URL452    better
Name: TYPE, Length: 452, dtype: object

In [13]:
# saving file: write the labelled count matrix (row labels included) to CSV
output_csv = '/Users/SeoyeonHong/Desktop/annoying_ad_classifier/ad_vectors.csv'
cvec.to_csv(output_csv)