## Import libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Read data

In [2]:
df = pd.read_csv('./dataset/train.csv')
df_test = pd.read_csv('./dataset/test.csv')

df.head(5)

Unnamed: 0,Id,Popularity,Page content
0,0,-1,"<html><head><div class=""article-info""> <span c..."
1,1,1,"<html><head><div class=""article-info""><span cl..."
2,2,1,"<html><head><div class=""article-info""><span cl..."
3,3,-1,"<html><head><div class=""article-info""><span cl..."
4,4,-1,"<html><head><div class=""article-info""><span cl..."


## Preprocessing

In [3]:
import re
from bs4 import BeautifulSoup

def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)

    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

In [4]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

def tokenizer_lemma_nostop(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsuan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsuan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from datetime import datetime

In [6]:
# 定義一個獲取文本長度的函數
def get_text_length(X):
    return np.array([len(text) for text in X]).reshape(-1, 1)

# 定義一個函數來獲取 HTML 中的圖像數量
def get_N_img(X):
    n_img = []
    for html_content in X:
        soup = BeautifulSoup(html_content, 'html.parser')
        n_img.append([len(soup.find_all('img'))])  # 找到所有 <img> 標籤並計算數量
    return np.array(n_img).reshape(-1, 1)

# 定義一個函數來獲取 HTML 中的鏈接數量
def get_N_link(X):
    n_link = []
    for html_content in X:
        soup = BeautifulSoup(html_content, 'html.parser')
        n_link.append([len(soup.find_all('a'))])  # 找到所有 <a> 標籤並計算數量
    return np.array(n_link).reshape(-1, 1)

# 定義一個函數來判斷是否是週末
def get_weekday(X):
    weekdays = []
    for html_content in X:
        soup = BeautifulSoup(html_content, 'html.parser')
        time_tag = soup.find('time')  # 尋找時間標籤

        if time_tag:
            datetime_str = re.search(r'\d{4}-\d{2}-\d{2}', time_tag.text)
            if datetime_str:
                date = datetime.strptime(datetime_str.group(), '%Y-%m-%d').date()
                weekdays.append([date.weekday()])
            else:
                weekdays.append([0])
        else:
            weekdays.append([0])
            
    return np.array(weekdays)

In [7]:
from sklearn.pipeline import Pipeline

preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(ngram_range=(3, 4)), 'Page content'),  # TF-IDF 特徵
        ('length', FunctionTransformer(func=get_text_length, validate=False), 'Page content'),
        ('n_img', FunctionTransformer(func=get_N_img, validate=False), 'Page content'),
        ('n_link', FunctionTransformer(func=get_N_link, validate=False), 'Page content'),
        ('weekday', Pipeline([
            ('get_weekday', FunctionTransformer(func=get_weekday, validate=False)),
            ('one_hot', OneHotEncoder())
        ]), 'Page content'),
    ]
)

In [8]:
x_train, x_val, y_train, y_val = train_test_split(df, df['Popularity'], test_size=0.3, random_state=0)


In [9]:
x_train_prep = preprocessor.fit_transform(x_train)
x_val_prep = preprocessor.transform(x_val)
#y_train = df['Popularity']

## Training 

### logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

clf = LogisticRegression(solver = "liblinear")
clf.fit(x_train_prep, y_train)

y_train

18744   -1
8253     1
18173   -1
25940   -1
27392   -1
        ..
13123   -1
19648   -1
9845    -1
10799   -1
2732     1
Name: Popularity, Length: 19350, dtype: int64

In [None]:
y_pred = clf.predict(x_val_prep)
score = roc_auc_score(y_val, y_pred)

print('logistic regression val auc: %f' % score)  

logistic regression val auc: 0.551266


## Predict

In [None]:
x_test_prep = preprocessor.transform(df_test)

In [None]:
y_test_pred = clf.predict_proba(x_test_prep)
df_test['Popularity'] = y_test_pred[:,1]
df_test.head()


In [None]:
result = df_test[['Id', 'Popularity']]
result.to_csv('output.csv', index=False)