# **DL Competition 1 Report**

---

**Student Name:** 葛奕宣 
**Student ID:** 113062574  

# Load Package

In [47]:
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

# Download required NLTK data files (only need to do this once)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /home/ryanke/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ryanke/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ryanke/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Load Data

In [48]:
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

# Preprocessing and Feature Extraction:
---
我主要使用BS4進行feature extraction，以下是我提取出的feature(並沒有全數用到)
* datetime:
    * 透過datetime library進行datetime的轉換，並且把string都改成int(ex: wed to 3)，並且要handle沒有datetime文章的case，我只有在testcase中看到，我根據文章內容hard code了一個時間
* title
    * 從h1'title'取得
* article-topics
    * 從'footer.article-topics'取得
* \# of images
    * 用len()配上find_all取得
* article length
    * 先取得article text再用len()取得
* \# of links
    * 用len()配上find_all取得
* channel
    * 尋找'data-channel'

針對文字我都有進行tolower(), 並且有透過re library去remove punctuation

 

In [49]:
import re
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd  # Import pandas

# Function to remove punctuation from a string
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

def extract_article_features(html):
    soup = BeautifulSoup(html, 'html.parser')

    # Extract datetime and convert to year, month, day, hour, and weekday (1 for Monday, 7 for Sunday)
    time_element = soup.find('time')
    if time_element and time_element.has_attr('datetime'):
        # Extract datetime, convert to year, month, day, and hour
        datetime_str = time_element['datetime']
        try:
            # Update the format to match 'Wed, 19 Jun 2013 15:04:30 +0000'
            article_datetime = datetime.strptime(datetime_str, '%a, %d %b %Y %H:%M:%S %z')
            article_year = article_datetime.year
            article_month = article_datetime.month
            article_day = article_datetime.day
            article_weekday = article_datetime.isoweekday()  # Monday is 1, Sunday is 7
            article_hour = article_datetime.hour
        except ValueError:
            # Handle case where the datetime format is incorrect or missing
            article_year = 2024
            article_month = 1
            article_day = 9
            article_weekday = 4
            article_hour = 19
    else:
        article_year = 2024
        article_month = 1
        article_day = 9
        article_weekday = 4
        article_hour = 19

    # Extract title, convert to lowercase and remove punctuation
    title = soup.find('h1', class_='title').text.strip().lower() if soup.find('h1', class_='title') else None
    title = remove_punctuation(title) if title else None

    # Extract author, convert to lowercase and remove punctuation
    article_info = soup.head.find('div', {'class': 'article-info'})
    author_name = article_info.find('span', {'class': 'author_name'}) if article_info else None
    if author_name:
        author = author_name.get_text()
    elif article_info and article_info.span:
        author = article_info.span.string
    elif article_info and article_info.a:
        author = article_info.a.string
    else:
        author = None
    author = author.lower() if author else None
    if author and author.startswith('by '):
        author = author[3:]
    author = remove_punctuation(author) if author else None

    # Extract article topics, convert to lowercase and remove punctuation
    article_topics = ' '.join([remove_punctuation(topic.text.lower()) for topic in soup.select('footer.article-topics a')])

    # Count the number of images
    num_images = len(soup.find_all('img'))

    # Calculate article length (number of characters in the article)
    article_text = ''.join([p.text for p in soup.select('article p')]).lower()  # Convert article text to lowercase
    article_length = len(article_text)

    # Count the number of links
    num_links = len(soup.find_all('a'))

    # Extract channel and convert to lowercase
    article_channel = soup.find('article').get('data-channel', None)
    article_channel = article_channel.lower() if article_channel else None

    return {
        'year': article_year,
        'month': article_month,
        'day': article_day,
        'weekday': article_weekday,
        'hour': int(article_hour) if article_hour is not None else None,
        'title': title,
        'author': author,
        'topics': article_topics,
        'num_images': num_images,
        'article_length': article_length,
        'num_links': num_links,
        'channel': article_channel
    }

features = []
for page in train_data["Page content"]:
    features.append(extract_article_features(page))
for page in test_data["Page content"]:
    features.append(extract_article_features(page))

# Convert the list of feature dictionaries into DataFrames
combine_df = pd.DataFrame(features)

# Preprocessing
* 這邊主要針對文字進行tokenize跟lemmatization，兩者皆是用nltk的內建function
* 接著我使用了sklearn的columntransformer，他把轉換過程包起來，方便針對不同model reuse和客製化
* 這裡也是drop column的地方，在不斷實驗後決定drop掉num_image, num_links

In [50]:
# Define tokenizers
def tokenizer(text):
    # Initialize the lemmatizer
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

clf_transform = ColumnTransformer(
    transformers=[
        ('author_vect', CountVectorizer(tokenizer=tokenizer, lowercase=False), 'author'),
        ('topic_vect', CountVectorizer(tokenizer=tokenizer, lowercase=False), 'topics'),
        ('title_vect', CountVectorizer(tokenizer=tokenizer, lowercase=False), 'title'),
        ('channel_vect', CountVectorizer(tokenizer=tokenizer, lowercase=False), 'channel')
    ],
    remainder='passthrough',
    n_jobs=-1
)

combine_df.drop(columns=['num_images','num_links'], inplace=True)
combine_df = clf_transform.fit_transform(combine_df)



In [51]:
from sklearn.model_selection import train_test_split

train_df = combine_df[:len(train_data)].astype(np.float32)
test_df = combine_df[len(train_data):].astype(np.float32)
y_train_all = (train_data['Popularity'].values == 1).astype(np.float32)

X_train, X_valid, y_train, y_valid = train_test_split(
    train_df, y_train_all, test_size=0.2, random_state=42)


In [52]:
# from sklearn.model_selection import GridSearchCV, StratifiedKFold
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import roc_auc_score
# import numpy as np

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 500],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
# }

# # Cross-validation strategy
# cv_strategy = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# # Initialize the RandomForestClassifier
# rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# # Initialize GridSearchCV
# grid_search = GridSearchCV(
#     estimator=rf,
#     param_grid=param_grid,
#     scoring='roc_auc',
#     cv=cv_strategy,
#     n_jobs=-1,
#     verbose=2
# )

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

# # Best parameters and estimator
# best_params = grid_search.best_params_
# best_estimator = grid_search.best_estimator_
# print("Best parameters found: ", best_params)

# # Evaluate on training data
# y_train_pred_proba = best_estimator.predict_proba(X_train)[:, 1]
# train_auc = roc_auc_score(y_train, y_train_pred_proba)
# print('Train ROC AUC of best estimator:', train_auc)

# # Evaluate on validation data
# y_valid_pred_proba = best_estimator.predict_proba(X_valid)[:, 1]
# valid_auc = roc_auc_score(y_valid, y_valid_pred_proba)
# print('Validation ROC AUC of best estimator:', valid_auc)


# Training
* 我使用model ensemble的方式，實作方式是用VotingClassifier
* Model的挑選上，我嘗試了四個比較有名的model, random forest, lightgbm, catboost, XGboost
* 其中XGboost的分數明顯比其他低，所以我放棄了他(code註解掉)
* 在Model tuning的部分，我有用gridsearch, random search去嘗試不同參數的組合，其中learning_rate, n_estimator是我認為最重要的參數，調整後模型分數顯著提高

In [53]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

rf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
print(roc_auc_score(y_train,rf.predict_proba(X_train)[:, 1]))
print(roc_auc_score(y_valid,rf.predict_proba(X_valid)[:, 1]))

1.0
0.5818678872078772


In [54]:
# from lightgbm import LGBMClassifier

# # Cross-validation strategy
# cv_strategy = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# # Initialize the LGBMClassifier
# lg = LGBMClassifier(random_state=42, n_jobs=-1)

# # Define the parameter grid
# param_grid = {
#     'num_leaves': [31, 50, 70],
#     'max_depth': [-1, 10, 20, 30],
#     'learning_rate': [0.1, 0.05, 0.01],
#     'n_estimators': [100, 200, 500],
# }

# # Initialize RandomizedSearchCV
# random_search = RandomizedSearchCV(
#     estimator=lg,
#     param_distributions=param_grid,
#     n_iter=50,
#     scoring='roc_auc',
#     cv=cv_strategy,
#     random_state=42,
#     n_jobs=-1,
#     verbose=2
# )

# # Fit the random search to the data
# random_search.fit(X_train, y_train)

# # Best parameters and estimator
# best_params = random_search.best_params_
# best_estimator = random_search.best_estimator_
# print("Best parameters found: ", best_params)

# # Evaluate on training data
# y_train_pred_proba = best_estimator.predict_proba(X_train)[:, 1]
# train_auc = roc_auc_score(y_train, y_train_pred_proba)
# print('Train ROC AUC of best estimator:', train_auc)

# # Evaluate on validation data
# y_valid_pred_proba = best_estimator.predict_proba(X_valid)[:, 1]
# valid_auc = roc_auc_score(y_valid, y_valid_pred_proba)
# print('Validation ROC AUC of best estimator:', valid_auc)


In [55]:
from lightgbm import LGBMClassifier

lg = LGBMClassifier(n_estimators=40, random_state=42, n_jobs=-1,learning_rate=0.05)
lg.fit(X_train, y_train)
print(roc_auc_score(y_train,lg.predict_proba(X_train)[:, 1]))
print(roc_auc_score(y_valid,lg.predict_proba(X_valid)[:, 1]))

[LightGBM] [Info] Number of positive: 10916, number of negative: 11198
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.121998 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6792
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 2684
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493624 -> initscore=-0.025506
[LightGBM] [Info] Start training from score -0.025506
0.666185359946559
0.593456859510363


In [56]:
from catboost import CatBoostClassifier
cb = CatBoostClassifier(n_estimators=500, random_state=42, verbose=0, learning_rate=0.01)
cb.fit(X_train, y_train)
print(roc_auc_score(y_train,cb.predict_proba(X_train)[:, 1]))
print(roc_auc_score(y_valid,cb.predict_proba(X_valid)[:, 1]))

0.6270849720848046
0.588237757895569


In [57]:
# from xgboost import XGBClassifier
# # xgb = XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1)
# xgb.fit(X_train, y_train) 
# print(roc_auc_score(y_train,xgb.predict_proba(X_train)[:, 1]))
# print(roc_auc_score(y_valid,xgb.predict_proba(X_valid)[:, 1]))

# 

# Voting Classifier
* 進行training和validation
* 原本沒有進行weighting, 但嘗試過後，把分數較高的model提高了比例，準確度上升很多!

In [58]:
# combine the model
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators=[('rf', rf), ('lg', lg), ('cb', cb)], voting='soft', weights=[0.01, 1, 0.5])
voting_clf.fit(X_train, y_train)
print(roc_auc_score(y_train,voting_clf.predict_proba(X_train)[:, 1]))
print(roc_auc_score(y_valid,voting_clf.predict_proba(X_valid)[:, 1]))

[LightGBM] [Info] Number of positive: 10916, number of negative: 11198
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061552 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6792
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 2684
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493624 -> initscore=-0.025506
[LightGBM] [Info] Start training from score -0.025506
0.688523095490734
0.5956656633649682


# Prediction
* 最後使用全部資料進行訓練
* 進行test prediction

In [59]:
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
lg = LGBMClassifier(n_estimators=40, random_state=42, n_jobs=-1,learning_rate=0.05)
cb = CatBoostClassifier(n_estimators=500, random_state=42, verbose=0, learning_rate=0.01)

voting_clf = VotingClassifier(estimators=[('rf', rf), ('lg', lg), ('cb', cb)], voting='soft', weights=[0.1, 1, 0.5])
voting_clf.fit(train_df, y_train_all)
print(roc_auc_score(y_train_all,voting_clf.predict_proba(train_df)[:, 1]))

[LightGBM] [Info] Number of positive: 13632, number of negative: 14011
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.084726 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8068
[LightGBM] [Info] Number of data points in the train set: 27643, number of used features: 3219
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493145 -> initscore=-0.027423
[LightGBM] [Info] Start training from score -0.027423
0.6761680355609258


In [60]:
final_y = voting_clf.predict_proba(test_df)[:, 1]

df_pred = pd.DataFrame({'Id': test_data['Id'], 'Popularity': final_y})
df_pred.to_csv('prediciton.csv', index=False)

# Conclusion
* 這次作業讓我意識到原來Feature Extraction和Model tuning竟然能讓模型的分數差這麼多
* 我認為最大的pitfall就是使用大量的段落文字，我嘗試過第一段、全部文章，效果都非常差，我想可能是資料的數量不夠多，所以文字跟結果的關聯性非常低
* 學習到了bs4這個強大的HTML提取package，以及re處理文字的package，兩者結合起來讓我對分析網站內容更加熟悉
* 上網學到了model ensemble的技術，在資源足夠的情況下結合多個model確實能有幫助
* 接觸到了不同的classifier, 其中LightGBM最令我驚豔，速度快很多，準確度也是最高的
* 最後是我原本以為votingclassifier若傳入已經train過的模型，會直接使用，上網查後原來都會retrain