# 伯伯會社
## 組員
## 江伯耕 108024517
## 蔣嘉霖 108024514
## 周秉儒 107024703
## 陳炘昱 107024701

In [None]:
#%%
%matplotlib inline
from bs4 import BeautifulSoup
import pandas as pd 
import numpy as np
import scipy as sp
import spacy
from spacy.lang.en import English
from scipy.sparse import csr_matrix
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import datetime as datetime
from datetime import date
import pytz as pytz
from gensim import corpora,models
import pickle
import holidays
from nltk.tokenize import word_tokenize 
from nltk.corpus import wordnet
import gc
from sklearn.preprocessing import StandardScaler

# 一、建構feature 

## 首先去除html tags後，將文章的內容做tokenization處理，並利用lemmatization做詞性的還原以及刪除Stop-words後抓取下列的feature： 
1. 標題(Bag-Of-Words) 
2. 作者姓名 (one hot encoding) 
3. 時間 
4. 文章channel (one hot encoding) 
5. 文章底部標籤 (Bag-Of-Words) 
6. 文章內容 (Latent Dirichlet Allocation) 
7. figure caption內容 (Bag-Of-Words) 
8. Seealso 數量 
9. Instagram數量 
10. Twitter數量
11. mashable連結數量
12. 文章長度 
13. 文章字母數 
14. 文章句子數 
15. 文章平均字長 
16. 文章平均句長 
17. 文章圖片數 
18. Bonus數量 
19. 文章斜體字數量 
20. 文章gallery數量 

In [5]:
# Load the raw competition tables (train and test).
# NOTE(review): these are absolute paths on the original author's Windows
# machine; they must be adjusted before the notebook can be re-run elsewhere.
df_raw = pd.read_csv("C:\\Users\\stat_pc\\Desktop\\深度學習\\Competition 01\\train.csv") 
df_raw_test = pd.read_csv("C:\\Users\\stat_pc\\Desktop\\深度學習\\Competition 01\\test.csv")

In [None]:
#%%
# Shared text-processing globals used by the helper functions in this cell.
# NLTK resources that must be available (uncomment to download once):
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

# Default scikit-learn word analyzer, reused by lemma_words.
analyzer = CountVectorizer().build_analyzer()
# Blank English spaCy pipeline, used by tokenize.
parser = English()

# Project-specific stop words added on top of NLTK's English list.
my_stop_word = ["also","would","year","time","take","could","make","show",
                "know","like","associate","even","much","many","much","something",
                "things","look","said","says","say","on","in","it","our",
                "an","and","are","at","for","of","he"]
stop_word = nltk.corpus.stopwords.words('english')
stop_word.extend(my_stop_word)
# Set form for fast membership tests in prepare_text_for_lda.
en_stop = set(stop_word)
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Only the first letter of *tag* is inspected: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag yields None so
    that callers can pick their own fallback.
    """
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('R'):
        return wordnet.ADV
    return None
def lemma_words(doc):
    """Split *doc* into tokens and return the WordNet lemma of each one.

    Tokens come from the module-level ``analyzer`` (the analyzer of a default
    CountVectorizer). Each token is POS-tagged on its own and lemmatised with
    the matching WordNet part of speech, falling back to NOUN when the tag
    has no WordNet equivalent.

    Parameters
    ----------
    doc : str
        Raw text to analyse.

    Returns
    -------
    list of str
        One lemma per token, in token order.
    """
    # Build the lemmatizer once per document rather than once per token.
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in analyzer(doc):
        # Tagging a one-word list: the tagger gets no sentence context.
        treebank_tag = nltk.pos_tag([token])[0][1]
        pos = get_wordnet_pos(treebank_tag) or wordnet.NOUN
        lemmas.append(lemmatizer.lemmatize(token, pos=pos))
    return lemmas
def get_lemma2(word):
    """Return the WordNet lemma of a single *word*.

    The word is POS-tagged in isolation; when the tag has no WordNet
    counterpart the word is lemmatised as a noun.
    """
    tagged = nltk.pos_tag([word])
    treebank_tag = tagged[0][1]
    wn_pos = get_wordnet_pos(treebank_tag)
    if not wn_pos:
        wn_pos = wordnet.NOUN
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word, pos=wn_pos)
def get_lemma3(word):
    """Lemmatise a tag that may consist of one word or of several words.

    A string without whitespace is lemmatised directly with ``get_lemma2``.
    Anything else is run through ``lemma_words`` and the resulting lemmas
    are joined back together with single spaces.

    Parameters
    ----------
    word : str
        A single word or a whitespace-separated phrase.

    Returns
    -------
    str
        The lemmatised word or phrase.
    """
    # Raw string: '\s' is not a valid escape in an ordinary string literal.
    if len(re.split(r'\s+', word)) == 1:
        return get_lemma2(word)
    return " ".join(lemma_words(word))
def tokenize(text):
    """Tokenise *text* with the module-level spaCy ``parser``.

    Only the first 1,000,000 characters are processed. Whitespace tokens are
    dropped, URL-like tokens become the placeholder 'URL', tokens starting
    with '@' become 'BLOG_ID', and every other token is lower-cased.
    """
    lda_tokens = []
    # Slicing is a no-op for shorter texts, so no explicit length test is needed.
    for token in parser(text[:1000000]):
        surface = token.orth_
        if surface.isspace():
            continue
        if token.like_url:
            lda_tokens.append('URL')
        elif surface.startswith('@'):
            lda_tokens.append('BLOG_ID')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens
def prepare_text_for_lda(text):
    """Tokenise *text* and keep only tokens suitable for topic modelling.

    A token is kept when it is longer than three characters and is not in
    the ``en_stop`` stop-word set. The three-character 'URL' placeholder
    produced by ``tokenize`` is therefore dropped as well.
    """
    return [
        candidate
        for candidate in tokenize(text)
        if len(candidate) > 3 and candidate not in en_stop
    ]
def transform_to_data_mat(lda_document_distribution,k):
    """Expand sparse per-document topic weights into a dense matrix.

    Parameters
    ----------
    lda_document_distribution : iterable
        One entry per document; each entry is an iterable of
        ``(topic_index, weight)`` pairs. Topics missing from an entry get
        weight 0.
    k : int
        Number of topics, i.e. the width of every output row.

    Returns
    -------
    numpy.ndarray
        One row of length *k* per document, in input order.
    """
    rows = []
    for distribution in lda_document_distribution:
        row = np.zeros(k)
        for topic_index, weight in distribution:
            row[topic_index] = weight
        rows.append(row)
    # The former `w != []` filter was removed: every row is a length-k array,
    # so it never dropped anything, and comparing an ndarray with a list
    # raises on current NumPy releases.
    return np.array(rows)
def text_to_token(text):
    """Convert an iterable of documents into lists of lemmatised tokens.

    Each document is filtered with ``prepare_text_for_lda`` and every
    surviving token is lemmatised with ``get_lemma2``. The result has one
    token list per input document, in the same order.
    """
    return [
        [get_lemma2(token) for token in prepare_text_for_lda(document)]
        for document in text
    ]
def lda_modeling_outcome(token_text,k):
    """Fit an LDA model and return the document-topic weight matrix.

    Side effects: writes the bag-of-words corpus to 'corpus.pkl' and the
    dictionary to 'dictionary.gensim' in the current working directory.

    Parameters
    ----------
    token_text : list of list of str
        Tokenised documents.
    k : int
        Number of topics.

    Returns
    -------
    numpy.ndarray
        One row per document and one column per topic, as produced by
        ``transform_to_data_mat``.
    """
    dictionary = corpora.Dictionary(token_text)
    corpus = [dictionary.doc2bow(text) for text in token_text]
    # Context manager so the pickle file is flushed and closed promptly.
    with open('corpus.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    dictionary.save('dictionary.gensim')
    ldamodel = models.LdaModel(corpus, num_topics=k,
                               id2word=dictionary, passes=5)
    document_topics = ldamodel.get_document_topics(corpus)
    return transform_to_data_mat(document_topics, k)
def feature_ect(text) :
    """Extract the metadata features of one article's HTML.

    Parameters
    ----------
    text : str
        HTML of a single article. The code reads its <h1>, <time>, <footer>
        and <article data-channel=...> elements and raises if one of them
        is missing.

    Returns
    -------
    list
        ``[reporter name, footer topics joined with '_', channel,
        time string, title, 0, number of topics, number of title words]``.
        Position 5 is a constant placeholder for the figure caption, which
        is extracted separately by ``feature_ect_fig_content``.
    """
    soup = BeautifulSoup(text,"html.parser")

    # Title: lower-cased, then a few punctuation marks and possessive
    # suffixes are removed. Every match is replaced (no count limit needed).
    title = soup.h1.get_text().lower()
    title = re.sub(r";|'.'|#|,|’s|'s|'", "", title)
    title_len = len(re.split(r"\s+", title))

    # Publication time, kept as the raw string found in the <time> element.
    d_string = str(soup.find("time").get_text())

    # Author: first <span> if there is one, otherwise the first `div > a` link.
    if soup.span is None:
        name = soup.select('div>a')[0].get_text()
    else:
        name = soup.span.get_text()
    # Strip a leading "By " and whatever follows the name (a digit or comma).
    # NOTE(review): the original spelled this pattern as two adjacent string
    # literals ("[" "|b|B]..."), which concatenate to the text below, so the
    # first character class holds no space - confirm this is intended.
    byline = re.match(r"[|b|B][Y|y] .*?([0-9]|\,)", name)
    if byline is not None:
        name = name[(byline.start() + 3):(byline.end() - 1)]

    # Footer topics: lemmatised, counted, then joined into one string so a
    # CountVectorizer can split them again on '_'.
    topics = [get_lemma3(link.get_text().lower())
              for link in soup.footer.find_all('a')]
    topic_group_len = len(topics)
    topic_group = "_".join(topics)

    # 0:reporter name,1:topic,2:cate,3:time,4:title,5:figure caption,6:topic len,7:title len
    return [name.lower(), topic_group,
            soup.article.attrs['data-channel'].lower(),
            d_string, title, 0, topic_group_len, title_len]

def data_precess_routine(train,test):
    """Build the metadata and bag-of-words feature tables for train and test.

    Train and test are stacked so that vocabularies and one-hot columns are
    shared, features are extracted per article with ``feature_ect``, and the
    result is split back into the two parts.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs the columns 'Id', 'Page content' and 'Popularity'.
    test : pandas.DataFrame
        Needs the columns 'Id' and 'Page content'.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_frame, test_frame)``; ``train_frame`` has 'Popularity' as
        its first column, ``test_frame`` keeps the stacked row index.
    """
    n_train = train.shape[0]
    Y_train = train['Popularity']
    dtf = pd.concat([train[["Id","Page content"]],test],ignore_index=True)
    # np.array turns the mixed str/int feature rows into an array of strings.
    ft_list = np.array([feature_ect(page) for page in dtf["Page content"]])

    # Normalise every timestamp to UTC and split it into calendar parts.
    stamp_format = "%Y-%m-%d %H:%M:%S %z"
    us_holidays = holidays.US()  # built once, reused for every row
    year = []; month = []; day = []; hour = []; hday = []; weekday = []
    for raw_time in ft_list[:, 3]:
        if raw_time == '':
            # Placeholder for articles without a timestamp. It is pinned to
            # UTC explicitly: converting a naive datetime with astimezone()
            # would depend on the local timezone of the machine.
            utc_time = datetime.datetime(2000, 1, 1, tzinfo=pytz.utc)
        else:
            if re.search(r"[0-9]$", raw_time) is None:
                # No numeric offset at the end: rewrite the "UTC" suffix so
                # that %z can parse it.
                raw_time = re.sub("UTC", "-0000", raw_time, count=1)
            utc_time = datetime.datetime.strptime(
                raw_time, stamp_format).astimezone(tz=pytz.utc)
        year.append(utc_time.strftime("%Y"))
        month.append(utc_time.strftime("%m"))
        day.append(utc_time.strftime("%d"))
        hour.append(utc_time.strftime("%H"))
        weekday.append(utc_time.strftime("%a"))
        hday.append(utc_time.date() in us_holidays)

    # Footer topics -> count matrix; topics were joined with '_' upstream.
    count_transform = CountVectorizer(tokenizer = lambda x: x.split("_"), analyzer="word",max_df=39470,min_df=20)
    topic_mat = count_transform.fit_transform(ft_list[:,1])
    # Titles -> count matrix using the lemmatising analyzer.
    # NOTE(review): scikit-learn documents `tokenizer` and `stop_words` as
    # applying only when analyzer == 'word'; with a callable analyzer they
    # are presumably ignored - confirm.
    title_cvtr = CountVectorizer(tokenizer=word_tokenize,analyzer=lemma_words,stop_words=stop_word,max_df=39470,min_df=20)
    title_mat =  title_cvtr.fit_transform(ft_list[:,4])

    # 'day_conti' centres the day of month on the 15th; 'day_conti_2' is its square.
    ft_frame = pd.DataFrame({
        'Id':dtf['Id'],'reporter_name':ft_list[:,0],'Cate':ft_list[:,2],'Year':year,
        'Month':month,'Date':weekday,'Hour':hour,'Holiday':hday,
        'day_conti':[ (int(i)-15)/31 for i in day ],
        'day_conti_2':[ ((int(i)-15)/31)**2 for i in day ],
        'Topic length':[int(i) for i in ft_list[:,6]],
        "title length": [int(i) for i in ft_list[:,7]]
    })
    # Append the topic and title count columns.
    ft_frame=pd.concat([ft_frame,
                        pd.DataFrame(topic_mat.toarray()),
                        pd.DataFrame(title_mat.toarray())],axis=1)
    # One-hot encode the categorical columns.
    ft_frame=pd.get_dummies(ft_frame,columns=["reporter_name","Cate","Year","Month","Hour","Date","Holiday"])
    # Split back into train (with the target in front) and test.
    train_frame = pd.concat([Y_train,ft_frame[:n_train]],axis=1)
    return((train_frame,ft_frame[n_train:]))
def text_lda(train,test,k):
    """Compute LDA topic weights for the article bodies of train and test.

    Train and test are stacked, the text of the first ``article > section``
    element of every page is tokenised, and one LDA model with *k* topics is
    fitted on all of them together.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Both need a 'Page content' column holding article HTML.
    k : int
        Number of LDA topics.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_topics, test_topics)``; the test part keeps the stacked row
        index, i.e. it starts at ``len(train)``.
    """
    n_train = train.shape[0]
    data = pd.concat([train,test],ignore_index=True)
    full_text = [
        BeautifulSoup(page, "html.parser").select("article>section")[0].get_text()
        for page in data['Page content']
    ]
    token_text = text_to_token(full_text)
    lda_outcome = pd.DataFrame(lda_modeling_outcome(token_text, k=k))
    return((lda_outcome[:n_train],lda_outcome[n_train:]))
def data_process(train,test,k):
    """Run both feature pipelines and join their outputs column-wise.

    The metadata features from ``data_precess_routine`` and the *k* LDA
    topic weights from ``text_lda`` are concatenated along axis 1,
    separately for train and test. Returns ``(train_process, test_process)``.
    """
    routine_train, routine_test = data_precess_routine(train=train, test=test)
    lda_train, lda_test = text_lda(train=train, test=test, k=k)
    train_process = pd.concat([routine_train, lda_train], axis=1)
    test_process = pd.concat([routine_test, lda_test], axis=1)
    return (train_process, test_process)
def feature(df):
    """Count structural elements of every article in *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'Page content' column with article HTML. Rows are read by
        label ``0 .. len(df) - 1``, so a default integer index is expected.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns, in this order: 'em length',
        'Seealso length', 'bonus length', 'Gallery length', 'article length',
        'Img length', 'Mashable number', 'IG number', 'twitter quote number',
        'length of sentence', 'Character length', 'average word length',
        'average sentence length'.
    """
    # Function-scope import kept so the helper stays usable when copied alone.
    from bs4 import BeautifulSoup

    columns = ["em length", "Seealso length", "bonus length", "Gallery length",
               "article length", "Img length", "Mashable number", "IG number",
               "twitter quote number", "length of sentence", "Character length",
               "average word length", "average sentence length"]
    rows = []
    for row_label in range(df.shape[0]):
        soup = BeautifulSoup(df.loc[row_label, "Page content"], 'html.parser')

        # Embedded tweets and italic runs. find_all always returns a list,
        # so no None check is needed.
        n_twitter = len(soup.find_all("blockquote", class_="twitter-tweet"))
        n_em = len(soup.find_all("em"))
        n_img = len(soup.find_all("img"))

        # Links pointing at Instagram / Mashable (substring match on href).
        hrefs = [element["href"] for element in soup.find_all(href=True)]
        n_instagram = sum("instagram.com" in href for href in hrefs)
        n_mashable = sum("mashable.com" in href for href in hrefs)

        # NOTE(review): len() of a BeautifulSoup tag presumably counts its
        # direct children, so this is the child count of the first bonus
        # block rather than the number of bonus blocks - confirm intent.
        bonus = soup.find("div", class_="bonus-content")
        n_bonus = 0 if bonus is None else len(bonus)

        # Slides of the first gallery section, if any.
        gallery = soup.find("section", class_="gallery")
        n_gallery = 0 if gallery is None else len(gallery.find_all("li", class_="slide"))

        # Text statistics on the lower-cased article text.
        article_text = soup.find("article").get_text().lower()
        n_see_also = article_text.count("see also:")
        words = article_text.split(" ")
        n_characters = sum(len(word) for word in words)
        n_words = len(words)
        n_sentences = len(article_text.split("."))
        # split() always yields at least one piece, so neither divisor is 0.
        avg_word_length = n_characters / n_words
        avg_sentence_length = n_words / n_sentences

        rows.append((n_em, n_see_also, n_bonus, n_gallery, n_words, n_img,
                     n_mashable, n_instagram, n_twitter, n_sentences,
                     n_characters, avg_word_length, avg_sentence_length))

    return pd.DataFrame(rows, columns=columns)
#%%
def feature_ect_title(text) :
    """Return the cleaned, lower-cased <h1> title of one article's HTML.

    A few punctuation marks (';', '#', ',', apostrophes, a quoted single
    character) and possessive suffixes are removed from the title.
    Raises AttributeError when the page has no <h1> element.
    """
    soup = BeautifulSoup(text,"html.parser")
    title = soup.h1.get_text().lower()
    # Every match is removed; re.sub needs no count limit for that.
    return re.sub(r";|'.'|#|,|’s|'s|'", "", title)

# Build the title bag-of-words matrix over train and test together, so that
# both parts share one vocabulary.
full_data = pd.concat([df_raw,df_raw_test],axis = 0,ignore_index=True)
ls_title = []
for i in range(full_data.shape[0]):
    print(i)  # progress indicator
    ls_title.append( feature_ect_title(full_data["Page content"][i]))
ls_title = np.array(ls_title)
# lemma_words is used as a callable analyzer.
# NOTE(review): scikit-learn documents `tokenizer` as applying only when
# analyzer == 'word', so word_tokenize is presumably unused here - confirm.
title_cvtr = CountVectorizer(tokenizer=word_tokenize,analyzer=lemma_words)
title_mat =  title_cvtr.fit_transform(ls_title)
# Dense copy of the sparse count matrix.
title_mat_array = title_mat.toarray()

#%%
def feature_ect_fig_content(text):
    """Return the text of the first <figcaption> in one article's HTML.

    The sentinel string "no_content_error" is returned when the page has no
    <figcaption>, or when that element's ``.string`` is None.
    """
    caption = BeautifulSoup(text, "html.parser").figcaption
    if caption is None or caption.string is None:
        return "no_content_error"
    return caption.get_text()
# Build the figure-caption bag-of-words matrix over train and test together.
ls_pg_content = []
for i in range(full_data.shape[0]):
    print(i)  # progress indicator
    ls_pg_content.append(feature_ect_fig_content(full_data['Page content'][i]) )
ls_pg_content = np.array(ls_pg_content)
# NOTE(review): with a callable analyzer, scikit-learn documents `tokenizer`
# and `stop_words` as not applied - confirm that this is acceptable.
figtag_cvtr = CountVectorizer(tokenizer=word_tokenize,analyzer=lemma_words,stop_words=stop_word)
figtag_mat =  figtag_cvtr.fit_transform(ls_pg_content)
# Display the learned vocabulary.
# NOTE(review): get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out - verify against the installed version.
figtag_cvtr.get_feature_names()

#%%
def feature_ect_footer(text) :
    """Return the footer topics of one article as a single '_'-joined string.

    The text of every <a> inside the page's <footer> is lower-cased and
    lemmatised with ``get_lemma3``; joining with '_' lets a CountVectorizer
    split the topics apart again later.
    """
    footer_links = BeautifulSoup(text, "html.parser").footer.find_all('a')
    lemmatised_topics = [
        get_lemma3(link.get_text().lower()) for link in footer_links
    ]
    return "_".join(lemmatised_topics)
# Build the footer-topic count matrix over train and test together.
ls_footer = []
for i in range(full_data.shape[0]):
    print(i)  # progress indicator
    ls_footer.append(feature_ect_footer(full_data['Page content'][i]) )
ls_footer = np.array(ls_footer)
# Topics were joined with '_' by feature_ect_footer, so split on '_' again.
count_transform = CountVectorizer(tokenizer = lambda x: x.split("_"), analyzer="word")
topic_mat = count_transform.fit_transform(ls_footer)
# Display the learned vocabulary.
# NOTE(review): get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out - verify against the installed version.
count_transform.get_feature_names()

#%%
def week_day_to_int(week_day):
    """Map an abbreviated weekday name to its number, Monday being 1.

    Accepts exactly "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" and "Sun"
    (mapped to 1..7). Any other value yields None.
    """
    ordered_days = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
    if week_day in ordered_days:
        return ordered_days.index(week_day) + 1
    return None
        
def feature_ect_other(text):
    """Extract the non-text metadata features of one article's HTML.

    Parameters
    ----------
    text : str
        HTML of a single article. The code reads its <h1>, <time>, <footer>
        and <article data-channel=...> elements and raises if one of them
        is missing.

    Returns
    -------
    list
        ``[reporter name, channel, time string, number of footer topics,
        number of title words]``.
    """
    soup = BeautifulSoup(text,"html.parser")

    # Title length: words left after removing a few punctuation marks and
    # possessive suffixes from the lower-cased title.
    title = re.sub(r";|'.'|#|,|’s|'s|'", "", soup.h1.get_text().lower())
    title_len = len(re.split(r"\s+", title))

    # Publication time, kept as the raw string found in the <time> element.
    d_string = str(soup.find("time").get_text())

    # Author: first <span> if there is one, otherwise the first `div > a` link.
    if soup.span is None:
        name = soup.select('div>a')[0].get_text()
    else:
        name = soup.span.get_text()
    # Strip a leading "By " and whatever follows the name (a digit or comma).
    # NOTE(review): the original spelled this pattern as two adjacent string
    # literals ("[" "|b|B]..."), which concatenate to the text below, so the
    # first character class holds no space - confirm this is intended.
    byline = re.match(r"[|b|B][Y|y] .*?([0-9]|\,)", name)
    if byline is not None:
        name = name[(byline.start() + 3):(byline.end() - 1)]

    # Only the number of footer topics is used here, so the links are counted
    # directly instead of lemmatising each one first.
    topic_group_len = len(soup.footer.find_all('a'))

    # 0:reporter name,1:cate,2:time,3:topic len,4:title len
    return [name.lower(), soup.article.attrs['data-channel'].lower(),
            d_string, topic_group_len, title_len]

def data_precess_routine_other(train,test):
    """Build the numeric / one-hot metadata feature tables for train and test.

    Train and test are stacked so that one-hot columns are shared, features
    are extracted per article with ``feature_ect_other``, and the result is
    split back into the two parts. Calendar fields are encoded as centred
    numeric values together with their squares.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs the columns 'Id', 'Page content' and 'Popularity'.
    test : pandas.DataFrame
        Needs the columns 'Id' and 'Page content'.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_frame, test_frame)``; ``train_frame`` has 'Popularity' as
        its first column, ``test_frame`` keeps the stacked row index.
    """
    n_train = train.shape[0]
    Y_train = train['Popularity']
    dtf = pd.concat([train[["Id","Page content"]],test],ignore_index=True)
    # np.array turns the mixed str/int feature rows into an array of strings.
    ft_list = np.array([feature_ect_other(page) for page in dtf["Page content"]])

    # Normalise every timestamp to UTC and split it into calendar parts.
    stamp_format = "%Y-%m-%d %H:%M:%S %z"
    us_holidays = holidays.US()  # built once, reused for every row
    year = []; month = []; day = []; hour = []; hday = []; weekday = []
    for raw_time in ft_list[:, 2]:
        if raw_time == '':
            # Placeholder for articles without a timestamp. It is pinned to
            # UTC explicitly: converting a naive datetime with astimezone()
            # would depend on the local timezone of the machine.
            utc_time = datetime.datetime(2000, 1, 1, tzinfo=pytz.utc)
        else:
            if re.search(r"[0-9]$", raw_time) is None:
                # No numeric offset at the end: rewrite the "UTC" suffix so
                # that %z can parse it.
                raw_time = re.sub("UTC", "-0000", raw_time, count=1)
            utc_time = datetime.datetime.strptime(
                raw_time, stamp_format).astimezone(tz=pytz.utc)
        year.append(utc_time.strftime("%Y"))
        month.append(utc_time.strftime("%m"))
        day.append(utc_time.strftime("%d"))
        hour.append(utc_time.strftime("%H"))
        # week_day_to_int expects the English abbreviations "Mon".."Sun".
        weekday.append(utc_time.strftime("%a"))
        hday.append(utc_time.date() in us_holidays)

    # Each calendar field is shifted and scaled, and its square is added as a
    # second column.
    ft_frame = pd.DataFrame({
        'Id':dtf['Id'],'reporter_name':ft_list[:,0],
        'Cate':ft_list[:,1],
        'Year':[int(i)-2013 for i in year],
        'Month':[ (int(i)-6)/12 for i in month],
        'Month_2':[ ((int(i)-6)/12)**2 for i in month],
        'Date':[(week_day_to_int(i)-3.5)/7 for i in weekday],
        'Date_2':[((week_day_to_int(i)-3.5)/7)**2 for i in weekday],
        'Hour':[(int(i)-12)/24 for i in hour],
        'Hour_2':[ ((int(i)-12)/24)**2  for i in hour],
        'Holiday':[int(i) for i in hday],
        'day_conti':[ (int(i)-15)/31 for i in day ],
        'day_conti_2':[ ((int(i)-15)/31)**2 for i in day ],
        'Topic length':[int(i) for i in ft_list[:,3]],
        "title length": [int(i) for i in ft_list[:,4]]
    })
    # One-hot encode the two categorical columns.
    ft_frame=pd.get_dummies(ft_frame,columns=["reporter_name","Cate"])
    # Split back into train (with the target in front) and test.
    train_frame = pd.concat([Y_train,ft_frame[:n_train]],axis=1)
    return((train_frame,ft_frame[n_train:]))
#%%

# 二、挑選feature

## 建完feature後再根據feature(1)、(5)、(6)、(7)、(8)~(20)及其他分成6組，分別觀察各組XGBoost tree的importances，以importances當依據挑選變數。 


In [1]:
import scipy
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV,KFold
import lightgbm as lgb
import matplotlib.pyplot as plt
import xgboost as xgb
import xgboost
from xgboost import XGBClassifier
import os
from scipy.sparse import csr_matrix

In [2]:
# Load the raw tables again for the feature-selection stage; only the test
# ids and the training target are kept from them.
# NOTE(review): absolute paths on the original author's machine - adjust
# before re-running.
testdata=pd.read_csv('C:/Users/stat-pc/Desktop/10901/deep_learning/contest1/data/test.csv/test.csv')
df=pd.read_csv('C:/Users/stat-pc/Desktop/10901/deep_learning/contest1/data/train.csv/train.csv')
testID=testdata["Id"]
y=df["Popularity"]


In [4]:
# The commented-out code below shows how a.csv / c.csv / d.csv were produced:
# column-name tables reordered by descending feature importance of a fitted
# `model`, with the importance stored in a 'scor' column. It is kept for
# reference only; the saved files are simply re-loaded at the end of the cell.
# #%%
# a=pd.read_csv('title_colnames.csv').iloc[np.argsort(model.feature_importances_)[::-1],:]
# a['scor']=model.feature_importances_[np.argsort(model.feature_importances_)[::-1]]

# # %%
# c=pd.read_csv('topic_colnames.csv').iloc[np.argsort(model.feature_importances_)[::-1],:]
# c['scor']=model.feature_importances_[np.argsort(model.feature_importances_)[::-1]]
# #%%
# d=pd.read_csv('other_colnames.csv').iloc[6:1634,:].iloc[np.argsort(model.feature_importances_)[::-1],:]
# d['scor']=model.feature_importances_[np.argsort(model.feature_importances_)[::-1]]

# #%%
# a.to_csv("a.csv")
# c.to_csv("c.csv")
# d.to_csv("d.csv")

# Importance-ranked column tables: a = title, c = topic, d = other features
# (per the file names used in the commented-out code above).
a=pd.read_csv("a.csv")
d=pd.read_csv("d.csv")
c=pd.read_csv("c.csv")

In [5]:
# Assemble the final sparse train/test matrices from the individual feature
# groups. `df` and `test` are reused as scratch names for each group and end
# up holding the stacked matrices.

# Title counts: keep the columns listed in the first 15 rows of `a`.
# NOTE(review): 'Unnamed: 0' is presumably the original column position saved
# as the CSV index - confirm.
df=scipy.sparse.load_npz("title_train_csr.npz")
test=scipy.sparse.load_npz("title_test_csr.npz")
ae=df[:,a['Unnamed: 0'][:15]]
aet=test[:,a['Unnamed: 0'][:15]]

# Topic counts: keep the columns listed in the first 15 rows of `c`.
df=scipy.sparse.load_npz("topic_train_csr.npz")
test=scipy.sparse.load_npz("topic_test_csr.npz")
ce=df[:,c['Unnamed: 0'][:15]]
cet=test[:,c['Unnamed: 0'][:15]]

# Columns 6..1633 of the "other" matrix: keep the columns listed in the first
# 20 rows of `d`; the -6 shifts those positions into the sliced matrix.
df=scipy.sparse.load_npz("csr_train_other.npz")[:,6:1634]
test=scipy.sparse.load_npz("csr_test_other.npz")[:,6:1634]
de=df[:,d['Unnamed: 0'][:20]-6]
det=test[:,d['Unnamed: 0'][:20]-6]

# First 12 columns of the "othere" matrix plus columns 1640.. of "other".
# NOTE(review): "csr_train_othere.npz" differs from "csr_train_other.npz" by
# one letter - confirm it is a separate file and not a typo.
e=csr_matrix(scipy.sparse.hstack([scipy.sparse.load_npz("csr_train_othere.npz")[:,:12],scipy.sparse.load_npz("csr_train_other.npz")[:,1640:]]))
et=csr_matrix(scipy.sparse.hstack([scipy.sparse.load_npz("csr_test_othere.npz")[:,:12],scipy.sparse.load_npz("csr_test_other.npz")[:,1640:]]))

# Additional feature table loaded from CSV, all columns.
hsin=csr_matrix(pd.read_csv("dominic_train.csv",index_col=False).values)
hsint=csr_matrix(pd.read_csv("dominic_test.csv",index_col=False).values)

# LDA features loaded from CSV; the first column is skipped.
lda=csr_matrix(pd.read_csv("LDA_train.csv",index_col=False).iloc[:,1:])
ldat=csr_matrix(pd.read_csv("LDA_test.csv",index_col=False).iloc[:,1:])


# Stack every group side by side; train and test use the same group order.
df=scipy.sparse.hstack([ae,ce,de,e,hsin,lda])
df=csr_matrix(df)

test=scipy.sparse.hstack([aet,cet,det,et,hsint,ldat])
test=csr_matrix(test)







In [6]:
 # LightGBM parameter grid for the first selection round. Every entry holds a
 # single value, so GridSearchCV evaluates exactly one candidate.
 parameters = {'boosting_type': ['gbdt'],
 'colsample_bytree': [0.4],
 'learning_rate':[0.007],
 'max_depth':[-1],
 'n_estimators': [1000],
 'num_leaves': [46],
 'objective': ['binary'],
 'reg_alpha': [0.002],
 'reg_lambda': [0.5],
 'subsample': [1],
 'min_child_samples':[5],
 'min_child_weight':[1e-3],
 'importance_type':['gain']
 }



In [7]:
# 3-fold cross-validated ROC AUC of the candidate in `parameters`.
# refit=False: only the scores are wanted here, the final model is fitted
# separately in the next cell.
clf =  GridSearchCV(lgb.LGBMClassifier(),
                                parameters,
                                cv = 3,
                                scoring = 'roc_auc',
                                n_jobs = -1,
                                verbose = 10,
                                refit=False)
clf.fit(df,y)

# Mean cross-validated score and the parameters that achieved it.
print(clf.best_score_)
clf.best_params_


Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
0.6013496659921974
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    7.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    7.7s finished


{'boosting_type': 'gbdt',
 'colsample_bytree': 0.4,
 'importance_type': 'gain',
 'learning_rate': 0.007,
 'max_depth': -1,
 'min_child_samples': 5,
 'min_child_weight': 0.001,
 'n_estimators': 1000,
 'num_leaves': 46,
 'objective': 'binary',
 'reg_alpha': 0.002,
 'reg_lambda': 0.5,
 'subsample': 1}

In [8]:
# Fit one model on all training rows with the selected parameters.
model = lgb.LGBMClassifier(**clf.best_params_)

model.fit(df,y)

# Number of features with zero importance, then number with non-zero
# importance (importance_type is 'gain' in the parameter grid).
print(sum(model.feature_importances_==0))
sum(model.feature_importances_!=0)

15


131

In [9]:
# Keep only the columns the fitted model gave a non-zero importance; the same
# column mask is applied to train and test so they stay aligned.
df=df[:,model.feature_importances_!=0]
test=test[:,model.feature_importances_!=0]



In [10]:
 # LightGBM parameter grid for the second round on the reduced feature set.
 # Compared with the first round only learning_rate (0.0065) and reg_lambda
 # (0.6) differ; every entry again holds a single value.
 parameters = {'boosting_type': ['gbdt'],
 'colsample_bytree': [0.4],
 'learning_rate':[0.0065],
 'max_depth':[-1],
 'n_estimators': [1000],
 'num_leaves': [46],
 'objective': ['binary'],
 'reg_alpha': [0.002],
 'reg_lambda': [0.6],
 'subsample': [1],
 'min_child_samples':[5],
 'min_child_weight':[1e-3],
 'importance_type':['gain']
 }



In [11]:
# 3-fold cross-validated ROC AUC on the reduced feature matrix.
# refit=False: the final model is fitted separately in the next cell.
clf =  GridSearchCV(lgb.LGBMClassifier(),
                                parameters,
                                cv = 3,
                                scoring = 'roc_auc',
                                n_jobs = -1,
                                verbose = 10,
                                refit=False)
clf.fit(df,y)

# Mean cross-validated score and the parameters that achieved it.
print(clf.best_score_)
clf.best_params_


Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
0.6015572108107315
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    7.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    7.3s finished


{'boosting_type': 'gbdt',
 'colsample_bytree': 0.4,
 'importance_type': 'gain',
 'learning_rate': 0.0065,
 'max_depth': -1,
 'min_child_samples': 5,
 'min_child_weight': 0.001,
 'n_estimators': 1000,
 'num_leaves': 46,
 'objective': 'binary',
 'reg_alpha': 0.002,
 'reg_lambda': 0.6,
 'subsample': 1}

In [12]:
# Refit on all training rows of the reduced feature matrix; this model
# produces the submission in the next cell.
model = lgb.LGBMClassifier(**clf.best_params_)

model.fit(df,y)

# Number of features with zero importance, then number with non-zero importance.
print(sum(model.feature_importances_==0))
sum(model.feature_importances_!=0)






1


130

In [13]:
# Predicted probability of the positive class for every test row.
ans = model.predict_proba(test)[:,1]
# Submission table: one row per test article, id first, then the probability.
pred_csv = pd.DataFrame({'Id': testID, 'Popularity': ans})
pred_csv.to_csv('y_pred_chyjj1.csv',index=False,header=True)





# 三、建構模型

# 挑選完變數後，我們再使用LightGBM模型，首先調整reg_lambda參數，觀察feature importances的Solution Path去挑選變數，再針對不同reg_lambda挑選到的變數重新去配適LightGBM模型，並調整各式參數，最後選擇cross validation score最高的模型，即為我們本次Kaggle上private score最高的模型。

In [None]:
#%%
import scipy as sp
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV,KFold
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.metrics import roc_auc_score
#%%
# Load the raw competition CSVs and the sparse feature matrices saved as .npz.
df_x = pd.read_csv("C:\\Users\\linre\\Desktop\\DL_HW\\DL_competition_1\\train.csv")
df_raw_test = pd.read_csv("C:\\Users\\linre\\Desktop\\DL_HW\\DL_competition_1\\test.csv")
train_y = df_x["Popularity"].values
sp_train_x = sp.sparse.load_npz("C:\\Users\\linre\\Desktop\\DL_HW\\DL_competition_1\\new_data\\train_15_20-15.npz")
sp_test_x = sp.sparse.load_npz("C:\\Users\\linre\\Desktop\\DL_HW\\DL_competition_1\\new_data\\test_15_20-15.npz")

# Single-candidate grid: every list holds one value, so the search below just
# cross-validates this one configuration.
parameters = {
 'colsample_bytree': [0.3],#0.2~0.5
 'learning_rate':[0.006],#0.0045~0.0075
 'n_estimators': [1000],
 'num_leaves': [46],#43~55
 'objective': ['binary'],
 'reg_alpha': [0.001],#0.0008~0.1
 'reg_lambda': [0.3],#0.1~0.5
 'min_child_samples':[5],#4~10
 'min_child_weight':[1e-3],#1e-(2,3,4,5)
 'importance_type':['gain']
}

# 3-fold CV scored by ROC AUC; refit=False, so no final estimator is fitted.
clf =  GridSearchCV(lgb.LGBMClassifier(),
                                parameters,
                                cv = 3,
                                scoring = 'roc_auc',
                                n_jobs = 3,
                                verbose = 10,
                                refit=False)

clf.fit(sp_train_x,train_y)

print(clf.best_score_)

# Refit on the same data the search was scored on. The original call used
# `df` / `y`, which this script never defines.
model = lgb.LGBMClassifier(**clf.best_params_)
model.fit(sp_train_x,train_y)
model.feature_importances_



#%%

# Per-reg_lambda results collected by the sweep below.
num_of_zero = []   # count of features whose importance is exactly zero
auc = []           # best cross-validated ROC AUC of the grid search
importance = []    # feature-importance vector of the refitted model

# Sweep reg_lambda over 8, 9, ..., 20; for each value grid-search the other
# parameters, refit the best configuration and record its importances.
for acd_lambda in np.arange(8, 21):
    parameters = {
        'colsample_bytree': [0.3, 0.5],  # 0.2~0.5
        'learning_rate': [0.006],  # 0.0045~0.0075
        'n_estimators': [1000],
        'num_leaves': [43, 48, 53],  # 43~55
        'objective': ['binary'],
        'reg_alpha': [0.001],  # 0.0008~0.1
        'reg_lambda': [acd_lambda],  # set by the enclosing loop
        'min_child_samples': [4, 8],  # 4~10
        'min_child_weight': [1e-5, 1e-3, 1e-4],  # 1e-(2,3,4,5)
        'importance_type': ['gain'],
    }
    clf = GridSearchCV(
        estimator=lgb.LGBMClassifier(),
        param_grid=parameters,
        scoring='roc_auc',
        cv=3,
        refit=False,
        n_jobs=3,
        verbose=0,
    )
    clf.fit(sp_train_x, train_y)
    auc.append(clf.best_score_)

    # refit=False above, so the winning configuration is fitted by hand.
    model = lgb.LGBMClassifier(**clf.best_params_)
    model.fit(sp_train_x, train_y)
    importance.append(model.feature_importances_)
    num_of_zero.append((model.feature_importances_ == 0).sum())
    print("done")


# One row per reg_lambda: zero count, best AUC, then one column per feature.
outcome_frame = pd.concat(
    [pd.DataFrame(num_of_zero), pd.DataFrame(auc), pd.DataFrame(importance)],
    axis=1,
)
outcome_frame.to_csv("C:\\Users\\linre\\Desktop\\DL_HW\\DL_competition_1\\variable selection\\selection_Frame.csv")
# %%
# "16 zero" subset (presumably 16 zero-importance features in this run):
# drop every column whose importance was exactly zero in sweep run 0.
zero_pos = list(np.where(importance[0] == 0)[0])

# Densify, drop the selected columns, then convert back to CSR.
train_x_16 = pd.DataFrame(sp_train_x.toarray()).drop(columns=zero_pos)
test_x_16 = pd.DataFrame(sp_test_x.toarray()).drop(columns=zero_pos)
sp_train_x_16 = csr_matrix(train_x_16)
sp_test_x_16 = csr_matrix(test_x_16)
#%%   cv 0.6017177514039636
# Grid for the 16-zero feature subset; only min_child_samples has more than
# one candidate.
parameters = {
    'colsample_bytree': [0.4],  # 0.2~0.5
    'learning_rate': [0.004],  # 0.0045~0.0075
    'n_estimators': [1000],
    'num_leaves': [55],  # 43~55
    'objective': ['binary'],
    'reg_alpha': [0.0008],  # 0.0008~0.1
    'reg_lambda': [0.25],  # 0.1~0.5
    'min_child_samples': [4, 6],  # 4~10
    'min_child_weight': [1e-5],  # 1e-(2,3,4,5)
    'importance_type': ['gain'],
}
clf = GridSearchCV(
    estimator=lgb.LGBMClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=3,
    refit=False,
    n_jobs=3,
    verbose=10,
)
clf.fit(sp_train_x_16, train_y)
print(clf.best_params_)
print(clf.best_score_)

# refit=False above, so fit the selected configuration on the full subset.
model = lgb.LGBMClassifier(**clf.best_params_)
model.fit(sp_train_x_16, train_y)
model.feature_importances_
# %%
# "17 zero 01" subset: drop every column whose importance was exactly zero
# in sweep run 1.
zero_pos_17_1 = list(np.where(importance[1] == 0)[0])

# Densify, drop the selected columns, then convert back to CSR.
train_x_17_1 = pd.DataFrame(sp_train_x.toarray()).drop(columns=zero_pos_17_1)
test_x_17_1 = pd.DataFrame(sp_test_x.toarray()).drop(columns=zero_pos_17_1)
sp_train_x_17_1 = csr_matrix(train_x_17_1)
sp_test_x_17_1 = csr_matrix(test_x_17_1)
#%%
#%%   cv 0.6022852284061436
# Single-candidate grid for the "17 zero 01" feature subset.
parameters = {
    'colsample_bytree': [0.5],#0.2~0.5
    'learning_rate':[0.004],#0.0045~0.0075
    'n_estimators': [1000],
    'num_leaves': [57],#43~55
    'objective': ['binary'],
    'reg_alpha': [0.001],#0.0008~0.1
    'reg_lambda': [0.45],#0.1~0.5
    'min_child_samples':[3],#4~10
    'min_child_weight':[1e-5],#1e-(2,3,4,5)
    'importance_type':['gain']
    }
# 3-fold CV scored by ROC AUC; refit=False, so no final estimator is fitted.
clf =  GridSearchCV(lgb.LGBMClassifier(),
                            parameters,
                            cv = 3,
                            scoring = 'roc_auc',
                            n_jobs = 3,
                            verbose = 10,
                            refit=False)
clf.fit(train_x_17_1,train_y)
# Print the selection: a bare `clf.best_params_` in the middle of a cell is
# evaluated and thrown away, so nothing was shown.
print(clf.best_params_)
print(clf.best_score_)
model = lgb.LGBMClassifier(**clf.best_params_)
model.fit(train_x_17_1,train_y)
# NOTE(review): trained on the dense frame, predicting from the CSR copy of
# the same test columns — confirm this mix is intended.
test_pred_outcome=model.predict_proba(sp_test_x_17_1)[:,1]
# %%
# "17 zero 02" subset: drop every column whose importance was exactly zero
# in sweep run 2.
zero_pos_17_2 = list(np.where(importance[2] == 0)[0])

# Densify, drop the selected columns, then convert back to CSR.
train_x_17_2 = pd.DataFrame(sp_train_x.toarray()).drop(columns=zero_pos_17_2)
test_x_17_2 = pd.DataFrame(sp_test_x.toarray()).drop(columns=zero_pos_17_2)
sp_train_x_17_2 = csr_matrix(train_x_17_2)
sp_test_x_17_2 = csr_matrix(test_x_17_2)
# %% 0.6023221804589858
# Single-candidate grid for the "17 zero 02" feature subset.
parameters = {
    'colsample_bytree': [0.5],#0.2~0.5
    'learning_rate':[0.004],#0.0045~0.0075
    'n_estimators': [800],
    'num_leaves': [57],#43~55
    'objective': ['binary'],
    'reg_alpha': [0.001],#0.0008~0.1
    'reg_lambda': [0.35],#0.1~0.5
    'min_child_samples':[3],#4~10
    'min_child_weight':[1e-5],#1e-(2,3,4,5)
    'importance_type':['gain']
    }
# 3-fold CV scored by ROC AUC; refit=False, so no final estimator is fitted.
clf =  GridSearchCV(lgb.LGBMClassifier(),
                            parameters,
                            cv = 3,
                            scoring = 'roc_auc',
                            n_jobs = 3,
                            verbose = 10,
                            refit=False)
clf.fit(train_x_17_2,train_y)
# Print the selection: a bare `clf.best_params_` in the middle of a cell is
# evaluated and thrown away, so nothing was shown.
print(clf.best_params_)
print(clf.best_score_)
# Fit the selected configuration on the full subset.
model = lgb.LGBMClassifier(**clf.best_params_)
model.fit(train_x_17_2,train_y)
model.feature_importances_

# %% 21 zero
# Drop every column whose importance was exactly zero in sweep run 3.
zero_pos_21 = list(np.where(importance[3] == 0)[0])

# Densify, drop the selected columns, then convert back to CSR.
train_x_21 = pd.DataFrame(sp_train_x.toarray()).drop(columns=zero_pos_21)
test_x_21 = pd.DataFrame(sp_test_x.toarray()).drop(columns=zero_pos_21)
sp_train_x_21 = csr_matrix(train_x_21)
sp_test_x_21 = csr_matrix(test_x_21)
# %% 0.6027639485565163
# Single-candidate grid for the 21-zero feature subset.
parameters = {
    'colsample_bytree': [0.5],#0.2~0.5
    'learning_rate':[0.004],#0.0045~0.0075
    'n_estimators': [800],
    'num_leaves': [57],#43~55
    'objective': ['binary'],
    'reg_alpha': [0.01],#0.0008~0.1
    'reg_lambda': [0.045],#0.1~0.5
    'min_child_samples':[3],#4~10
    'min_child_weight':[1e-5],#1e-(2,3,4,5)
    'importance_type':['gain']
    }
# 3-fold CV scored by ROC AUC; refit=False, so no final estimator is fitted.
clf =  GridSearchCV(lgb.LGBMClassifier(),
                            parameters,
                            cv = 3,
                            scoring = 'roc_auc',
                            n_jobs = 3,
                            verbose = 10,
                            refit=False)
clf.fit(train_x_21,train_y)
# Print the selection: a bare `clf.best_params_` in the middle of a cell is
# evaluated and thrown away, so nothing was shown.
print(clf.best_params_)
print(clf.best_score_)
# Fit the selected configuration on the full subset.
model = lgb.LGBMClassifier(**clf.best_params_)
model.fit(train_x_21,train_y)
# Bare expression kept from the original; it has no effect unless this line
# is evaluated interactively.
model.feature_importances_
# Positive-class probabilities for the test rows of this subset.
test_pred_outcome=model.predict_proba(test_x_21)[:,1]
# %% 18 zero
# Drop every column whose importance was exactly zero in sweep run 4.
zero_pos_18 = list(np.where(importance[4] == 0)[0])

# Densify, drop the selected columns, then convert back to CSR.
train_x_18 = pd.DataFrame(sp_train_x.toarray()).drop(columns=zero_pos_18)
test_x_18 = pd.DataFrame(sp_test_x.toarray()).drop(columns=zero_pos_18)
sp_train_x_18 = csr_matrix(train_x_18)
sp_test_x_18 = csr_matrix(test_x_18)
#%% 0.6020049558034137
# Single-candidate grid for the 18-zero feature subset.
parameters = {
    'colsample_bytree': [0.5],#0.2~0.5
    'learning_rate':[0.004],#0.0045~0.0075
    'n_estimators': [800],
    'num_leaves': [46],#43~55
    'objective': ['binary'],
    'reg_alpha': [0.01],#0.0008~0.1
    'reg_lambda': [0.3],#0.1~0.5
    'min_child_samples':[3],#4~10
    'min_child_weight':[1e-5],#1e-(2,3,4,5)
    'importance_type':['gain']
    }
# 3-fold CV scored by ROC AUC; refit=False, so no final estimator is fitted.
clf =  GridSearchCV(lgb.LGBMClassifier(),
                            parameters,
                            cv = 3,
                            scoring = 'roc_auc',
                            n_jobs = 3,
                            verbose = 10,
                            refit=False)
clf.fit(train_x_18,train_y)
# Print the selection: a bare `clf.best_params_` in the middle of a cell is
# evaluated and thrown away, so nothing was shown.
print(clf.best_params_)
print(clf.best_score_)
# Fit the selected configuration on the full subset.
model = lgb.LGBMClassifier(**clf.best_params_)
model.fit(train_x_18,train_y)
model.feature_importances_

# %%
# Per-reg_lambda results collected by the second sweep below.
num_of_zero_2 = []   # count of features whose importance is exactly zero
auc_2 = []           # best cross-validated ROC AUC of the grid search
importance_2 = []    # feature-importance vector of the refitted model

# np.arange(21, 24, 3) yields the single value 21, so this loop runs once.
for acd_lambda in np.arange(21, 24, 3):
    parameters = {
        'colsample_bytree': [0.3, 0.5],  # 0.2~0.5
        'learning_rate': [0.006],  # 0.0045~0.0075
        'n_estimators': [1000],
        'num_leaves': [43, 48, 53],  # 43~55
        'objective': ['binary'],
        'reg_alpha': [0.001],  # 0.0008~0.1
        'reg_lambda': [acd_lambda],  # set by the enclosing loop
        'min_child_samples': [4, 8],  # 4~10
        'min_child_weight': [1e-5, 1e-3, 1e-4],  # 1e-(2,3,4,5)
        'importance_type': ['gain'],
    }
    clf_2 = GridSearchCV(
        estimator=lgb.LGBMClassifier(),
        param_grid=parameters,
        scoring='roc_auc',
        cv=3,
        refit=False,
        n_jobs=3,
        verbose=0,
    )
    clf_2.fit(sp_train_x, train_y)
    auc_2.append(clf_2.best_score_)

    # refit=False above, so the winning configuration is fitted by hand.
    model_2 = lgb.LGBMClassifier(**clf_2.best_params_)
    model_2.fit(sp_train_x, train_y)
    importance_2.append(model_2.feature_importances_)
    num_of_zero_2.append((model_2.feature_importances_ == 0).sum())
    print("done")


# One row per reg_lambda: zero count, best AUC, then one column per feature.
outcome_frame_2 = pd.concat(
    [pd.DataFrame(num_of_zero_2), pd.DataFrame(auc_2), pd.DataFrame(importance_2)],
    axis=1,
)

# %%
outcome_frame_2.to_csv("C:\\Users\\linre\\Desktop\\DL_HW\\DL_competition_1\\variable selection\\selection_Frame_2.csv")
#%%
#test_pred_outcome = model.predict_proba(test_x_csr)[:,1]
# Build the submission table: ids from the raw test CSV, predicted
# probabilities as Popularity. Written without the index column.
test_pred = {
    'Id': df_raw_test['Id'],
    'Popularity': test_pred_outcome,
}
outcome = pd.DataFrame(data=test_pred)
outcome.to_csv("C:\\Users\\linre\\Desktop\\DL_HW\\DL_competition_1\\test_pred_514_v16.csv", index=False)



# %%


# 四、結論

## (一)、比賽初期，我們直接將文章內容做 Bag-Of-Words，然而這樣做有兩個缺點：第一是多餘變數太多，對預測有不好的影響，造成上傳分數卡在 0.57 後就很難再提升；第二是 design matrix 會變得太大，導致記憶體空間不足且執行效率緩慢。因此最後改採以自行定義的 feature 為主、Bag-Of-Words 為輔的方式處理 feature。
## (二)、本次比賽，feature選擇的重要性高於模型選擇的重要性。無論feature太多或太少，對模型的預測皆有不好的影響，因此我們花了很多時間在挑選feature，挑選完畢後，最後在建模時，基本上只要參數調整適當，皆能有好的預測表現。