In [1]:
import csv, sys
from bs4 import BeautifulSoup
from datetime import datetime
from joblib import Parallel, delayed
import re
import pandas as pd

train_dataset_dir = './train/'
test_dataset_dir = './test/'

from os import listdir
from os.path import isfile, isdir, join

In [2]:
train_df = None
test_df = None
train_files = listdir(train_dataset_dir)
test_files = listdir(test_dataset_dir)

train_df = pd.read_csv(train_dataset_dir + 'title_len.csv')
test_df = pd.read_csv(test_dataset_dir + 'title_len.csv')

In [3]:
for f in train_files:
        s = f[:-4]
        temp_df = pd.read_csv(train_dataset_dir + s + '.csv', lineterminator='\n')
        train_df[s] = temp_df[s]

In [4]:
for f in test_files:
        s = f[:-4]
        temp_df = pd.read_csv(test_dataset_dir + s + '.csv', lineterminator='\n')
        test_df[s] = temp_df[s]

In [5]:
train_df['h2'] = train_df['h2'].fillna('')
test_df['h2'] = test_df['h2'].fillna('')
test_df['article'] = test_df['article'].fillna('')
train_df['author'] = train_df['author'].fillna('')
test_df['author'] = test_df['author'].fillna('')
train_df['title'] = train_df['title'].fillna('')
test_df['title'] = test_df['title'].fillna('')
train_df['h1'] = train_df['h1'].fillna('') + train_df['author']
test_df['h1'] = test_df['h1'].fillna('') + test_df['author']
train_df['topics'] = train_df['topics'].fillna('')
test_df['topics'] = test_df['topics'].fillna('')

In [6]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [7]:
class AuthorTransFormer(BaseEstimator,TransformerMixin):
    def __init__(self, gate = 5):
        self.author_pop_dict = {}
        self.author_pop_mean = 0
        self.gate = 5
        
    def get_author_pop(self, author_arr):
        total_mean = []
        for i in author_arr:
            if i in self.author_pop_dict.keys():
                total_mean.append(self.author_pop_dict[i])
            else:
                total_mean.append(self.author_pop_mean)
        return total_mean

    def get_train_author_pop(self, author_arr, label_arr):
        popularity = {}
        for author, label in zip(author_arr, label_arr):
            if author in popularity.keys():
                popularity[author].append(int(label))
            else:
                popularity[author] = [int(label)]

        delete_keys = []
        for i in popularity.keys():
            if(len(popularity[i]) < self.gate):
                delete_keys.append(i)
            else:
                popularity[i] = sum(popularity[i]) / len(popularity[i])
                
        for j in delete_keys:
            popularity.pop(j, None)
        total_mean = []
        cnt = 0
        for i in author_arr:
            if i in popularity.keys():
                total_mean.append(popularity[i])
            else:
                cnt = cnt + 1
                total_mean.append(-1)

        mean = (sum(total_mean) + cnt)/ (len(total_mean) - cnt)
        return popularity, mean


    def fit(self,X,y=None):
        self.author_pop_dict, self.author_pop_mean = self.get_train_author_pop(X['author'], X['label'])
        return self
    
    def transform(self,X,y=None):
        X['popularity'] = self.get_author_pop(X['author'])
        return X

In [8]:
class ChannelTransFormer(BaseEstimator,TransformerMixin):
    def __init__(self, gate = 5):
        self.author_pop_dict = {}
        self.author_pop_mean = 0
        self.gate = 5
        
    def get_author_pop(self, author_arr):
        total_mean = []
        for i in author_arr:
            if i in self.author_pop_dict.keys():
                total_mean.append(self.author_pop_dict[i])
            else:
                total_mean.append(self.author_pop_mean)
        return total_mean

    def get_train_author_pop(self, author_arr, label_arr):
        popularity = {}
        for author, label in zip(author_arr, label_arr):
            if author in popularity.keys():
                popularity[author].append(int(label))
            else:
                popularity[author] = [int(label)]

        delete_keys = []
        for i in popularity.keys():
            if(len(popularity[i]) < self.gate):
                delete_keys.append(i)
            else:
                popularity[i] = sum(popularity[i]) / len(popularity[i])
                
        for j in delete_keys:
            popularity.pop(j, None)
        total_mean = []
        cnt = 0
        for i in author_arr:
            if i in popularity.keys():
                total_mean.append(popularity[i])
            else:
                cnt = cnt + 1
                total_mean.append(-1)

        mean = (sum(total_mean) + cnt)/ (len(total_mean) - cnt)
        return popularity, mean


    def fit(self,X,y=None):
        self.author_pop_dict, self.author_pop_mean = self.get_train_author_pop(X['channel'], X['label'])
        return self
    
    def transform(self,X,y=None):
        X['channel_pop'] = self.get_author_pop(X['channel'])
        return X

In [9]:
class AuthorTopicsPopTransFormer(BaseEstimator,TransformerMixin):
    def __init__(self, gate = 5, tps_list = []):
        self.author_pop_dict = {}
        self.author_pop_mean = 0
        self.gate = 5
        self.df = None
        self.tps_list = tps_list
        
    def get_author_channel_pop(self, channel_arr, author_arr, author_pop_arr):
        arr = []
        len_arr = []
        for author, channel, author_pop in zip(author_arr, channel_arr, author_pop_arr):
            author_channel_df = self.df.loc[self.df['author'] == author]
            author_channel_df = author_channel_df.loc[author_channel_df['choose_tps'] == channel] 
            if (len(author_channel_df) < self.gate) or (channel not in self.tps_list):
                arr.append(author_pop)
                len_arr.append(len(author_channel_df))
            else:
                positive = len(author_channel_df.loc[author_channel_df['label'] == 1])
                total = len(author_channel_df)
                arr.append(positive/total)
                len_arr.append(total)
        return arr, len_arr
    
    def fit(self,X,y=None):
        self.df = X.loc[:, ['author', 'choose_tps', 'label']] 
        return self
    
    def transform(self,X,y=None):
        X['author_tps_pop'],  X['author_tps_count'] = self.get_author_channel_pop(X['choose_tps'],  X['author'],  X['popularity'])
        if 'label' in X.columns:
            X= X.drop(['author','choose_tps', 'label'], axis=1)
        else:
            X= X.drop(['author', 'choose_tps'], axis=1)
        return X

In [10]:
class TopicsTransFormer(BaseEstimator,TransformerMixin):
    def __init__(self, gate = 5, tps_list = []):
        self.topics_pop_dict = {}
        self.topics_pop_mean = 0
        self.gate = 5
        self.tps_list = tps_list
        
    def get_topics_pop(self,topics_arr):
        total_mean = []
        choose_topic = []
        for topics in topics_arr :
            tps_val = -1
            choose_tps = ''
            for tps in topics.split():
                if tps in self.topics_pop_dict.keys():
                    if tps_val < self.topics_pop_dict[tps]:
                        tps_val = self.topics_pop_dict[tps]
                        choose_tps = tps
            if topics == '' or choose_tps == '':
                total_mean.append(self.topics_pop_mean)
                choose_topic.append('NONE')
            else:
                total_mean.append(self.topics_pop_dict[choose_tps])
                choose_topic.append(choose_tps)
        return total_mean, choose_topic

    def get_train_topics_pop(self, topics_arr, label_arr):
        popularity = {}
        for i in self.tps_list:
            popularity[i] = []
        for topics, label in zip(topics_arr, label_arr):
            for tps in topics.split():
                if tps in popularity.keys():
                    popularity[tps].append(int(label))
        for i in popularity.keys():
            if len(popularity[i]) != 0:
                popularity[i] = sum(popularity[i]) / len(popularity[i])
            else: 
                print(i)
        
        total_mean = []
        cnt_non_tps = 0
        for topics in topics_arr:
            tps_val = -1
            choose_tps = ''
            for tps in topics.split():
                if tps not in popularity.keys():
                    continue
                if tps_val < popularity[tps]:
                    tps_val = popularity[tps]
                    choose_tps = tps
            if topics == '' or choose_tps == '':
                cnt_non_tps += 1
                total_mean.append(-1)
            else:
                total_mean.append(popularity[choose_tps])

        mean = (sum(total_mean) + cnt_non_tps) / (len(total_mean) - cnt_non_tps)
        return popularity, mean


    def fit(self,X,y=None):
        self.topics_pop_dict, self.topics_pop_mean = self.get_train_topics_pop(X['topics'].fillna(""), X['label'])
        return self
    
    def transform(self,X,y=None):
        X['topics_pop'] , X['choose_tps'] = self.get_topics_pop(X['topics'].fillna(""))
        X= X.drop(['topics'], axis=1)
        return X

In [11]:
class AuthorChannelPopTransFormer(BaseEstimator,TransformerMixin):
    def __init__(self, gate = 5):
        self.author_channel_pop_dict = {}
        self.author_channel_pop_mean = 0
        self.author_channel_cnt_dict = {}
        self.author_channel_cnt_mean = 0
        self.gate = 5
        self.df = None
        
    def get_author_channel_pop(self, channel_arr, author_arr):
        arr = []
        len_arr = []
        for author, channel in zip(author_arr, channel_arr):
            if (author, channel) not in self.author_channel_pop_dict.keys():
                arr.append(self.author_channel_pop_mean)
                len_arr.append(self.author_channel_cnt_mean)
            else:
                arr.append(self.author_channel_pop_dict[(author, channel)])
                len_arr.append(self.author_channel_cnt_dict[(author, channel)])
        return arr, len_arr
    
    def get_train_author_channel_pop(self, channel_arr, author_arr, label_arr):
        popularity = {}
        for author, channel, label in zip(author_arr, channel_arr, label_arr):
            if (author, channel) in popularity.keys():
                popularity[(author, channel)].append(int(label))
            else:
                popularity[(author, channel)] = [int(label)]
            

        delete_keys = []
        cnt_mean = []
        cnt_dict = {}
        for i in popularity.keys():
            if len(popularity[i]) < self.gate:
                delete_keys.append(i)
            else:
                cnt_dict[i] = len(popularity[i])
                cnt_mean.append(len(popularity[i]))
                popularity[i] = sum(popularity[i]) / len(popularity[i])
        for j in delete_keys:
            popularity.pop(j, None)

        total_mean = []
        cnt_non_author_channel = 0
        mean_cnt = 0
        for author, channel in zip(author_arr, channel_arr):
            if (author, channel) in popularity.keys():
                total_mean.append(popularity[(author, channel)])
            else:
                cnt_non_author_channel += 1
                total_mean.append(-1)
        
        mean = (sum(total_mean) + cnt_non_author_channel) / (len(total_mean) - cnt_non_author_channel)
        mean_cnt = sum(cnt_mean) / len(cnt_mean)
        return popularity, mean, cnt_dict, mean_cnt
    
    def fit(self,X,y=None):
        self.author_channel_pop_dict, self.author_channel_pop_mean, self.author_channel_cnt_dict, self.author_channel_cnt_mean = self.get_train_author_channel_pop(X['channel'], X['author'], X['label'])
        return self
    
    def transform(self,X,y=None):
        X['author_channel_pop'],  X['author_channel_count'] = self.get_author_channel_pop(X['channel'], X['author'])
        X= X.drop(['channel'], axis=1)
        return X

In [12]:
topic_dic  = ['apps', 'facebook', 'design', 'photography', 'youtube', 'google', 'twitter', 'family', 'television', 'gallery', 'movies', 'ukraine', 'search', 'thrones', 'contributor', 'glass', 'humor', 'space', 'health', 'netflix', 'wars', 'east', 'linkedin', 'climate', 'cup', 'memes', 'reddit', 'winter', 'smartphone', 'vine', 'spotify', 'holiday', 'celebrities', 'yahoo', 'gmail', 'snapchat', 'diy', 'videos', 'launchpad', 'choice', 'earth', '2014', '360', 'chrome', 'shopping', 'california', 'tablet', 'small', 'ebay', 'game', 'house', 'instagram', 'holidays', 'web', 'snow', 'streaming', 'cyrus', 'late', 'lego', 'week', 'doctor', 'motorola', 'sxsw', 'reviews', 'parenting', 'nostalgia', 'rift', 'infographics', 'pandora', "valentine's", 'freedom', 'emmys', 'doodle', 'roundup', 'reality', 'angry', 'healthcare', 'snowden', 'gifs', 'valley', 'digital', 'beer', 'mad', 'motion', 'marathon', 'security', 'cover', 'journalism', 'features', 'bill', 'gates', 'parody', 'selfie', 'sunday', 'brands', 'beauty', 'star', 'toys', 'iphone', 'interviews', 'cases', 'solutions', 'america', 'augmented', 'human', 'now', 'recent', 'ibm', 'gift', 'crash', 'tourism', 'mars', 'electronics', 'green', 'williams', 'life', 'kickstarter', 'vimeo', 'man', 'academy', 'death', 'fashion', 'texting', 'tips', 'resume', 'xbox', 'storage', 'touch', 'kids', 'olympic', 'facts', 'siri', 'spring', 'at&t', 'hbo', 'forum', 'kingdom', 'nest', 'south', 'arrested', 'iwatch', 'los', 'anonymous', 'germany', 'opinion', 'use', 'scrivan', 'european', 'email', 'travel', 'venezuela', 'smartwatch', 'moon', 'international', 'police', 'jerry', 'costumes', 'history', 'psychology', 'congress', 'cyberbullying', 'pope', 'awards', 'lawsuit', 'hacks', 'study', 'drone', 'smart', 'easter', 'ferguson', 'ted', 'width', 'cbs', 'dna', 'management', 'teens', 'olympics', 'safety', 'stem', 'yelp', 'buzzwords', 'superheroes', 'legos', 'brazil', 'cable', 'disease', 'campaign', 'paris', 'perry', 'battery', 'crafts', 'gold', 'new', 'printers', 'iron', 'bad', 'pew', 'crime', 'news', 'online', 'korea', 'art', 'extreme', 'waze', 'fox', 'dogs', 'texas', 'retina', 'seinfeld', 'hangouts', 's5', 'cute', 'box', 'future', 'wikipedia', 'team', 'animated', 'jose', 'mashups', 'living', 'ocean', 'productivity', 'true', 'voice', 'speech', 'sam', 'fov', 'password', 'thanksgiving', 'seo', 'beat', 'deaths', 'internet', 'hashtags', 'conan', 'franco', 'smartwatches', 'philippines', 'graph', 'shazam', 'recalls', 'cameras', 'hubble', 'content', 'law', 'muppets', 'wireless', 'alibaba', 'fiber', 'browser', 'metrics', 'zelda', 'oliver', 'students', 'station', 'watson', 'printing', 'rss']

In [13]:
def AuthorProblem(DF, mode):
    if mode == 'train':
        new_df = pd.DataFrame([], columns = DF.columns)
        for index, row in DF.iterrows(): 
            morethanone = False
            for i in range(len(row['author'].split())):
                if(row['author'].find(',')):
                    a = row['author'].find(',')
                    row['author'] = row['author'][:a] + " " + row['author'][a:]
                if row['author'].split()[i] == 'and' or row['author'].split()[i] == '&' or row['author'].split()[i] == ',':
                    morethanone = True
                    a = row.copy(deep = True)
                    if(i + 2 < len(row['author'].split())):
                        a.loc['author'] = row['author'].split()[i+1] + ' ' + row['author'].split()[i+2]
                        new_df = new_df.append(a)
        DF = pd.concat([DF, new_df], ignore_index=True)
    elif mode == 'test':
        for index, row in DF.iterrows():
            morethanone = False
            for i in range(len(row['author'].split())):
                if(row['author'].find(',')):
                    a = row['author'].find(',')
                    row['author'] = row['author'][:a] + " " + row['author'][a:]
                if row['author'].split()[i] == 'and' or row['author'].split()[i] == '&' or row['author'].split()[i] == ',':
                    morethanone = True
                    break
            if(morethanone):
                if len(row['author'].split()) > 1:
                    DF.loc[index,'author'] = row['author'].split()[0] + ' ' + row['author'].split()[1]
    return DF

In [14]:
train_df_solve = AuthorProblem(train_df, 'test')

In [15]:
X_train = train_df_solve[['dayhour', 'author_post', 'links', 'popularity', 'article_len', 'topics', 'channel', 'author', 'label']]
X_train, X_val, y_train, y_val = train_test_split(X_train, X_train['label'], test_size=0.3, random_state=30)

In [44]:
from xgboost import XGBClassifier
pipe1 = Pipeline([('author', AuthorTransFormer(gate = 2)),
                  ('channel', ChannelTransFormer(gate = 2)),
                  ('author_channel', AuthorChannelPopTransFormer(gate = 2)),
                  ('topics', TopicsTransFormer(tps_list = topic_dic)),
                  ('topics_author', AuthorTopicsPopTransFormer(tps_list = topic_dic)),
                  ('std', StandardScaler()),
                  ("pca", PCA()),
                  ('clf', XGBClassifier(max_depth=5,learning_rate=0.001,objective="binary:logistic", eval_metric="error"))])

pipe1.fit(X_train, y_train)
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_val, pipe1.predict_proba(X_val)[:,1])
print(score)
pickle.dump(pipe1, open('featuresonly.pkl', 'wb'))



0.5486324401332469



================================== TEXT ===================================


In [16]:
import re
from bs4 import BeautifulSoup
def preprocessor(text):
    text = BeautifulSoup(text, 'html.parser').get_text()
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

def tokenizer(text):
    return re.split('\s+', text.strip())

def tokenizer_stem(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in re.split('\s+', text.strip())]

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stanl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# TF-IDF Fit
Xtrain, Xval, y_train, y_val = train_test_split(train_df, train_df['label'], test_size=0.3, random_state=30)
doc_words = pd.concat([Xtrain['article'], Xtrain['h1'], Xtrain['title'], Xtrain['h2']])
tfidf_train = TfidfVectorizer(preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)
tfidf_train_channel = TfidfVectorizer(preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)
tfidf_train_channel_topics = TfidfVectorizer(preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)
tfidf_train.fit(doc_words)



NameError: name 'tfidf_Xtrain' is not defined

In [19]:
tfidf_train_channel.fit(Xtrain['channel'])
tfidf_train_channel_topics.fit(pd.concat([Xtrain['channel'], Xtrain['topics']]))



TfidfVectorizer(preprocessor=<function preprocessor at 0x0000025D4F50DD38>,
                tokenizer=<function tokenizer_stem_nostop at 0x0000025D38E83F78>)

In [20]:
# TF-IDF Transform
text_feature = ['article', 'h1', 'h2']
training_sparse = []
val_sparse = []
for i in text_feature:
    training_sparse.append(tfidf_train.transform(Xtrain[i]))
    val_sparse.append(tfidf_train.transform(Xval[i]))
training_sparse.append(tfidf_train_channel.transform(Xtrain['channel']))
val_sparse.append(tfidf_train_channel.transform(Xval['channel']))
training_sparse.append(tfidf_train_channel_topics.transform(Xtrain['channel']))
val_sparse.append(tfidf_train_channel_topics.transform(Xval['channel']))
training_sparse.append(tfidf_train_channel_topics.transform(Xtrain['topics']))
val_sparse.append(tfidf_train_channel_topics.transform(Xval['topics']))

In [50]:
for i in training_sparse:
    print(i.shape)

(19350, 34)
(19350, 9409)
(19350, 9409)


In [21]:
from scipy import sparse
training_sparsematrix = sparse.hstack((training_sparse[0], training_sparse[1]))
training_sparsematrix = sparse.hstack((training_sparsematrix, training_sparse[2]))
training_sparsematrix2 = sparse.hstack((training_sparsematrix, training_sparse[4]))
training_sparsematrix2 = sparse.hstack((training_sparsematrix2, training_sparse[5]))
training_sparsematrix = sparse.hstack((training_sparsematrix, training_sparse[3]))
training_sparse[0].shape
training_sparsematrix.shape
training_sparsematrix2.shape
val_sparsematrix = sparse.hstack((val_sparse[0], val_sparse[1]))
val_sparsematrix = sparse.hstack((val_sparsematrix, val_sparse[2]))
val_sparsematrix2 = sparse.hstack((val_sparsematrix, val_sparse[4]))
val_sparsematrix2 = sparse.hstack((val_sparsematrix2, val_sparse[5]))
val_sparsematrix = sparse.hstack((val_sparsematrix, val_sparse[3]))

In [23]:
import pickle
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier
bag3 = BaggingClassifier(base_estimator=MultinomialNB(), n_estimators=500, 
                        max_samples=0.6, bootstrap=True,
                        max_features=1.0, bootstrap_features=False, 
                        n_jobs=1, random_state=1)
bag3.fit(training_sparsematrix2, y_train)
score = roc_auc_score(y_val, bag3.predict_proba(val_sparsematrix2)[:,1])
print(score)
pickle.dump(bag3, open('tfidfonly.pkl', 'wb'))

0.5545791344548965


In [21]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier
bag = BaggingClassifier(base_estimator=MultinomialNB(), n_estimators=500, 
                        max_samples=0.8, bootstrap=True,
                        max_features=1.0, bootstrap_features=False, 
                        n_jobs=1)
bag.fit(train_sparsematrix, y_train)
score = roc_auc_score(y_val, bag.predict_proba(test_sparsematrix)[:,1])
print(score)

0.5503059972670918


In [22]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier
bag2 = BaggingClassifier(base_estimator=MultinomialNB(), n_estimators=500, 
                        max_samples=0.8, bootstrap=True,
                        max_features=1.0, bootstrap_features=False, 
                        n_jobs=1, random_state=1)
bag2.fit(train_sparsematrix2, y_train)
score = roc_auc_score(y_val, bag2.predict_proba(test_sparsematrix2)[:,1])
print(score)

0.5539296613661283


In [24]:
bag.predict_proba(test_sparsematrix)[:,1]

array([0.45096409, 0.36608476, 0.57281428, ..., 0.50563726, 0.3408287 ,
       0.34089643])

=============================== DROPPING =================================

In [24]:
X_train = train_df_solve[['dayhour', 'author_post', 'links', 'popularity', 'article_len', 'topics', 'channel', 'author', 'label']]
X_train, X_val, y_train, y_val = train_test_split(X_train, X_train['label'], test_size=0.3, random_state=30)

In [26]:
from xgboost import XGBClassifier
pipe1 = Pipeline([('author', AuthorTransFormer(gate = 2)),
                  ('channel', ChannelTransFormer(gate = 2)),
                  ('author_channel', AuthorChannelPopTransFormer(gate = 2)),
                  ('topics', TopicsTransFormer(tps_list = topic_dic)),
                  ('topics_author', AuthorTopicsPopTransFormer(tps_list = topic_dic)),
                  ('std', StandardScaler()),
                  ("pca", PCA()),
                  ('clf', XGBClassifier(max_depth=5,learning_rate=0.001,objective="binary:logistic", eval_metric="error"))])

pipe1.fit(X_train, y_train)
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_val, pipe1.predict_proba(X_val)[:,1])
print(score)



0.5476343117331155


In [29]:
from xgboost import XGBClassifier
pipe2 = Pipeline([('author', AuthorTransFormer(gate = 2)),
                  ('channel', ChannelTransFormer(gate = 2)),
                  ('author_channel', AuthorChannelPopTransFormer(gate = 2)),
                  ('topics', TopicsTransFormer(tps_list = topic_dic)),
                  ('topics_author', AuthorTopicsPopTransFormer(tps_list = topic_dic)),
                  ('std', StandardScaler()),
                  ("pca", PCA()),
                  ('clf', XGBClassifier(max_depth=5,learning_rate=0.001,objective="binary:logistic", eval_metric="error"))])

pipe2.fit(X_train, y_train)
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_val, pipe2.predict_proba(X_val)[:,1])
print(score)



0.5486324401332469


In [30]:
from xgboost import XGBClassifier
pipeR = Pipeline([('author', AuthorTransFormer(gate = 2)),
                  ('channel', ChannelTransFormer(gate = 2)),
                  ('author_channel', AuthorChannelPopTransFormer(gate = 2)),
                  ('topics', TopicsTransFormer(tps_list = topic_dic)),
                  ('topics_author', AuthorTopicsPopTransFormer(tps_list = topic_dic)),
                  ('std', StandardScaler()),
                  ("pca", PCA()),
                  ('clf', RandomForestClassifier(n_estimators=200, random_state=30))])

pipeR.fit(X_train, y_train)
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_val, pipeR.predict_proba(X_val)[:,1])
print(score)

0.5383687866492542


In [31]:
from xgboost import XGBClassifier
pipeNP = Pipeline([('author', AuthorTransFormer(gate = 2)),
                  ('channel', ChannelTransFormer(gate = 2)),
                  ('author_channel', AuthorChannelPopTransFormer(gate = 2)),
                  ('topics', TopicsTransFormer(tps_list = topic_dic)),
                  ('topics_author', AuthorTopicsPopTransFormer(tps_list = topic_dic)),
                  ('std', StandardScaler()),
                  ('clf', XGBClassifier(max_depth=5,learning_rate=0.001,objective="binary:logistic", eval_metric="error"))])

pipeNP.fit(X_train, y_train)
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_val, pipeNP.predict_proba(X_val)[:,1])
print(score)



0.5465968424899728


=============================== SPARSE =================================

In [28]:
pipeR = Pipeline([('author', AuthorTransFormer(gate = 2)),
                  ('channel', ChannelTransFormer(gate = 2)),
                  ('author_channel', AuthorChannelPopTransFormer(gate = 2)),
                  ('topics', TopicsTransFormer(tps_list = topic_dic)),
                  ('topics_author', AuthorTopicsPopTransFormer(tps_list = topic_dic)),
                  ('std', StandardScaler())])

In [55]:
training_sparsematrix_both = pipeR.fit_transform(X_train)
training_sparsematrix_together = sparse.hstack((training_sparsematrix_both, training_sparsematrix))

In [60]:
print(training_sparsematrix_together)

[[ 7.61663950e-01 -1.16004486e+00 -3.55345294e-01 ... -5.59961820e-01
   4.19285454e-02 -5.64675354e-01]
 [ 1.21723403e+00  1.72861227e+00 -6.88063286e-03 ...  2.93439337e-14
  -4.33509082e-01  2.35463275e+00]
 [-1.66804313e+00 -2.11660442e-01 -6.16693790e-01 ... -6.54841133e-01
  -8.72606965e-02  1.88694479e-01]
 ...
 [ 2.38048596e-03  1.97275959e-01 -4.42461459e-01 ... -1.66082954e-01
  -1.90506472e+00 -2.35076052e-01]
 [ 4.57950564e-01 -3.18444359e-02 -7.03809955e-01 ... -3.12235285e-01
  -8.74975774e-01 -2.35076052e-01]
 [-1.97175652e+00 -7.51108460e-01 -6.16693790e-01 ...  2.93439337e-14
  -1.37992134e+00 -3.76332896e-01]]


In [29]:
val_sparsematrix_both = pipeR.transform(X_val)
val_sparsematrix_together = sparse.hstack((val_sparsematrix_both, val_sparsematrix))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


ValueError: could not convert string to float: 'Founders of Flipboard Evan Doll and Mike McCue essentially met on a blind date at a coffee shop. They instantly knew they wanted to build something together, but they weren\'t sure what.  On this episode of the Valley Girl Show, Doll shares that the duo tried quite a few things before settling on the right product. The initial idea came from trying to re-invent the web browser.  "With startups you can always get rid of the product and build another one, but it\'s harder to change the team," he says. Design is another important element that came into play. "The iPad is what crystallized everything for us," Doll adds. It didn\'t take them long to realize that it would enable people to peruse content in a magazine-like way.  With so much happening on social networks these days, Flipboard is one of the only content curation products that has clearly cut through the noise. It\'s not just news, it\'s personalized news. "We try to bring relevancy to it," says Doll.  Doll taught the first ever class on iPhone app development at Stanford University. As an added bonus in this episode, Doll teaches us to code.  Jesse Draper is creator and host of The Valley Girl Show, through which she\'s become a spokesperson for startups and helped pioneer the way of new media content distribution. Formerly a Nickelodeon star, Draper is now CEO of Valley Girl, where she oversees the show and runs technology blog Lalawag.com. More Video from the Valley Girl Show  Copious Is the Social Market Where Everyone Knows Your Name This Couple Owns the Online Bedding Space Stream Radio Stations From Around the World  Image courtesy of Valley Girl TV'

In [None]:
val_sparsematrix_both = np.where(val_sparsematrix_both<0, abs(val_sparsematrix_both), val_sparsematrix_both)
val_sparsematrix_together = sparse.hstack((val_sparsematrix_both, val_sparsematrix))
train_sparsematrix_both = np.where(train_sparsematrix_both<0, abs(train_sparsematrix_both), train_sparsematrix_both)
train_sparsematrix_together = sparse.hstack((train_sparsematrix_both, train_sparsematrix))

In [74]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier
bag3 = BaggingClassifier(base_estimator=MultinomialNB(), n_estimators=500, 
                        max_samples=0.6, bootstrap=True,
                        max_features=1.0, bootstrap_features=False, 
                        n_jobs=1, random_state=1)
bag3.fit(train_sparsematrix_together, y_train)
score = roc_auc_score(y_val, bag3.predict_proba(test_sparsematrix_together)[:,1])
print(score)

0.55901877186734


In [34]:
training_sparsematrix_both = pipeR.fit_transform(X_train)
val_sparsematrix_both = pipeR.transform(X_val)
val_sparsematrix_both = np.where(val_sparsematrix_both<0, 0, val_sparsematrix_both)
val_sparsematrix_together = sparse.hstack((val_sparsematrix_both, val_sparsematrix))
training_sparsematrix_both = np.where(training_sparsematrix_both<0, 0, training_sparsematrix_both)
training_sparsematrix_together = sparse.hstack((training_sparsematrix_both, training_sparsematrix))

In [35]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier
bag4 = BaggingClassifier(base_estimator=MultinomialNB(), n_estimators=500, 
                        max_samples=0.6, bootstrap=True,
                        max_features=1.0, bootstrap_features=False, 
                        n_jobs=1, random_state=1)
bag4.fit(training_sparsematrix_together, y_train)
score = roc_auc_score(y_val, bag4.predict_proba(val_sparsematrix_together)[:,1])
print(score)


0.558858731455681


In [31]:
import pickle
pickle.dump(bag4, open('sparseMNB.pkl', 'wb'))

In [32]:
loaded_model = pickle.load(open('sparseMNB.pkl', 'rb'))
score = roc_auc_score(y_val, loaded_model.predict_proba(test_sparsematrix_together)[:,1])
print(score)

0.6228782133204835


In [33]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier
bag4 = BaggingClassifier(base_estimator=MultinomialNB(), n_estimators=500, 
                        max_samples=0.8, bootstrap=True,
                        max_features=1.0, bootstrap_features=False, 
                        n_jobs=1, random_state=1)
bag4.fit(train_sparsematrix_together, y_train)
score = roc_auc_score(y_val, bag4.predict_proba(test_sparsematrix_together)[:,1])
print(score)

0.6214532716915864


In [17]:
X_train = train_df_solve[['dayhour', 'author_post', 'links', 'popularity', 'article_len', 'topics', 'channel', 'author', 'label']]
y_train = train_df_solve['label']
# TF-IDF Fit
tfidf_Xtrain, y_train = train_df, train_df['label']
doc_words = pd.concat([tfidf_Xtrain['article'], tfidf_Xtrain['h1'], tfidf_Xtrain['title'], tfidf_Xtrain['h2']])
tfidf = TfidfVectorizer(preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)
tfidf_channel = TfidfVectorizer(preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)
tfidf_channel_topics = TfidfVectorizer(preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)
tfidf.fit(doc_words)
tfidf_channel.fit(tfidf_Xtrain['channel'])
tfidf_channel_topics.fit(pd.concat([tfidf_Xtrain['channel'], tfidf_Xtrain['topics']]))



TfidfVectorizer(preprocessor=<function preprocessor at 0x000001C26A596C18>,
                tokenizer=<function tokenizer_stem_nostop at 0x000001C253F2DEE8>)

In [18]:
test_df_solve = AuthorProblem(test_df, 'test')
tfidf_Xval = test_df_solve[['dayhour', 'author_post', 'links', 'popularity', 'article_len', 'topics', 'channel', 'author']]

In [19]:
# TF-IDF Transform
text_feature = ['article', 'h1', 'h2']
train_sparse = []
test_sparse = []
for i in text_feature:
    train_sparse.append(tfidf.transform(tfidf_Xtrain[i]))
    test_sparse.append(tfidf.transform(test_df[i]))
train_sparse.append(tfidf_channel.transform(tfidf_Xtrain['channel']))
test_sparse.append(tfidf_channel.transform(test_df['channel']))
train_sparse.append(tfidf_channel_topics.transform(tfidf_Xtrain['channel']))
test_sparse.append(tfidf_channel_topics.transform(test_df['channel']))
train_sparse.append(tfidf_channel_topics.transform(tfidf_Xtrain['topics']))
test_sparse.append(tfidf_channel_topics.transform(test_df['topics']))

In [20]:
from scipy import sparse
train_sparsematrix = sparse.hstack((train_sparse[0], train_sparse[1]))
train_sparsematrix = sparse.hstack((train_sparsematrix, train_sparse[2]))
train_sparsematrix2 = sparse.hstack((train_sparsematrix, train_sparse[4]))
train_sparsematrix2 = sparse.hstack((train_sparsematrix2, train_sparse[5]))
train_sparsematrix = sparse.hstack((train_sparsematrix, train_sparse[3]))
train_sparse[0].shape
train_sparsematrix.shape
train_sparsematrix2.shape
test_sparsematrix = sparse.hstack((test_sparse[0], test_sparse[1]))
test_sparsematrix = sparse.hstack((test_sparsematrix, test_sparse[2]))
test_sparsematrix2 = sparse.hstack((test_sparsematrix, test_sparse[4]))
test_sparsematrix2 = sparse.hstack((test_sparsematrix2, test_sparse[5]))
test_sparsematrix = sparse.hstack((test_sparsematrix, test_sparse[3]))

In [45]:
train_sparse[0].shape

(27643, 135449)

In [None]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier
bag_final = BaggingClassifier(base_estimator=MultinomialNB(), n_estimators=500, 
                        max_samples=0.6, bootstrap=True,
                        max_features=1.0, bootstrap_features=False, 
                        n_jobs=1, random_state=1)
bag_final.fit(train_sparsematrix2, y_train)
pickle.dump(bag_final, open('TFIDF_final.pkl', 'wb'))

In [None]:
from xgboost import XGBClassifier
pipe_final = Pipeline([('author', AuthorTransFormer(gate = 2)),
                  ('channel', ChannelTransFormer(gate = 2)),
                  ('author_channel', AuthorChannelPopTransFormer(gate = 2)),
                  ('topics', TopicsTransFormer(tps_list = topic_dic)),
                  ('topics_author', AuthorTopicsPopTransFormer(tps_list = topic_dic)),
                  ('std', StandardScaler()),
                  ("pca", PCA()),
                  ('clf', XGBClassifier(max_depth=5,learning_rate=0.001,objective="binary:logistic", eval_metric="error"))])

pipe_final.fit(X_train, y_train)
pickle.dump(pipe_final, open('Features_final.pkl', 'wb'))

In [24]:
import pickle
#pipe_final = pickle.load(open('featuresonly.pkl', 'rb'))
#bag_final = pickle.load(open('TFIDFonly.pkl', 'rb'))
pb_list = []
P_1 = pipe_final.predict_proba(tfidf_Xval)[:,1]
P_2 = bag_final.predict_proba(test_sparsematrix)[:,1]
for d in range(len(tfidf_Xval)):
    p1 = P_1[d]
    p2 = P_2[d]
    pb_list.append(p1*0.92 + p2*(1-0.92))
    
result = pb_list.round(1)
print(result[:,1])
submit_ans = pd.DataFrame()
submit_ans['Id'] = test_df['id']
submit_ans['Popularity'] = result[:,1]

print(submit_ans)
submit_ans.to_csv('./out.csv', index=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


ValueError: Number of features of the model must match the input. Model n_features is 348152 and input n_features is 406381.

In [40]:
train_sparsematrix_both = pipeR.fit_transform(X_train)
test_sparsematrix_both = pipeR.transform(tfidf_Xval)
test_sparsematrix_both = np.where(test_sparsematrix_both<0, 0, test_sparsematrix_both)
test_sparsematrix_together = sparse.hstack((test_sparsematrix_both, test_sparsematrix))
train_sparsematrix_both = np.where(train_sparsematrix_both<0, 0, train_sparsematrix_both)
train_sparsematrix_together = sparse.hstack((train_sparsematrix_both, train_sparsematrix))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [41]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier
bag5 = BaggingClassifier(base_estimator=MultinomialNB(), n_estimators=500, 
                        max_samples=0.6, bootstrap=True,
                        max_features=1.0, bootstrap_features=False, 
                        n_jobs=1, random_state=1)
bag5.fit(train_sparsematrix_together, y_train)

BaggingClassifier(base_estimator=MultinomialNB(), max_samples=0.6,
                  n_estimators=500, n_jobs=1, random_state=1)

In [42]:
result = bag5.predict_proba(test_sparsematrix_together)
result = result.round(1)
print(result[:,1])
submit_ans = pd.DataFrame()
submit_ans['Id'] = test_df['id']
submit_ans['Popularity'] = result[:,1]

print(submit_ans)
submit_ans.to_csv('./out.csv', index=0)

[0.3 0.5 0.3 ... 0.5 0.3 0.3]
          Id  Popularity
0      27643         0.3
1      27644         0.5
2      27645         0.3
3      27646         0.4
4      27647         0.6
...      ...         ...
11842  39485         0.7
11843  39486         0.6
11844  39487         0.5
11845  39488         0.3
11846  39489         0.3

[11847 rows x 2 columns]


=============================== ENSEMBLE =================================

In [24]:
def ensembleCV(model1, model2, alpha, val_df, X_val_other, X_val_text):
    max = 0
    maxpick = 0
    l = []
    for a in alpha:
        pb_list = []
        P_1 = model1.predict_proba(X_val_other)[:,1]
        P_2 = model2.predict_proba(X_val_text)[:,1]
        for d in range(len(val_df)):
            p1 = P_1[d]
            p2 = P_2[d]
            pb_list.append(p1*a + p2*(1-a))
        score = roc_auc_score(val_df, pb_list)
        print(score)
        if(score > max): 
            max = score 
            maxpick = a
    return(max, maxpick)

In [25]:
pipe1 = pickle.load(open('featuresonly.pkl', 'rb'))
ans = ensembleCV(pipe1, bag3, [(i / 100) for i in range(0, 100, 10)], y_val, X_val, val_sparsematrix2)
print(ans)

0.5545791344548965
0.554755120711208
0.5549559568787115
0.5552092863012391
0.555539784300443
0.5559883630251945
0.5565815600855721
0.5574036440328787
0.558668021481497
0.560191024235356
(0.560191024235356, 0.9)


In [26]:
pipe1 = pickle.load(open('featuresonly.pkl', 'rb'))
ans = ensembleCV(pipe1, bag3, [(i / 100) for i in range(85, 95, 1)], y_val, X_val, val_sparsematrix2)

0.5594287663037532
0.5596202910291238
0.5597929600841755
0.559986579884026
0.5601217703844927
0.560191024235356
0.5602557969546929
0.560321326228703
0.5602285027899409
0.5600164346953646


0.92