In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

In [2]:
df = pd.read_csv("OnlineNewsPopularity.csv")
df = df.drop(columns=["url", " timedelta"])
df.columns

Index([' n_tokens_title', ' n_tokens_content', ' n_unique_tokens',
       ' n_non_stop_words', ' n_non_stop_unique_tokens', ' num_hrefs',
       ' num_self_hrefs', ' num_imgs', ' num_videos', ' average_token_length',
       ' num_keywords', ' data_channel_is_lifestyle',
       ' data_channel_is_entertainment', ' data_channel_is_bus',
       ' data_channel_is_socmed', ' data_channel_is_tech',
       ' data_channel_is_world', ' kw_min_min', ' kw_max_min', ' kw_avg_min',
       ' kw_min_max', ' kw_max_max', ' kw_avg_max', ' kw_min_avg',
       ' kw_max_avg', ' kw_avg_avg', ' self_reference_min_shares',
       ' self_reference_max_shares', ' self_reference_avg_sharess',
       ' weekday_is_monday', ' weekday_is_tuesday', ' weekday_is_wednesday',
       ' weekday_is_thursday', ' weekday_is_friday', ' weekday_is_saturday',
       ' weekday_is_sunday', ' is_weekend', ' LDA_00', ' LDA_01', ' LDA_02',
       ' LDA_03', ' LDA_04', ' global_subjectivity',
       ' global_sentiment_polarity', ' gl

In [3]:
df[' shares'].values[df[' shares'].values < 1400] = 0 #unpopular
df[' shares'].values[df[' shares'].values >= 1400] = 1 #popular
print (sum(df[' shares'].values==0))
print (sum(df[' shares'].values==1))
print (sum(df[' shares'].values==1400))
df[' shares']

18490
21154
0


0        0
1        0
2        1
3        0
4        0
        ..
39639    1
39640    1
39641    1
39642    0
39643    0
Name:  shares, Length: 39644, dtype: int64

In [4]:
results = pd.DataFrame(columns=["Algorithm", "n_estimators", "eta", "max_depth", "n_neighbors", "AUC"])
#result = pd.DataFrame(columns=["Algorithm", "AUC"])

In [6]:
#XGB
n_estimators = [100, 200, 500, 1000]
eta = [0.01, 0.1, 0.3]
max_depth = [2, 3, 4, 5, 6]
for eta in eta:
    for estimators in n_estimators:
        for depth in max_depth:
            model = XGBClassifier(random_state = 42, n_estimators = estimators, eta = eta, max_depth = depth)
            from sklearn.model_selection import StratifiedKFold
            skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
            target = df.loc[:," shares"]
            aucs=[]
            for train_index, test_index in skf.split(df, target):
                train = df.loc[train_index,:]
                test = df.loc[test_index,:]
                model.fit(train.iloc[:, :-1], train.iloc[:, -1])
                predictions = model.predict_proba(test.iloc[:, :-1])
                #print(roc_auc_score(test.iloc[:, -1], predictions, multi_class="ovr"))
                aucs.append(roc_auc_score(test.iloc[:, -1], predictions[:, 1], multi_class="ovr"))
            print("Average: " + str(sum(aucs)/len(aucs)))
            auc = str(sum(aucs)/len(aucs))
            results=results.append({"Algorithm": ["XGBoost"], "n_estimators": [estimators], "eta": [eta], "max_depth": [depth], "n_neighbors": [0], "AUC": [auc]}, ignore_index=True)








































Average: 0.6912349851454342








































Average: 0.7014976377199297








































Average: 0.7077368148597878








































Average: 0.7129394189708307








































Average: 0.7176280322700459




































Average: 0.7023775851246586








































Average: 0.7128825288220939








































Average: 0.7189071569142593








































Average: 0.7230897512346386








































Average: 0.7266700607061811








































Average: 0.7175368059995033








































Average: 0.726706159942169








































Average: 0.7316726963078626








































Average: 0.734193091348933








































Average: 0.7362430497227083








































Average: 0.7259714444760074




































Average: 0.7329298116505202








































Average: 0.7367978323024971








































Average: 0.7383619562707997








































Average: 0.7388573738875408








































Average: 0.725804696413831








































Average: 0.732680575649143








































Average: 0.7353307154454533








































Average: 0.7371472799962483








































Average: 0.7374196300612881








































Average: 0.7315105760034916








































Average: 0.7368584679993725








































Average: 0.7378419607002409








































Average: 0.7372936727157015








































Average: 0.7365489461360084








































Average: 0.7359959506223424








































Average: 0.7390036405994846








































Average: 0.7368457981841929








































Average: 0.7345449529982806








































Average: 0.7319102189523254








































Average: 0.7371852123351488








































Average: 0.7373386132207853








































Average: 0.7327409762194138








































Average: 0.7301959169455757








































Average: 0.7277254397787152








































Average: 0.7330849427677808








































Average: 0.7350176916259108








































Average: 0.7326825790017717








































Average: 0.7272201038712758








































Average: 0.7234704700779562








































Average: 0.7354272295298239








































Average: 0.7344723015482213








































Average: 0.7285466384148817








































Average: 0.7215971052888367








































Average: 0.7169002640750985








































Average: 0.7346228526363291








































Average: 0.7276991162154509








































Average: 0.7183884517393886








































Average: 0.7121868581911683








































Average: 0.7103259578130242








































Average: 0.731450901852554








































Average: 0.7205444988226002








































Average: 0.7099172997924668








































Average: 0.7085278576463814








































Average: 0.7115609426382133


In [7]:
results.to_csv("xgb_cross.csv")

In [5]:
#DECISION TREE
max_depth = [2, 5, 7, 10, 15, 20]
for depth in max_depth:
    model = DecisionTreeClassifier(random_state = 42, max_depth = depth)
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    target = df.loc[:," shares"]
    aucs=[]
    for train_index, test_index in skf.split(df, target):
        train = df.loc[train_index,:] 
        test = df.loc[test_index,:]    
        model.fit(train.iloc[:, :-1], train.iloc[:, -1])
        predictions = model.predict_proba(test.iloc[:, :-1])
        #print(roc_auc_score(test.iloc[:, -1], predictions, multi_class="ovr"))
        aucs.append(roc_auc_score(test.iloc[:, -1], predictions[:, 1], multi_class="ovr"))
    print("Average: " + str(sum(aucs)/len(aucs)))
    auc = str(sum(aucs)/len(aucs))
    results=results.append({"Algorithm": ["Decision Tree"], "n_estimators": [0], "eta": [0], "max_depth": [depth], "n_neighbors": [0], "AUC": [auc]}, ignore_index=True)

Average: 0.6302017836785305
Average: 0.6814030587517437
Average: 0.6895785844291027
Average: 0.6701218888510903
Average: 0.6057070532226595
Average: 0.5793072720015208


In [39]:
#RANDOM FOREST
n_estimators = [100, 200, 500, 1000]
max_depth =  [2, 3, 4, 5, 6]
for estimators in n_estimators:
    for depth in max_depth:
        model = RandomForestClassifier(random_state = 42, n_estimators = estimators, max_depth = depth)
        from sklearn.model_selection import StratifiedKFold
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        target = df.loc[:," shares"]
        aucs=[]
        for train_index, test_index in skf.split(df, target):
            train = df.loc[train_index,:]
            test = df.loc[test_index,:]
            model.fit(train.iloc[:, :-1], train.iloc[:, -1])
            predictions = model.predict_proba(test.iloc[:, :-1])
            #print(roc_auc_score(test.iloc[:, -1], predictions, multi_class="ovr"))
            aucs.append(roc_auc_score(test.iloc[:, -1], predictions[:, 1], multi_class="ovr"))
        print("Average: " + str(sum(aucs)/len(aucs)))
        auc = str(sum(aucs)/len(aucs))
        results=results.append({"Algorithm": ["Random Forest"], "n_estimators": [estimators], "eta": [0], "max_depth": [depth], "n_neighbors": [0], "AUC": [auc]}, ignore_index=True)

Average: 0.6889626073302224
Average: 0.6992657853312096
Average: 0.707344090915978
Average: 0.7119653244760556
Average: 0.7171685129745667
Average: 0.6874245803968672
Average: 0.6982409432425948
Average: 0.7068533818736931
Average: 0.7121589832503258
Average: 0.7173843167861123
Average: 0.6877071379644317
Average: 0.6977047844868807
Average: 0.7063203418854063
Average: 0.7123995141231649
Average: 0.7175392898894487
Average: 0.6877392377641289
Average: 0.6980240542401674
Average: 0.7063848423825637
Average: 0.7124398259399192
Average: 0.7175563948861732


In [47]:
#KNN
n_neighbors = [3, 5, 7, 11]
for neighbors in n_neighbors:
        model = KNeighborsClassifier(n_neighbors = neighbors)
        from sklearn.model_selection import StratifiedKFold
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        target = df.loc[:," shares"]
        aucs=[]
        for train_index, test_index in skf.split(df, target):
            train = df.loc[train_index,:]
            test = df.loc[test_index,:]
            model.fit(train.iloc[:, :-1], train.iloc[:, -1])
            predictions = model.predict_proba(test.iloc[:, :-1])
            #print(roc_auc_score(test.iloc[:, -1], predictions, multi_class="ovr"))
            aucs.append(roc_auc_score(test.iloc[:, -1], predictions[:, 1], multi_class="ovr"))
        print("Average: " + str(sum(aucs)/len(aucs)))
        auc = str(sum(aucs)/len(aucs))
        results=results.append({"Algorithm": ["KNN"], "n_estimators": [0], "eta": [0], "max_depth": [0], "n_neighbors": [neighbors], "AUC": [auc]}, ignore_index=True)


Average: 0.5787578874120692
Average: 0.589177756807866
Average: 0.597778722574276
Average: 0.6074693002468139


In [48]:
results.to_csv("knn_cross.csv")

In [34]:
def classi(my_classifier,i,X_train,X_test,y_train,y_test):
    print(i)
    my_classifier.fit(X_train,y_train)
    predictions = my_classifier.predict(X_test)
    pred_prob1 = my_classifier.predict_proba(X_test)
    #print (i,"prediction",predictions)
    print (i,"accuracy",accuracy_score(y_test,predictions)) #add AUC score
    auc_score1 = roc_auc_score(y_test, pred_prob1[:, 1], multi_class='ovr')
    print(i,"AUC score",auc_score1)
    return (auc_score1)

def alg(df):
    X = df.iloc[:,:-1].values
    y = df.iloc[:,-1:].values
    aucs = []
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= .3)
    print(len(X_train),len(X_test))
    print(y_train, y_test)
    
    names = ["Decision tree","K Nearest Neighbours","XGBoost","Random forest"]
    classifiers = [DecisionTreeClassifier(),KNeighborsClassifier(),XGBClassifier(),RandomForestClassifier()]
    for x,i in zip(classifiers,names):
        my_classifier = x
        auc = [classi(my_classifier,i,X_train,X_test,y_train,y_test)]
        #result=result.append({"Algorithm": [i], "AUC": [auc]}, ignore_index=True)
    
alg(df)

27750 11894
[[1]
 [1]
 [1]
 ...
 [1]
 [0]
 [1]] [[0]
 [0]
 [0]
 ...
 [1]
 [1]
 [0]]
Decision tree
Decision tree accuracy 0.5895409450142929
Decision tree AUC score 0.5877117965526523
K Nearest Neighbours


  my_classifier.fit(X_train,y_train)


K Nearest Neighbours accuracy 0.5710442239784765
K Nearest Neighbours AUC score 0.5859104398281437
XGBoost


  return f(**kwargs)


XGBoost accuracy 0.6601647889692281
XGBoost AUC score 0.7140270532865873
Random forest


  my_classifier.fit(X_train,y_train)


Random forest accuracy 0.6652934252564319
Random forest AUC score 0.7212941352840366


In [None]:
results.to_csv("without parameters.csv")