# 1. Fully connected neural network

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import f1_score, classification_report, confusion_matrix


In [2]:
# Load the Adult dataset, drop the `fnlwgt` column and discard every row that
# has a missing value. The raw file marks missing entries with '?', so those
# are converted to NaN first so that `dropna` can see them.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df = (
    pd.read_csv('adult.csv')
    .drop(columns='fnlwgt')
    .replace('?', np.nan)
    .dropna()
)

In [3]:
# Columns holding string labels; each one is converted to a pandas Categorical
# so that it can be one-hot encoded (features) or turned into integer codes
# (`income`, the target) further down.
CATEGORICAL_COLUMNS = (
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native-country',
    'income',
)

for column in CATEGORICAL_COLUMNS:
    df[column] = pd.Categorical(df[column])

In [4]:
# One-hot encode every categorical feature in a single call. `income` is the
# target, so it is left untouched as a categorical column.
feature_columns = [
    column
    for column in df.select_dtypes(include='category').columns
    if column != 'income'
]

# The dummy columns are prefixed with the source column name (the default when
# `columns=` is given) and appended after the remaining columns, which matches
# the layout produced by concatenating per-column dummies and dropping the
# source column.
# An explicit integer dtype is requested because recent pandas versions emit
# boolean dummies by default; a frame mixing bool and float columns converts to
# an object array, which the network cannot be trained on.
df = pd.get_dummies(df, columns=feature_columns, dtype='uint8')

In [5]:
# Hold out 20% of the rows for testing. The features are every column except
# `income`; the target is the integer category code of `income`.
# The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='income'),
    df['income'].cat.codes,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Standardise the integer-valued feature columns. The scaler is fitted on the
# training split only and then applied to both splits, so no statistics from
# the test rows leak into training.
numeric_columns = df.select_dtypes(include='int64').columns

# The frames returned by train_test_split are derived from `df`; take explicit
# copies before writing into them to avoid pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# StandardScaler standardises each column independently, so a single scaler
# over all numeric columns yields the same values as one scaler per column.
scaler = StandardScaler().fit(X_train[numeric_columns])
X_train[numeric_columns] = scaler.transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

In [7]:
# Fully connected binary classifier: two ReLU hidden layers (32 and 8 units)
# followed by a single sigmoid output unit.
# The input width is read from the training frame instead of being hard-coded,
# so the network still builds when the number of one-hot columns changes.
model = Sequential([
    Dense(32, input_dim=X_train.shape[1], activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [8]:
# Configure training: Adam optimiser, binary cross-entropy loss, and accuracy
# reported as the per-epoch metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [9]:
# Train on the training split for 25 passes over the data.
model.fit(x=X_train, y=y_train, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x1062115c0>

In [10]:
# Score the held-out rows, then threshold the sigmoid outputs at 0.5 to get
# boolean class predictions.
y_prob = model.predict(X_test)
y_pred = y_prob > 0.5

In [11]:
# F1 score of the positive class on the test split.
f1_score(y_true=y_test, y_pred=y_pred, average='binary')

0.6873485215705283

In [12]:
# Per-class precision, recall and F1 on the test split.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.89      0.93      0.91      6842
           1       0.74      0.64      0.69      2203

    accuracy                           0.86      9045
   macro avg       0.81      0.78      0.80      9045
weighted avg       0.85      0.86      0.85      9045



In [13]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[6337  505]
 [ 785 1418]]


### The fully connected NN lost to AdaBoost and Gradient Boosting but outperformed all the other models

# 2. CNN (see the separate notebook `2_CNN.ipynb`)

# 3. RNN

In [14]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, GlobalMaxPooling1D, SimpleRNN
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
# Load the clickbait dataset; the cells below read its `headline` (text) and
# `clickbait` (label) columns.
data = pd.read_csv("clickbait_data.csv")

In [16]:
# Raw headlines and their labels as NumPy arrays.
text, labels = data['headline'].values, data['clickbait'].values

# Reproducible 80/20 train/test split of the raw text and labels.
text_train, text_test, y_train, y_test = train_test_split(
    text,
    labels,
    test_size=0.2,
    random_state=42,
)
print(text_train.shape, text_test.shape)

(25600,) (6400,)


In [17]:
# Length, in characters, of the longest headline in the dataset.
data['headline'].str.len().max()

135

In [18]:
# Text-preprocessing hyperparameters.
vocab_size = 5000      # passed to the tokenizer as num_words and used as the embedding input size
# NOTE(review): 135 is the maximum headline length in *characters* computed in
# the previous cell, whereas pad_sequences measures length in tokens - confirm
# that this much padding is intended.
maxlen = 135
embedding_size = 32    # width of each embedding vector

# The vocabulary is learned from the training headlines only.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(text_train)

# Convert each headline to a sequence of word indices and pad to a common length.
X_train = pad_sequences(tokenizer.texts_to_sequences(text_train), maxlen=maxlen)
x_test = pad_sequences(tokenizer.texts_to_sequences(text_test), maxlen=maxlen)

In [19]:
# Recurrent classifier: embedding -> SimpleRNN emitting an output per time
# step -> max over the time axis -> dropout -> single sigmoid unit.
model = Sequential([
    Embedding(vocab_size, embedding_size, input_length=maxlen),
    SimpleRNN(32, return_sequences=True),
    GlobalMaxPooling1D(),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 135, 32)           160000    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 135, 32)           2080      
_________________________________________________________________
global_max_pooling1d (Global (None, 32)                0         
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 162,113
Trainable params: 162,113
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Adam optimiser with binary cross-entropy loss; accuracy is reported each epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs in batches of 512 and keep the History object for inspection.
# NOTE(review): validation_data is the same test split that is scored in the
# next cell, so the validation metrics are not an independent estimate.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=512,
    validation_data=(x_test, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
# Threshold the sigmoid outputs at 0.5 and report test accuracy as a percentage
# rounded to two decimals.
probabilities = model.predict(x_test)
prediction = probabilities > 0.5
accuracy = accuracy_score(y_test, prediction)
print("accuracy: {}%".format(round(accuracy * 100, 2)))

accuracy: 96.84%
