In [None]:
from sklearn import svm
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import validation_curve, learning_curve

import matplotlib.pyplot as plt

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Both CSV files are read from the same dataset directory, one DataFrame per class.
DATASET_DIR = "/kaggle/input/wikipedia-promotional-articles"
promotional_articles = pd.read_csv(DATASET_DIR + "/promotional.csv")
good_articles = pd.read_csv(DATASET_DIR + "/good.csv")


In [None]:
# Display (rows, columns) of each raw DataFrame.
(promotional_articles.shape, good_articles.shape)

In [None]:
# Fraction of each class drawn into the working corpus.
SAMPLING_FRACTION = 0.1

# Draw the same fraction from each class, keeping only the "text" column.
# Promotional articles are sampled first, then good ones.
promotional_sample = promotional_articles.sample(frac=SAMPLING_FRACTION)["text"]
good_sample = good_articles.sample(frac=SAMPLING_FRACTION)["text"]

# Series.append was removed in pandas 2.0; pd.concat stacks the two Series in
# the same order (promotional rows first, then good rows).
corpus = pd.concat([promotional_sample, good_sample])

# Labels follow the concatenation order: 1.0 = promotional, 0.0 = good.
labels = np.append(
    np.ones(promotional_sample.shape[0]),
    np.zeros(good_sample.shape[0]),
)

corpus.shape, labels.shape

# Feature extraction

In [None]:
%%time
# Tokenize each document with NLTK, lowercase every token and stem it with the
# Lancaster stemmer. (The result is named "lemmatized", but the transformation
# applied here is stemming.)
import nltk
from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()

lemmatized_corpus = []
i = 0
for i, sentence in enumerate(corpus, start=1):
    tokens = nltk.word_tokenize(sentence)
    stemmed_tokens = [stemmer.stem(token.lower()) for token in tokens]
    lemmatized_corpus.append(" ".join(stemmed_tokens))
    # Progress marker after every 1000 documents.
    if i % 1000 == 0:
        print (f"Reached loop : {i}")


In [None]:
%%time

from collections import defaultdict

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Count how often each space-separated token occurs across the stemmed corpus.
counter = defaultdict(int)
for document in lemmatized_corpus:
    for token in document.split(" "):
        counter[token] += 1

# The vocabulary is every token that occurs more than 20 times.
vocab = {token for token, occurrences in counter.items() if occurrences > 20}

# Raw term counts restricted to the vocabulary, followed by TF-IDF weighting.
count_step = CountVectorizer(vocabulary=vocab)
tfidf_step = TfidfTransformer()
pipe = Pipeline([('count', count_step), ('tfid', tfidf_step)])
pipe.fit(lemmatized_corpus)

# `features`: dense count matrix taken from the first pipeline step only.
features = pipe['count'].transform(lemmatized_corpus).toarray()
# `tfidf_features`: output of the full pipeline (counts, then TF-IDF).
tfidf_features = pipe.transform(lemmatized_corpus)


In [None]:
# Display the shapes of the count matrix and the TF-IDF matrix.
(features.shape, tfidf_features.shape)

In [None]:
def get_samples(sample_size=None, random_state=None):
    """Draw a random subset of rows, without replacement, from the TF-IDF data.

    Reads the notebook-level ``tfidf_features`` and ``labels`` at call time and
    indexes both with the same randomly chosen row positions.

    Parameters
    ----------
    sample_size : int or None
        Number of rows to draw. ``None`` draws every row (in random order).
    random_state : int or None
        Seed for a private ``np.random.RandomState`` so the draw can be
        reproduced. ``None`` (the default) uses NumPy's global generator,
        exactly as before this parameter existed.

    Returns
    -------
    tuple
        ``(tfidf_features[choices], labels[choices])``.

    Raises
    ------
    ValueError
        If ``sample_size`` exceeds the number of available rows.
    """
    n_rows = labels.shape[0]
    if sample_size is None:
        sample_size = n_rows

    # np.random.choice raises ValueError here as well; checking first gives a
    # message that names the actual sizes involved.
    if sample_size > n_rows:
        raise ValueError(
            f"sample_size={sample_size} exceeds the {n_rows} available rows"
        )

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    choices = rng.choice(np.arange(n_rows), sample_size, replace=False)
    return tfidf_features[choices], labels[choices]

## Tuning Decision Tree hyperparameter - max depth

In [None]:
%%time

def plot_decision_tree_validations(sample_size, depths, cv=5):
    """Print and plot a validation curve over ``max_depth`` for a decision tree.

    Parameters
    ----------
    sample_size : int or None
        Forwarded to ``get_samples`` to choose how many rows are used.
    depths : sequence
        Candidate ``max_depth`` values; also used as the x-axis.
    cv : int
        Passed through to ``validation_curve`` as its ``cv`` argument.
    """
    features, labels = get_samples(sample_size)

    estimator = tree.DecisionTreeClassifier()
    train_scores, valid_scores = validation_curve(
        estimator,
        features,
        labels,
        param_name="max_depth",
        param_range=depths,
        cv=cv,
    )

    # One mean score per candidate depth (averaged along axis 1).
    mean_train = train_scores.mean(axis=1)
    mean_valid = valid_scores.mean(axis=1)

    print(f"depths : {depths}")
    print(f"train_scores : {mean_train}")
    print(f"valid_scores : {mean_valid}")
    print("--" * 10)

    plt.clf()
    plt.plot(depths, mean_valid, 'ro-', label="Validation Score")
    plt.plot(depths, mean_train, 'go-', label="Training Score")
    plt.xlabel("Max depth hyperparameter")
    plt.ylabel("Training/Validation Score")
    plt.title(f"Validation curve for Decision Tree (Sample size: {sample_size})")
    plt.legend()
    plt.show()
    
# Sweep max_depth over 10, 20, ..., 190 on a 1000-row sample.
plot_decision_tree_validations(
    sample_size=1000,
    depths=np.arange(10, 200, 10),
)

## Tuning Boosting hyperparameter - number of weak learners

In [None]:
%%time

def plot_boosting_validations(sample_size, num_weak_learners_range, cv=5):
    """Print and plot a validation curve over ``n_estimators`` for AdaBoost.

    Parameters
    ----------
    sample_size : int or None
        Forwarded to ``get_samples`` to choose how many rows are used.
    num_weak_learners_range : sequence
        Candidate ``n_estimators`` values; also used as the x-axis.
    cv : int
        Passed through to ``validation_curve`` as its ``cv`` argument.
    """
    features, labels = get_samples(sample_size)

    estimator = AdaBoostClassifier()
    train_scores, valid_scores = validation_curve(
        estimator,
        features,
        labels,
        param_name="n_estimators",
        param_range=num_weak_learners_range,
        cv=cv,
    )

    # One mean score per candidate ensemble size (averaged along axis 1).
    mean_train = train_scores.mean(axis=1)
    mean_valid = valid_scores.mean(axis=1)

    print(f"num_weak_learners : {num_weak_learners_range}")
    print(f"train_scores : {mean_train}")
    print(f"valid_scores : {mean_valid}")
    print("--" * 10)

    plt.clf()
    plt.plot(num_weak_learners_range, mean_valid, 'ro-', label="Validation Score")
    plt.plot(num_weak_learners_range, mean_train, 'go-', label="Training Score")
    plt.xlabel("Hyperparameter (Number of weak learners)")
    plt.ylabel("Training/Validation Score")
    plt.title(f"Validation curve for Boosting (Sample size: {sample_size})")
    plt.legend()
    plt.show()
    
# Sweep n_estimators over 10, 20, ..., 90 on a 1000-row sample.
plot_boosting_validations(
    sample_size=1000,
    num_weak_learners_range=np.arange(10, 100, 10),
)


## Tuning KNN algorithm

In [None]:
%%time

def plot_knn_validations(sample_size, k_range, cv=5):
    """Print and plot a validation curve over ``n_neighbors`` for KNN.

    Parameters
    ----------
    sample_size : int or None
        Forwarded to ``get_samples`` to choose how many rows are used.
    k_range : sequence
        Candidate ``n_neighbors`` values; also used as the x-axis.
    cv : int
        Passed through to ``validation_curve`` as its ``cv`` argument.
    """
    features, labels = get_samples(sample_size)

    estimator = KNeighborsClassifier()
    train_scores, valid_scores = validation_curve(
        estimator,
        features,
        labels,
        param_name="n_neighbors",
        param_range=k_range,
        cv=cv,
    )

    # One mean score per candidate K (averaged along axis 1).
    mean_train = train_scores.mean(axis=1)
    mean_valid = valid_scores.mean(axis=1)

    print(f"k_range : {k_range}")
    print(f"train_scores : {mean_train}")
    print(f"valid_scores : {mean_valid}")
    print("--" * 10)

    plt.clf()
    plt.plot(k_range, mean_valid, 'ro-', label="Validation Score")
    plt.plot(k_range, mean_train, 'go-', label="Training Score")
    plt.xlabel("Hyperparameter (Number of neighbours (K))")
    plt.ylabel("Training/Validation Score")
    plt.title(f"Validation curve for KNN (Sample size: {sample_size})")
    plt.legend()
    plt.show()
    
# Sweep K over the odd values 1, 3, ..., 19 on a 1000-row sample.
plot_knn_validations(
    sample_size=1000,
    k_range=np.arange(1, 20, 2),
)

## Tuning NN hidden layers

In [None]:
%%time

def plot_nn_validations(sample_size, hidden_layers, layer_width=10, cv=5):
    """Print and plot a validation curve over MLP hidden-layer shapes.

    Each candidate is a two-layer network ``(n, layer_width)``: ``n`` units in
    the first hidden layer (taken from ``hidden_layers``) followed by a second
    hidden layer of ``layer_width`` units.

    Parameters
    ----------
    sample_size : int or None
        Forwarded to ``get_samples`` to choose how many rows are used.
    hidden_layers : sequence of int
        Candidate sizes for the first hidden layer; also used as the x-axis.
    layer_width : int
        Size of the second hidden layer, shared by every candidate.
    cv : int
        Passed through to ``validation_curve`` as its ``cv`` argument.
    """
    features, labels = get_samples(sample_size)

    # Build the (first_layer, second_layer) tuples as plain Python ints, kept
    # in a list so validation_curve iterates over whole tuples.
    hidden_layers_range = [(int(units), int(layer_width)) for units in hidden_layers]

    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1, max_iter=4000)
    # The tuples must be what is swept: passing the raw `hidden_layers` ints
    # would tune a single hidden layer and ignore `layer_width` entirely.
    train_scores, valid_scores = validation_curve(
        clf, features, labels, param_name="hidden_layer_sizes", param_range=hidden_layers_range, cv=cv)

    mean_train = train_scores.mean(axis=1)
    mean_valid = valid_scores.mean(axis=1)

    print(f"hidden_layers : {hidden_layers}")
    print(f"hidden_layer_sizes : {hidden_layers_range}")
    print(f"train_scores : {mean_train}")
    print(f"valid_scores : {mean_valid}")
    print("--" * 10)

    plt.clf()
    plt.plot(hidden_layers, mean_valid, 'ro-', label="Validation Score")
    plt.plot(hidden_layers, mean_train, 'go-', label="Training Score")
    # The label describes what is actually varied: the first layer's size.
    plt.xlabel(f"Hyperparameter (Units in first hidden layer; second layer width={layer_width})")
    plt.ylabel("Training/Validation Score")
    plt.title(f"Validation curve for NN (Sample size: {sample_size})")
    plt.legend()
    plt.show()
    
# Sweep the hidden_layers values 1, 21, ..., 181 with layer_width=10 on a 1000-row sample.
plot_nn_validations(
    sample_size=1000,
    hidden_layers=np.arange(1, 200, 20),
    layer_width=10,
)

## Learning curves

In [None]:
def plotLearningCurve(clf, name, sample_size, train_sizes, cv=5):
    """Print and plot a learning curve for ``clf`` on a random data sample.

    Parameters
    ----------
    clf : estimator
        Classifier handed to ``learning_curve``.
    name : str
        Human-readable classifier name used in the printout and plot title.
    sample_size : int or None
        Forwarded to ``get_samples`` to choose how many rows are used.
    train_sizes : sequence
        Training-set sizes passed to ``learning_curve``.
    cv : int
        Passed through to ``learning_curve`` as its ``cv`` argument.
    """
    features, labels = get_samples(sample_size)

    train_sizes_used, train_scores, valid_scores = learning_curve(
         clf, features, labels, train_sizes=train_sizes, cv=cv)

    mean_train = train_scores.mean(axis=1)
    mean_valid = valid_scores.mean(axis=1)

    print(f"Learning curve for {name}")
    print(f"train_sizes : {train_sizes_used}")
    print(f"train_scores : {mean_train}")
    print(f"valid_scores : {mean_valid}")
    print("--" * 10)

    plt.clf()
    plt.plot(train_sizes_used, mean_valid, 'ro-', label="Validation Score")
    plt.plot(train_sizes_used, mean_train, 'go-', label="Training Score")
    plt.xlabel("Training sample size")
    # The plotted values are the scores returned by learning_curve, not errors.
    plt.ylabel("Training/Validation Score")
    plt.title(f"Learning curve for {name} classifier (Sample size: {sample_size})")
    plt.legend()
    plt.show()

In [None]:
## Tuned classifiers.
# One estimator per model family, with fixed hyperparameters.
# NOTE(review): values were presumably read off the validation curves above - confirm.

dt_clf = tree.DecisionTreeClassifier(max_depth=25)
boosting_clf = AdaBoostClassifier(n_estimators=100)
knn_clf = KNeighborsClassifier(n_neighbors=1)
svm_clf = svm.SVC()  # library defaults
nn_clf = MLPClassifier(
    hidden_layer_sizes=(50, 10),
    solver='lbfgs',
    alpha=1e-5,
    max_iter=4000,
    random_state=1,
)

## Sample size = 1000

In [None]:
%%time

# Settings shared by every learning-curve cell in this section.
sample_size = 1000
train_sizes = np.arange(100, 800, 50)

# Decision tree first, then KNN.
for curve_clf, curve_name in ((dt_clf, "Decision Tree"), (knn_clf, "KNN")):
    plotLearningCurve(curve_clf, curve_name, sample_size=sample_size, train_sizes=train_sizes)


In [None]:
%%time

# Learning curve for a default-parameter SVC with the current sample_size / train_sizes.
plotLearningCurve(
    svm.SVC(),
    "SVM",
    sample_size=sample_size,
    train_sizes=train_sizes,
)


In [None]:
%%time 

# Learning curve for the AdaBoost classifier with the current sample_size / train_sizes.
plotLearningCurve(
    boosting_clf,
    "Boosting",
    sample_size=sample_size,
    train_sizes=train_sizes,
)


In [None]:
%%time

# Learning curve for the MLP classifier with the current sample_size / train_sizes.
plotLearningCurve(
    nn_clf,
    "Neural nets",
    sample_size=sample_size,
    train_sizes=train_sizes,
)


## Sample size = 5000

In [None]:
# Settings shared by the learning-curve cells below: 5000 rows, with training
# sizes 400, 800, ..., 3600.
sample_size, train_sizes = 5000, np.arange(400, 4000, 400)

In [None]:
%%time
# Learning curve for the decision tree with the current sample_size / train_sizes.
plotLearningCurve(
    dt_clf,
    "Decision Tree",
    sample_size=sample_size,
    train_sizes=train_sizes,
)


In [None]:
%%time
# Learning curve for the KNN classifier with the current sample_size / train_sizes.
plotLearningCurve(
    knn_clf,
    "KNN",
    sample_size=sample_size,
    train_sizes=train_sizes,
)


In [None]:
%%time
# Learning curve for the AdaBoost classifier with the current sample_size / train_sizes.
plotLearningCurve(
    boosting_clf,
    "Boosting",
    sample_size=sample_size,
    train_sizes=train_sizes,
)


In [None]:
%%time
# Learning curve for a default-parameter SVC with the current sample_size / train_sizes.
plotLearningCurve(
    svm.SVC(),
    "SVM",
    sample_size=sample_size,
    train_sizes=train_sizes,
)


In [None]:
%%time
# Learning curve for the MLP classifier with the current sample_size / train_sizes.
plotLearningCurve(
    nn_clf,
    "Neural nets",
    sample_size=sample_size,
    train_sizes=train_sizes,
)


## Testing & Performance

In [None]:
from datetime import datetime
from sklearn.metrics import accuracy_score

# Hold-out evaluation: draw 5000 random rows, train on the first 4500 and test
# on the remaining 500.
#
# The draw is bound to fresh names on purpose. get_samples() reads the
# notebook-level `labels`, so rebinding `labels` (and `features`) to this
# subsample would leave `labels` misaligned with `tfidf_features` for any later
# get_samples() call or re-run of this cell, silently pairing rows with the
# wrong labels.
EVAL_SAMPLE_SIZE = 5000
TRAIN_SPLIT_SIZE = 4500

eval_features, eval_labels = get_samples(EVAL_SAMPLE_SIZE)
train_features, train_labels = eval_features[:TRAIN_SPLIT_SIZE], eval_labels[:TRAIN_SPLIT_SIZE]
test_features, test_labels = eval_features[TRAIN_SPLIT_SIZE:], eval_labels[TRAIN_SPLIT_SIZE:]

def train(clf):
    """Fit ``clf`` on the notebook-level training split and report the wall time.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(features, labels)``.

    Returns
    -------
    Whatever ``clf.fit`` returns.
    """
    started = datetime.now()
    fitted = clf.fit(train_features, train_labels)
    elapsed = datetime.now() - started
    print (f"Trained in {elapsed} time")
    return fitted
    
def test(clf):
    """Predict on the notebook-level test split and report accuracy and wall time.

    The reported duration covers both the prediction and the accuracy
    computation.

    Parameters
    ----------
    clf : estimator
        Fitted object exposing ``predict(features)``.

    Returns
    -------
    The value of ``accuracy_score(test_labels, predictions)``.
    """
    started = datetime.now()
    predictions = clf.predict(test_features)
    accuracy = accuracy_score(test_labels, predictions)
    elapsed = datetime.now() - started
    print (f"Predicted in {elapsed} time with accuracy score: {accuracy}")
    return accuracy
    

In [None]:
%%time 
# Fit the decision tree on the training split, then score it on the test split.
# The header line matches the one printed by the other evaluation cells.
print ("Testing and performance of Decision Tree")
fitted_dt = train(dt_clf)
test(fitted_dt)

In [None]:
%%time
# Fit AdaBoost on the training split, then score it on the test split.
print ("Testing and performance of Boosting")
fitted_boosting = train(boosting_clf)
test(fitted_boosting)

In [None]:
%%time
# Fit KNN on the training split, then score it on the test split.
print ("Testing and performance of KNN")
fitted_knn = train(knn_clf)
test(fitted_knn)

In [None]:
%%time
# Fit the MLP on the training split, then score it on the test split.
print ("Testing and performance of NN")
fitted_nn = train(nn_clf)
test(fitted_nn)

In [None]:
%%time
# Fit the SVC on the training split, then score it on the test split.
print ("Testing and performance of SVM")
fitted_svm = train(svm_clf)
test(fitted_svm)
