# Question 2

In [1]:
# Import libraries
import requests
import os
import folium
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%pylab inline


Populating the interactive namespace from numpy and matplotlib


First, we load the data and vectorize it using tf-idf, so that a classifier can later learn from it.

In [2]:
# Fetch the training subset of the 20 newsgroups corpus; the labels are the
# integer targets shipped with the dataset.
newsgroups_train = fetch_20newsgroups(subset='train')
y = newsgroups_train.target

# Fit a tf-idf vectorizer on the raw posts and transform them into the
# feature matrix used by every later cell.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(newsgroups_train.data)


This function lets us split the data as we want. The scikit-learn library already provides a function for this, but having our own lets us keep total control over what it does.

In [3]:
def split_data(x, y, ratio, seed=1):
    """
    Randomly split a dataset into two parts according to the split ratio.

    The rows are shuffled with a permutation drawn from a private generator
    seeded with `seed`, so a given seed always produces the same split and
    NumPy's global random state is left untouched.

    Parameters
    ----------
    x : object indexable by an integer index array
        Samples, one row per entry of `y`.
    y : object with a length, indexable by an integer index array
        Labels; `len(y)` defines the number of rows that are split.
    ratio : float
        Fraction of the rows, in [0, 1], assigned to the first part.
    seed : int, optional
        Seed of the shuffle (default 1).

    Returns
    -------
    x_1, x_2, y_1, y_2
        Samples of the first part, samples of the second part, labels of the
        first part, labels of the second part. The first part holds
        floor(ratio * len(y)) rows.

    Raises
    ------
    ValueError
        If `ratio` is not in [0, 1].
    """
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be in [0, 1], got {r}".format(r=ratio))

    # A dedicated RandomState yields the same permutation as calling
    # np.random.seed(seed) followed by np.random.permutation, but without
    # reseeding the global generator behind the caller's back.
    rng = np.random.RandomState(seed)

    # Generate random indices
    num_row = len(y)
    indices = rng.permutation(num_row)
    index_split = int(np.floor(ratio * num_row))
    index_train = indices[:index_split]
    index_test = indices[index_split:]

    # Create split
    x_1 = x[index_train]
    x_2 = x[index_test]
    y_1 = y[index_train]
    y_2 = y[index_test]

    return x_1, x_2, y_1, y_2

# 80% of the rows go to training; the remaining 20% are halved into a test
# part and a validation part (both calls use split_data's default seed).
x_train, x_test_val, y_train, y_test_val = split_data(x, y, ratio=0.8)
x_test, x_val, y_test, y_val = split_data(x_test_val, y_test_val, ratio=0.5)


Again, the score function is rewritten here (it already exists in the scikit-learn library), so that we can fine-tune it if we need to.

In [4]:
def score(y_test_f, rf_probs_f):
    """
    Compute and print the accuracy obtained from per-class probabilities.

    For every sample the predicted class is the index of the highest
    probability (the first such index on ties); it is compared with the true
    label at the same position.

    rf.score(x_test, y_test) could be used as well, but this way we can fine
    tune this score function if needed.

    Parameters
    ----------
    y_test_f : array-like of shape (n_samples,)
        True labels, expressed as column indices of `rf_probs_f`.
    rf_probs_f : array-like of shape (n_samples, n_classes)
        Probabilities, one row per sample.

    Returns
    -------
    count : float
        Fraction of correct predictions, in [0, 1].
    rf_pred : numpy.ndarray of float, shape (n_samples,)
        Predicted class index of every sample (kept as floats, as before).

    Raises
    ------
    ValueError
        If there are no samples, or if `rf_probs_f` is not a 2-D array with
        one row per label.
    """
    y_true = np.asarray(y_test_f)
    probs = np.asarray(rf_probs_f)
    n_samples = len(y_true)

    if n_samples == 0:
        raise ValueError("score() needs at least one sample")
    if probs.ndim != 2 or probs.shape[0] != n_samples:
        raise ValueError(
            "rf_probs_f must have shape (n_samples, n_classes) with one row per label; "
            "got shape {s} for {n} labels".format(s=probs.shape, n=n_samples)
        )

    # np.argmax returns the first index of the maximum, which matches a
    # left-to-right scan that only replaces the best on a strictly larger value.
    rf_pred = np.argmax(probs, axis=1).astype(float)

    # Integer count divided by an integer length: plain Python true division.
    count = int(np.count_nonzero(rf_pred == y_true)) / n_samples
    print("{a}% of correct guesses with a random forest".format(a=count*100))
    return count, rf_pred
    

This function will grid search through the hyperparameters in order to find the best score and the corresponding hyperparameters

In [5]:
def gridsearch_params(x_train_f, y_train_f, x_test_f, y_test_f, max_depth_range, features_range, estimators_range, seed=1):
    """
    Exhaustively search random forest hyperparameters and keep the best combination.

    For every combination of the three ranges, a RandomForestClassifier is
    fitted on the training data and evaluated with `score` on the evaluation
    data. The combination with the highest score is kept; on ties the
    combination evaluated first wins.

    Parameters
    ----------
    x_train_f, y_train_f
        Samples and labels used to fit each forest.
    x_test_f, y_test_f
        Samples and labels used to evaluate each forest.
    max_depth_range : iterable
        Values tried for `max_depth`.
    features_range : iterable
        Values tried for `max_features`.
    estimators_range : iterable
        Values tried for `n_estimators`.
    seed : int, optional
        Passed as `random_state` to every forest (default 1).

    Returns
    -------
    best_pred, best_score, best_max_depth, best_max_features, best_n_estimators
        Predictions and score of the best forest, followed by its three
        hyperparameters. If no combination is evaluated (an empty range),
        the predictions are all zeros and the other values are 0.
    """
    best_pred = np.zeros(len(y_test_f))
    best_score = 0
    best_max_depth = 0
    best_max_features = 0
    best_n_estimators = 0
    # Tracks whether a real combination has been recorded yet, so the zero
    # placeholders above are never reported when at least one forest was fitted
    # (previously they were returned if every forest scored exactly 0).
    evaluated = False

    # Grid searching through the parameters
    for max_depth in max_depth_range:
        for max_features in features_range:
            for n_estimators in estimators_range:
                rf = RandomForestClassifier(max_depth=max_depth, max_features=max_features, n_estimators=n_estimators, random_state=seed)
                rf.fit(x_train_f, y_train_f)
                rf_probs = rf.predict_proba(x_test_f)
                test_score, rf_pred = score(y_test_f, rf_probs)

                # Finding if this is the best score yet
                if not evaluated or best_score < test_score:
                    evaluated = True
                    best_pred = rf_pred
                    best_score = test_score
                    best_max_depth = max_depth
                    best_max_features = max_features
                    best_n_estimators = n_estimators

    print("Found a best score of {a}% for {b} max_depth, {c} max_features and {d} n_estimators".format(a=best_score*100, b=best_max_depth, c=best_max_features, d=best_n_estimators))
    return best_pred, best_score, best_max_depth, best_max_features, best_n_estimators
    

Now we can start a grid search to determine a good maximum number of features (for the estimators range and the maximum depth range, the higher the better, but this comes at a high computational cost).

In [6]:
# First grid search: looking for something globally, studying for one estimator range and one depth
max_depth_range = [10]
features_range = [100, 500, 800, 900, 1000, 1100, 1200]
estimators_range = [100]
gridsearch_params(x_train, y_train, x_test, y_test, max_depth_range, features_range, estimators_range)


63.748894783377544% of correct guesses with a random forest
69.76127320954907% of correct guesses with a random forest
70.29177718832891% of correct guesses with a random forest
70.11494252873564% of correct guesses with a random forest
70.99911582670202% of correct guesses with a random forest
68.34659593280283% of correct guesses with a random forest
69.05393457117594% of correct guesses with a random forest
Found a best score of 70.99911582670202% for 10 max_depth, 1000 max_features and 100 n_estimators


(array([ 18.,   8.,  16., ...,  10.,   6.,   6.]),
 0.7099911582670203,
 10,
 1000,
 100)

Now we can try on some custom hyperparameters with a very high max_depth, using the optimal feature range we found before

In [7]:
# With good parameters, using 1000 max number of features
max_depth_range = [100]
features_range = [1000]
estimators_range = [100]
best_pred, best_score, best_max_depth, best_max_features, best_n_estimators = gridsearch_params(x_train, y_train, x_test, y_test, max_depth_range, features_range, estimators_range)


82.67020335985853% of correct guesses with a random forest
Found a best score of 82.67020335985853% for 100 max_depth, 1000 max_features and 100 n_estimators


So we finally have around 83% of matches, which is a decent amount given that we are working with text.
With the prediction we just computed, we can easily build the confusion matrix.

In [9]:
# Print the legend mapping every label index to its newsgroup name, then the
# confusion matrix of the test labels against the best predictions.
# The number of labels is taken from the dataset rather than hard-coded, so
# the legend and the matrix rows/columns always cover the same classes.
print("Labels:")
for i, target_name in enumerate(newsgroups_train.target_names):
    print("{a}: ".format(a=i) + target_name)
print("\n")
print(confusion_matrix(y_test, best_pred, labels=list(range(len(newsgroups_train.target_names)))))


Labels:
0: alt.atheism
1: comp.graphics
2: comp.os.ms-windows.misc
3: comp.sys.ibm.pc.hardware
4: comp.sys.mac.hardware
5: comp.windows.x
6: misc.forsale
7: rec.autos
8: rec.motorcycles
9: rec.sport.baseball
10: rec.sport.hockey
11: sci.crypt
12: sci.electronics
13: sci.med
14: sci.space
15: soc.religion.christian
16: talk.politics.guns
17: talk.politics.mideast
18: talk.politics.misc
19: talk.religion.misc


[[44  0  0  0  0  0  0  0  0  0  0  0  1  0  0 10  0  0  1  2]
 [ 0 30  8  3  4  5  2  0  0  1  0  0  2  1  1  0  0  0  1  0]
 [ 0  4 40  2  0  1  1  0  0  0  0  0  1  0  1  0  0  0  1  0]
 [ 0  1  7 45  3  1  4  1  1  0  0  2  1  0  0  0  0  0  1  0]
 [ 0  2  0  3 43  0  2  0  0  0  0  0  0  0  1  0  0  0  0  0]
 [ 0  2  1  0  0 53  0  0  0  1  0  0  2  0  0  0  0  0  0  0]
 [ 0  1  0  1  0  0 47  1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  3  0  0  1 51  3  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  2  0  0  1  0  1  1 61  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  1  0  0  0  1 

So this confusion matrix clearly shows that the system does a good job at classifying the articles.
If we study the last case (last row of the matrix), which is talk.religion.misc (label 19), we can see that there are 17 correct matches, and 11 matches with soc.religion.christian (label 15), which totally makes sense since those two subjects are extremely close (so the system has a harder time distinguishing them).