# Question 2

In [None]:
# Import libraries
import requests
import os
import folium
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
%pylab inline


First, we load the data and vectorize it using tf-idf, so that a model can later be trained on it.

In [2]:
# Loading data and vectorizing it using tf-idf
# Only the 'train' subset of the 20 newsgroups dataset is fetched
newsgroups_train = fetch_20newsgroups(subset='train')
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the documents and transform them into tf-idf features
x = vectorizer.fit_transform(newsgroups_train.data)
# Labels associated with the documents, taken from the dataset's `target` field
y = newsgroups_train.target


This function splits the data the way we want. scikit-learn already provides a function for this, but having our own lets us keep total control over what it does.

In [3]:
def split_data(x, y, ratio, seed=1):
    """
    Randomly split a dataset into two parts according to the split ratio.

    The rows are shuffled with a permutation derived from `seed`; the first
    floor(ratio * n) shuffled rows form the first part and the rest form the
    second part.

    Parameters
    ----------
    x : object indexable with an integer index array
        Feature rows; indexed as `x[indices]`.
    y : array indexable with an integer index array
        One label per row; `len(y)` defines the number of rows.
    ratio : float
        Fraction of rows placed in the first part, in [0, 1].
    seed : int, optional
        Seed of the shuffle, so that the split is reproducible.

    Returns
    -------
    x_1, x_2, y_1, y_2
        Features of the first and second part, then the matching labels.

    Raises
    ------
    ValueError
        If `ratio` is not in [0, 1] (this also rejects NaN).
    """

    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be in [0, 1], got {r}".format(r=ratio))

    # Use a private generator instead of np.random.seed(seed): it yields the
    # same permutation but leaves the global NumPy random state untouched, so
    # calling this function does not affect random draws made elsewhere.
    rng = np.random.RandomState(seed)

    # Generate random indices
    num_row = len(y)
    indices = rng.permutation(num_row)
    index_split = int(np.floor(ratio * num_row))
    index_train = indices[:index_split]
    index_test = indices[index_split:]

    # Create split
    x_1 = x[index_train]
    x_2 = x[index_test]
    y_1 = y[index_train]
    y_2 = y[index_test]

    return x_1, x_2, y_1, y_2

# 80% of the rows go to training; the remaining 20% are held out
x_train, x_test_val, y_train, y_test_val = split_data(x, y, 0.8)
# The held-out rows are halved into a test part and a validation part
# (both calls rely on the default seed=1)
x_test, x_val, y_test, y_val = split_data(x_test_val, y_test_val, 0.5)


Again, we rewrite the score function (scikit-learn already provides one) so that we can fine-tune it if needed.

In [10]:
def score(y_test_f, rf_probs_f, classes=None):
    """
    Compute and print the accuracy obtained from class probabilities.

    For each sample the column with the highest probability is taken as the
    prediction (the first such column wins on ties) and compared to the true
    label. rf.score(x_test, y_test) could be used as well, but this way we can
    fine tune this score function if needed.

    Parameters
    ----------
    y_test_f : array-like
        True labels, one per sample.
    rf_probs_f : array-like, two-dimensional
        One row of class probabilities per sample.
    classes : array-like, optional
        Label associated with each probability column (for a fitted
        scikit-learn classifier, its `classes_` attribute). When omitted, the
        column index itself is used as the predicted label, which is only
        correct if the labels are exactly 0..n_classes-1.

    Returns
    -------
    float
        Fraction of correct predictions, in [0, 1].

    Raises
    ------
    ValueError
        If there are no samples, or if the number of probability rows differs
        from the number of labels.
    """
    y_true = np.asarray(y_test_f)
    probs = np.asarray(rf_probs_f)

    if len(y_true) == 0:
        raise ValueError("score() needs at least one sample")
    if len(probs) != len(y_true):
        raise ValueError(
            "got {p} probability rows for {t} labels".format(p=len(probs), t=len(y_true))
        )

    # Looking for the maximum probability of every row at once
    best_columns = np.argmax(probs, axis=1)
    if classes is None:
        predicted = best_columns
    else:
        predicted = np.asarray(classes)[best_columns]

    # Now we can compare it to the real values
    count = int(np.count_nonzero(predicted == y_true)) / len(y_true)
    print("{a}% of correct guesses with a random forest".format(a=count*100))
    return count
    

This function will grid search through the hyperparameters in order to find the best score and the corresponding hyperparameters

In [21]:
def gridsearch_params(x_train_f, y_train_f, x_test_f, y_test_f, max_depth_range, features_range, estimators_range, seed=1):
    """
    Exhaustively search random forest hyperparameters and keep the best ones.

    One RandomForestClassifier is fitted on (x_train_f, y_train_f) for every
    combination of max_depth, max_features and n_estimators, and scored with
    `score` on (x_test_f, y_test_f). The first combination reaching the
    highest score is kept.

    Parameters
    ----------
    x_train_f, y_train_f
        Data the forests are fitted on.
    x_test_f, y_test_f
        Data the candidates are scored on.
    max_depth_range, features_range, estimators_range : iterable
        Candidate values for max_depth, max_features and n_estimators.
    seed : int, optional
        random_state given to every forest, for reproducibility.

    Returns
    -------
    best_score, best_max_depth, best_max_features, best_n_estimators
        Best score in [0, 1] and the hyperparameters that produced it.
        (0, 0, 0, 0) is returned if the grid is empty.
    """
    # Materialise the inner ranges so that one-shot iterables (e.g. generators)
    # are fully crossed with every value of the outer ranges
    features_values = list(features_range)
    estimators_values = list(estimators_range)

    # (score, max_depth, max_features, n_estimators) of the best candidate so far
    best = None

    # Grid searching through the parameters
    for max_depth in max_depth_range:
        for max_features in features_values:
            for n_estimators in estimators_values:
                rf = RandomForestClassifier(max_depth=max_depth, max_features=max_features, n_estimators=n_estimators, random_state=seed)
                rf.fit(x_train_f, y_train_f)
                rf_probs = rf.predict_proba(x_test_f)
                test_score = score(y_test_f, rf_probs)

                # The first candidate is always recorded, so a grid whose scores
                # are all 0 still reports hyperparameters that were evaluated;
                # afterwards only a strictly better score replaces the best one
                if best is None or best[0] < test_score:
                    best = (test_score, max_depth, max_features, n_estimators)

    # Nothing was evaluated: keep the historical placeholder result
    if best is None:
        best = (0, 0, 0, 0)

    best_score, best_max_depth, best_max_features, best_n_estimators = best
    print("Found a best score of {a}% for {b} max_depth, {c} max_features and {d} n_estimators".format(a=best_score*100, b=best_max_depth, c=best_max_features, d=best_n_estimators))
    return best_score, best_max_depth, best_max_features, best_n_estimators
    

Now we can start a grid search to determine a good maximum number of features. For the number of estimators and the maximum depth, higher values are generally better, but they come at a high computational cost, so each is fixed to a single value here.

In [29]:
# First grid search: coarse sweep over max_features only, with max_depth and
# n_estimators each fixed to a single value
max_depth_range = [10]
features_range = [100, 500, 800, 900, 1000, 1100, 1200]
estimators_range = [100]
# NOTE(review): candidates are scored on x_test/y_test, while the x_val/y_val
# split created earlier is not used here — consider selecting hyperparameters
# on the validation split so the test split stays untouched for the final estimate.
gridsearch_params(x_train, y_train, x_test, y_test, max_depth_range, features_range, estimators_range)


63.748894783377544% of correct guesses with a random forest
69.76127320954907% of correct guesses with a random forest
70.29177718832891% of correct guesses with a random forest
70.11494252873564% of correct guesses with a random forest
70.99911582670202% of correct guesses with a random forest
68.34659593280283% of correct guesses with a random forest
69.05393457117594% of correct guesses with a random forest
Found a best score of 70.99911582670202% for 10 max_depth, 1000 max_features and 100 n_estimators


(0.7099911582670203, 10, 1000, 100)

Now we can try on some custom hyperparameters with a very high max_depth

In [31]:
# Single configuration: max_features=1000 (the best value of the previous sweep)
# combined with a much larger max_depth than before (100 instead of 10)
max_depth_range = [100]
features_range = [1000]
estimators_range = [100]
# Each range holds one value, so exactly one forest is fitted and scored
gridsearch_params(x_train, y_train, x_test, y_test, max_depth_range, features_range, estimators_range)


82.67020335985853% of correct guesses with a random forest
Found a best score of 82.67020335985853% for 100 max_depth, 1000 max_features and 100 n_estimators


(0.8267020335985853, 100, 1000, 100)