<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 60px">

# Project 3 : Web APIs & Classification (Chrome vs. Edge)

## Problem Statement

## Table of Contents

## Background

### Imports

In [183]:
# Web scraping
from math import ceil
from random import randint
import requests as rq
import time

# Data manipulation
import pandas as pd

# Data preprocessing
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import regex as re

# Model construction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Model validation
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Data visualization
from matplotlib import pyplot as pp
import seaborn as sb

The subreddits of interest are those of two web browsers: **Google Chrome (r/chrome)** and **Microsoft Edge (r/edge)**.

In [3]:
# Subreddits to scrape: r/chrome (Google Chrome) and r/edge (Microsoft Edge).
subreddits = [
    "chrome",
    "edge",
]

Create a helper function to facilitate our subreddit scraping.

In [4]:
def scrape_subreddit(subreddit, 
                     total_posts=1000, 
                     limit=100, 
                     after=None, 
                     user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"):
    """
    Scrape subreddit posts through Reddit API and save the result in a CSV file.

    The posts collected so far are re-exported after every request, so a partial
    result is available on disk even if a later request fails.
    
    Parameters
    ----------
    subreddit : str
        A subreddit name.
    total_posts: int
        The (maximum) total number of posts to scrape.
    limit: int
        The number of posts to scrape per each request (maximum: 100).
    after: str
        A Reddit post ID to scrape after.
    user_agent: str
        A user agent string needed for Reddit API requests.
        
    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `limit` is smaller than 1.
    SystemExit
        If a request fails (connection error, timeout, or HTTP error status).
    """

    if limit < 1:
        raise ValueError(f"limit must be at least 1, got {limit}")

    # Create from the input subreddit the corresponding url to scrape and the output CSV file path. 
    url = f"https://www.reddit.com/r/{subreddit}.json"
    csv_path = f"../data/{subreddit}.csv"
    
    # Create a list to store scraped posts.
    all_posts = []

    # Number of requests needed to reach the requested number of posts.
    n_requests = ceil(total_posts / limit)
    
    print(f"Started scraping : {url}")

    # Loop to collect posts from each request to the Reddit API.
    for request_no in range(n_requests):
        
        print(f"... after {after}")
        
        # Make a request to the Reddit API.
        try:
            response = rq.get(url, 
                              params={"limit": limit, "after": after}, 
                              headers={"user-agent": user_agent}, 
                              timeout=30)
            # Fail loudly on HTTP errors (e.g. 429 Too Many Requests) instead of
            # trying to parse an error page as a listing.
            response.raise_for_status()
        except rq.exceptions.RequestException as error:
            raise SystemExit(error)

        # Get the listing payload in the form of Python dictionary.
        listing = response.json()["data"]
        
        # Collect the post details and append them to the existing posts.
        all_posts.extend(child["data"] for child in listing["children"])
        
        # Export all collected posts to the output CSV file, overwriting the previous if any.
        pd.DataFrame(all_posts).to_csv(csv_path, index=False)

        # Get the post ID to continue scraping in the next request.
        after = listing["after"]

        # No `after` means the listing is exhausted. Requesting again would drop the
        # (None-valued) `after` parameter and re-scrape the first page as duplicates.
        if after is None:
            print("... no more posts available")
            break

        # Generate a random sleep duration to look more natural (not needed after the last request).
        if request_no < n_requests - 1:
            time.sleep(randint(10, 30))
    
    print(f"Finished scraping : {csv_path}")

Perform Reddit API scraping.

Load the scraped results into DataFrames.

In [5]:
# Load the r/chrome posts from the CSV path that scrape_subreddit("chrome") writes to.
chrome = pd.read_csv("../data/chrome.csv")

In [6]:
# Load the r/edge posts from the CSV path that scrape_subreddit("edge") writes to.
edge = pd.read_csv("../data/edge.csv")

In [7]:
chrome.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,num_crossposts,media,is_video,is_gallery,media_metadata,gallery_data,crosspost_parent_list,crosspost_parent,author_cakeday,poll_data
0,,chrome,,t2_1agc3zqz,False,,0,False,"US Sentate is trying to undermine encryption, ...",[],...,0,,False,,,,,,,
1,,chrome,,t2_20nrkysg,False,,0,False,Gotta have all the chrome's,[],...,0,,False,,,,,,,
2,,chrome,,t2_lyigt,False,,0,False,My Chrome volume keeps resetting to 100% (I ke...,[],...,0,,False,,,,,,,
3,,chrome,"I don't yet mind the Reading List, but I sure ...",t2_dtcav,False,,0,False,"I don't want to disable ""📚 Reading List"" but I...",[],...,0,,False,,,,,,,
4,,chrome,,t2_5y032o51,False,,0,False,"Our service, WIRE EMOTION is very simple, it u...",[],...,0,,False,,,,,,,


In [8]:
edge.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,subreddit_subscribers,created_utc,num_crossposts,media,is_video,media_metadata,author_cakeday,is_gallery,gallery_data,poll_data
0,,edge,,t2_5956yded,False,,0,False,Dev Channel update to 90.0.818.0 is going live...,[],...,5645,1615318000.0,0,,False,,,,,
1,,edge,,t2_5956yded,False,,0,False,Dev channel update to 91.0.825.0 is going live...,[],...,5645,1615923000.0,0,,False,,,,,
2,,edge,This is how you can enable this option: Once t...,t2_1n8c9tww,False,,0,False,It seems that Microsoft secretly added an opti...,[],...,5645,1616280000.0,1,,False,"{'z3s57apu1bo61': {'status': 'valid', 'e': 'An...",,,,
3,,edge,,t2_1p0kp2ms,False,,0,False,Long time problem with edge on iOS. Opening a ...,[],...,5645,1616316000.0,0,,False,,,,,
4,,edge,Just opened edge and none of the images on any...,t2_vewj208,False,,0,False,Images not loading,[],...,5645,1616321000.0,0,,False,,,,,


Remove (completely) empty and duplicate rows.

In [9]:
# Drop rows in which every column is missing, then rows that exactly duplicate an earlier row.
# NOTE(review): both calls mutate `chrome` in place, so the frame previewed earlier is not preserved.
chrome.dropna(how="all", inplace=True)
chrome.drop_duplicates(inplace=True)

In [10]:
# Drop rows in which every column is missing, then rows that exactly duplicate an earlier row.
# NOTE(review): both calls mutate `edge` in place, so the frame previewed earlier is not preserved.
edge.dropna(how="all", inplace=True)
edge.drop_duplicates(inplace=True)

In [11]:
chrome.shape

(998, 115)

In [12]:
edge.shape

(933, 111)

In [13]:
# Columns present in the Google Chrome data but absent from the Microsoft Edge data,
# listed in their Chrome column order.
chrome.columns[~chrome.columns.isin(edge.columns)].tolist()

['thumbnail_height', 'thumbnail_width', 'post_hint', 'preview']

Despite a great number of columns, we are only interested in the **title** and **body text** of each post. We are going to concatenate both columns into a new `text` column. Also, we will label posts from the Google Chrome subreddit as `0` and posts from the Microsoft Edge subreddit as `1` in a `source` column. This helps in our classifier model construction later.

In [14]:
# Add the model input (`text`: title and body joined by a space, missing parts treated as "")
# and the class label (`source`: 0 for r/chrome).
chrome = chrome.assign(
    text=chrome["title"].str.cat(chrome["selftext"], sep=" ", na_rep=""),
    source=0,
)

In [15]:
# Add the model input (`text`: title and body joined by a space, missing parts treated as "")
# and the class label (`source`: 1 for r/edge).
edge = edge.assign(
    text=edge["title"].str.cat(edge["selftext"], sep=" ", na_rep=""),
    source=1,
)

In [16]:
# Stack the Chrome posts on top of the Edge posts to form the features (X) and labels (y).
# `ignore_index=True` gives the combined Series a fresh 0..n-1 index; without it the
# row labels of the two frames overlap, which makes label-based lookups and
# index alignment on X / y ambiguous.
X = pd.concat([chrome["text"], edge["text"]], axis="index", ignore_index=True)
y = pd.concat([chrome["source"], edge["source"]], axis="index", ignore_index=True)

In [17]:
X.head()

0    US Sentate is trying to undermine encryption, ...
1                         Gotta have all the chrome's 
2    My Chrome volume keeps resetting to 100% (I ke...
3    I don't want to disable "📚 Reading List" but I...
4    Our service, WIRE EMOTION is very simple, it u...
Name: text, dtype: object

In [18]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: source, dtype: int64

Create a helper function to transform raw text into a single string of stemmed words.

In [19]:
# Stopword set and stemmer shared by all calls to `to_stems`; filled on first use
# so the NLTK stopword corpus is read once instead of once per document.
_TO_STEMS_RESOURCES = {}


def to_stems(text):
    """
    Convert a raw text to a string of stemmed words.

    Web urls and every non-alphabet character are replaced by spaces, the text is
    lowercased and split into words, stopwords (plus the taboo words "chrome",
    "edge", "google", "microsoft") are dropped, and the remaining words are
    Porter-stemmed.
    
    Parameters
    ----------
    text : str
        A raw text.

    Returns
    -------
    str
        A string of stemmed words separated by single spaces.
    """

    if not _TO_STEMS_RESOURCES:
        # Define a set of stopwords (words without any significant meanings).
        # Taboo words which can easily differentiate the two subreddits are also added.
        _TO_STEMS_RESOURCES["stops"] = set(stopwords.words("english") + ["chrome", "edge", "google", "microsoft"])
        _TO_STEMS_RESOURCES["stemmer"] = PorterStemmer()
    stops = _TO_STEMS_RESOURCES["stops"]
    stemmer = _TO_STEMS_RESOURCES["stemmer"]
    
    # Remove web urls first, so that their fragments do not survive as words.
    text = re.sub(r"https?:[\w\-\;./?#&+=]+", " ", text)
    
    # Remove non-alphabet characters (numbers, punctuation, newlines, ...).
    # The range must be spelled A-Za-z: "A-z" would also keep [ \ ] ^ _ and `.
    text = re.sub(r"[^A-Za-z]", " ", text)
    
    # Convert to lowercase and split into individual words; split() already
    # discards leading, trailing, and repeated whitespace.
    words = text.lower().split()
    
    # Remove stopwords (including taboo words), then stem each word to find commonality.
    stems = [stemmer.stem(word) for word in words if word not in stops]
    
    # Join a list of stemmed words into a long string separated by space.
    return " ".join(stems)

In [20]:
# Replace every raw post text with its cleaned, stemmed form.
# Re-running this cell stems the already-stemmed text again; rebuild X first.
X = X.map(to_stems)

In [21]:
X.head()

0    us sentat tri undermin encrypt tell congress o...
1                                                gotta
2    volum keep reset keep absolut noth blast speak...
3    want disabl read list want get rid read list s...
4    servic wire emot simpl use share screen featur...
Name: text, dtype: object

In [202]:
# Hold out 20% of the posts for testing. Stratifying on `y` keeps the class balance
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [203]:
y_train.value_counts(normalize=True)

0    0.516839
1    0.483161
Name: source, dtype: float64

In [204]:
y_test.value_counts(normalize=True)

0    0.516796
1    0.483204
Name: source, dtype: float64

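Search over the following values of hyperparameters:
- Maximum document frequency (tokens appearing in a larger share of documents are ignored): 10%, 25%, 50%, 75%, 90%
- Maximum number of features: 1000, 2000, 3000, 4000, 5000
- Minimum document frequency (tokens must appear in at least this many documents): 2, 3, 4
- N-gram range: unigrams only, unigrams to bigrams, unigrams to trigrams
- Smoothing parameter `alpha` (naive Bayes) or inverse regularization strength `C` (logistic regression): 0.0001, 0.001, 0.01, 0.1, 1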

In [205]:
# Candidate text vectorizers. Both are searched over the same hyperparameter values;
# only the pipeline step name used as the parameter prefix differs
# ("cv" for the count vectorizer, "tv" for the TF-IDF vectorizer).
transformers = [
    {"step": (name, vectorizer),
     "params": {f"{name}__max_df": [0.10, 0.25, 0.50, 0.75, 0.90],
                f"{name}__max_features": [1000, 2000, 3000, 4000, 5000],
                f"{name}__min_df": [2, 3, 4],
                f"{name}__ngram_range": [(1, 1), (1, 2), (1, 3)]}}
    for name, vectorizer in (("cv", CountVectorizer()), ("tv", TfidfVectorizer()))
]

In [206]:
# Candidate classifiers, each paired with the hyperparameter values to search.
estimators = [
    # Naive Bayes.
    dict(step=("nb", MultinomialNB()),
         params={"nb__alpha": [0.0001, 0.001, 0.01, 0.1, 1]}),
    # Logistic regression.
    dict(step=("lr", LogisticRegression()),
         params={"lr__C": [0.0001, 0.001, 0.01, 0.1, 1],
                 "lr__max_iter": [1000]}),
]

In [207]:
def make_grid(transformer, estimator, cv=5, **search_kwargs):
    """
    Build a grid search over a two-step (vectorizer -> classifier) pipeline.

    Parameters
    ----------
    transformer : dict
        A dictionary with a "step" entry (a `(name, vectorizer)` tuple) and a
        "params" entry (hyperparameter values keyed as `name__parameter`).
    estimator : dict
        A dictionary of the same layout describing the classifier.
    cv : int
        The number of cross-validation folds (default: 5).
    **search_kwargs
        Any further keyword arguments are passed on to `GridSearchCV`
        (e.g. `n_jobs=-1` to run the fits in parallel).

    Returns
    -------
    GridSearchCV
        An unfitted grid search over the combined hyperparameter values.
    """
    pipe = Pipeline([transformer["step"], estimator["step"]])

    # The step-name prefixes keep the two parameter dictionaries from colliding.
    params = {**transformer["params"], **estimator["params"]}

    return GridSearchCV(pipe, param_grid=params, cv=cv, **search_kwargs)

In [208]:
# Grid search over the first vectorizer (count vectorizer) paired with the first
# classifier (naive Bayes).
# NOTE(review): the recorded `best_params_` output further down lists `lr__*` keys, i.e. it
# was produced with the logistic regression estimator; re-run the cells below so that the
# outputs match this selection.
grid = make_grid(transformer=transformers[0], estimator=estimators[0])

In [None]:
grid.fit(X_train, y_train)

In [187]:
grid.score(X_train, y_train)

0.918825561312608

In [188]:
grid.score(X_test, y_test)

0.6701030927835051

In [189]:
grid.best_params_

{'cv__max_df': 0.25,
 'cv__max_features': 5000,
 'cv__min_df': 2,
 'cv__ngram_range': (1, 2),
 'lr__C': 0.1,
 'lr__max_iter': 1000}

In [190]:
grid.best_score_

0.7081354135612309

In [191]:
# NOTE(review): X includes the training rows the grid was fitted on, so this score is not an
# out-of-sample estimate; grid.score(X_test, y_test) is the held-out figure.
grid.score(X, y)

0.893837389953392