### Imports

In [1]:
print(f"Hello, {str(None)}!")

Hello, None!


In [42]:
# Web scraping
from math import ceil
from random import randint
import requests as rq
import time

# Data manipulation
import pandas as pd

# Data preprocessing
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import regex as re

# Modeling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_nb import MultinomialNB
from sklearn.pipeline import Pipeline

# Data visualization
from matplotlib import pyplot as pp
import seaborn as sb

In [3]:
subreddits = ["chrome", "edge"]

In [4]:
def scrape_subreddit(subreddit, 
                     total_posts=1000, 
                     limit=100, 
                     after=None, 
                     user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"):

    url = f"https://www.reddit.com/r/{subreddit}.json"
    csv = f"../data/{subreddit}.csv"
    
    all_posts = []
    
    print(f"Started scraping : {url}")

    for _ in range(ceil(total_posts / limit)):
        
        print(f"... after {after}")
        
        try:
            response = rq.get(url, 
                              params={"limit": limit, "after": after}, 
                              headers={"user-agent": user_agent})
        except rq.exceptions.RequestException as error:
            raise SystemExit(error)

        json = response.json()
        new_posts = [post["data"] for post in json["data"]["children"]]
        all_posts.extend(new_posts)
        pd.DataFrame(all_posts).to_csv(csv, index=False)

        # Generate a random sleep duration to look more natural.
        time.sleep(randint(10, 30))
        
        after = json["data"]["after"]
    
    print(f"Finished scraping : {csv}")

In [5]:
chrome = pd.read_csv("../data/chrome.csv")

In [6]:
edge = pd.read_csv("../data/edge.csv")

In [7]:
chrome.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,num_crossposts,media,is_video,is_gallery,media_metadata,gallery_data,crosspost_parent_list,crosspost_parent,author_cakeday,poll_data
0,,chrome,,t2_1agc3zqz,False,,0,False,"US Sentate is trying to undermine encryption, ...",[],...,0,,False,,,,,,,
1,,chrome,,t2_20nrkysg,False,,0,False,Gotta have all the chrome's,[],...,0,,False,,,,,,,
2,,chrome,,t2_lyigt,False,,0,False,My Chrome volume keeps resetting to 100% (I ke...,[],...,0,,False,,,,,,,
3,,chrome,"I don't yet mind the Reading List, but I sure ...",t2_dtcav,False,,0,False,"I don't want to disable ""ðŸ“š Reading List"" but I...",[],...,0,,False,,,,,,,
4,,chrome,,t2_5y032o51,False,,0,False,"Our service, WIRE EMOTION is very simple, it u...",[],...,0,,False,,,,,,,


In [8]:
edge.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,subreddit_subscribers,created_utc,num_crossposts,media,is_video,media_metadata,author_cakeday,is_gallery,gallery_data,poll_data
0,,edge,,t2_5956yded,False,,0,False,Dev Channel update to 90.0.818.0 is going live...,[],...,5645,1615318000.0,0,,False,,,,,
1,,edge,,t2_5956yded,False,,0,False,Dev channel update to 91.0.825.0 is going live...,[],...,5645,1615923000.0,0,,False,,,,,
2,,edge,This is how you can enable this option: Once t...,t2_1n8c9tww,False,,0,False,It seems that Microsoft secretly added an opti...,[],...,5645,1616280000.0,1,,False,"{'z3s57apu1bo61': {'status': 'valid', 'e': 'An...",,,,
3,,edge,,t2_1p0kp2ms,False,,0,False,Long time problem with edge on iOS. Opening a ...,[],...,5645,1616316000.0,0,,False,,,,,
4,,edge,Just opened edge and none of the images on any...,t2_vewj208,False,,0,False,Images not loading,[],...,5645,1616321000.0,0,,False,,,,,


In [9]:
chrome.dropna(how="all", inplace=True)
chrome.drop_duplicates(inplace=True)

In [10]:
edge.dropna(how="all", inplace=True)
edge.drop_duplicates(inplace=True)

In [11]:
chrome.shape

(998, 115)

In [12]:
edge.shape

(933, 111)

In [13]:
[col for col in chrome.columns if col not in edge.columns]

['thumbnail_height', 'thumbnail_width', 'post_hint', 'preview']

In [14]:
chrome["text"] = chrome["title"].str.cat(chrome["selftext"], sep=" ", na_rep="")
chrome["source"] = 0

In [15]:
edge["text"] = edge["title"].str.cat(edge["selftext"], sep=" ", na_rep="")
edge["source"] = 1

In [16]:
X = pd.concat([chrome["text"], edge["text"]], axis="index")
y = pd.concat([chrome["source"], edge["source"]], axis="index")

In [17]:
X.head()

0    US Sentate is trying to undermine encryption, ...
1                         Gotta have all the chrome's 
2    My Chrome volume keeps resetting to 100% (I ke...
3    I don't want to disable "ðŸ“š Reading List" but I...
4    Our service, WIRE EMOTION is very simple, it u...
Name: text, dtype: object

In [18]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: source, dtype: int64

In [19]:
def to_stems(text):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    
    # 2. Remove non-letters.
    text = re.sub("https?:[\w\-\;./?#&+=]+", " ", text)
    text = re.sub("[\n\r]", " ", text)
    text = re.sub("[^A-z]", " ", text)
    text = re.sub("\s+", " ", text)
    
    # 3. Convert to lower case, split into individual words.
    words = text.strip().lower().split()
    
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stopwords to a set.
    stops = set(stopwords.words("english") + ["chrome", "edge", "google", "microsoft"])
    
    # 5. Remove stopwords.
    words = [word for word in words if word not in stops]
    
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in words]
    
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return " ".join(stems)

In [20]:
X = X.apply(to_stems)

In [21]:
X.head()

0    us sentat tri undermin encrypt tell congress o...
1                                                gotta
2    volum keep reset keep absolut noth blast speak...
3    want disabl read list want get rid read list s...
4    servic wire emot simpl use share screen featur...
Name: text, dtype: object

In [22]:
y.value_counts(normalize=True)

0    0.516831
1    0.483169
Name: source, dtype: float64

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

In [None]:
pipe = make_pipeline()

In [24]:
cvec = CountVectorizer()
tvec = TfidfVectorizer()

In [25]:
cvec.fit(X_train)
tvec.fit(X_train)

TfidfVectorizer()

In [26]:
X_train_cvec = cvec.transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [27]:
X_train_tvec = tvec.transform(X_train)
X_test_tvec = tvec.transform(X_test)

In [28]:
lr = LogisticRegression()

In [29]:
lr.fit(X_train_cvec, y_train)

LogisticRegression()

In [30]:
lr.score(X_train_cvec, y_train)

0.9820441988950276

In [31]:
lr.score(X_test_cvec, y_test)

0.6749482401656315

In [32]:
lr.fit(X_train_tvec, y_train)

LogisticRegression()

In [33]:
lr.score(X_train_tvec, y_train)

0.8819060773480663

In [34]:
lr.score(X_test_tvec, y_test)

0.6977225672877847

In [35]:
nb = MultinomialNB()

In [36]:
nb.fit(X_train_cvec, y_train)

MultinomialNB()

In [37]:
nb.score(X_train_cvec, y_train)

0.8832872928176796

In [38]:
nb.score(X_test_cvec, y_test)

0.6749482401656315

In [39]:
nb.fit(X_train_tvec, y_train)

MultinomialNB()

In [40]:
nb.score(X_train_tvec, y_train)

0.888121546961326

In [41]:
nb.score(X_test_tvec, y_test)

0.6956521739130435