## Import libraries

In [1]:
import requests
import time
import datetime
import pandas as pd
from nltk.corpus import wordnet
import nltk
#nltk.download('wordnet')
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

## Scraping

In [2]:
# Unix-epoch bounds of the one-year scraping window.
# The query below sorts/filters on `created_utc`, so each bound is taken at
# midnight UTC. (time.mktime would read the date in the machine's local
# timezone and shift the window by the local UTC offset, which makes the
# scrape depend on where the notebook is run.)
def _utc_midnight_epoch(day):
    """Return the Unix timestamp (whole seconds) of 00:00 UTC on `day`."""
    midnight = datetime.datetime(day.year, day.month, day.day,
                                 tzinfo=datetime.timezone.utc)
    return int(midnight.timestamp())


d_end = datetime.date(2019, 7, 17)
unixtime_end = _utc_midnight_epoch(d_end)
d_start = datetime.date(2018, 7, 17)
unixtime_start = _utc_midnight_epoch(d_start)

In [3]:
# Pushshift submission search for r/singapore, newest first, limited to the
# [unixtime_start, unixtime_end] window and a page size of 1000.
sg_url = (
    "https://api.pushshift.io/reddit/search/submission/"
    "?subreddit=singapore&sort=desc&sort_type=created_utc"
    "&after={}&before={}&size=1000"
).format(unixtime_start, unixtime_end)

In [4]:
# Pushshift submission search for r/japan, newest first, limited to the
# [unixtime_start, unixtime_end] window and a page size of 1000.
jp_url = (
    "https://api.pushshift.io/reddit/search/submission/"
    "?subreddit=japan&sort=desc&sort_type=created_utc"
    "&after={}&before={}&size=1000"
).format(unixtime_start, unixtime_end)

In [5]:
class url_prep:
    """Fetch a JSON endpoint and keep the decoded payload on the instance.

    Attributes set by ``request()``:
        json: the decoded JSON body of the response.
        key:  sorted list of the body's top-level keys.
        len:  number of entries stored under the body's ``"data"`` key.
    """

    def __init__(self, url):
        self.url = url

    def request(self, timeout=30):
        """GET ``self.url`` and populate ``json``, ``key`` and ``len``.

        Args:
            timeout: seconds to wait for the server before giving up, so a
                stalled connection cannot hang the notebook indefinitely.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
            KeyError: if the decoded body has no ``"data"`` key.
        """
        headers = {"user-agent": "Bleep blorp bot 0.2"}
        url_res = requests.get(self.url, headers=headers, timeout=timeout)
        # Fail loudly on an error status instead of trying to decode an
        # error page as JSON.
        url_res.raise_for_status()
        # Decode the body once and reuse it for every attribute.
        payload = url_res.json()
        self.json = payload
        self.key = sorted(payload.keys())
        self.len = len(payload["data"])

In [6]:
# One scraper object per subreddit endpoint (nothing is fetched yet).
sg, jp = map(url_prep, (sg_url, jp_url))

In [7]:
# Fetch both endpoints, then report each payload's top-level keys and the
# number of posts it returned.
countries = [sg, jp]
for scraper in countries:
    scraper.request()
    print(scraper.key, scraper.len, sep="\n")

['data']
1000
['data']
1000


In [8]:
# Peek at the first Singapore submission to see which fields a post carries.
first_sg_post = sg.json["data"][0]
first_sg_post.keys()

dict_keys(['all_awardings', 'allow_live_comments', 'author', 'author_flair_css_class', 'author_flair_richtext', 'author_flair_template_id', 'author_flair_text', 'author_flair_text_color', 'author_flair_type', 'author_fullname', 'author_patreon_flair', 'can_mod_post', 'contest_mode', 'created_utc', 'domain', 'full_link', 'gildings', 'id', 'is_crosspostable', 'is_meta', 'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video', 'link_flair_background_color', 'link_flair_richtext', 'link_flair_text_color', 'link_flair_type', 'locked', 'media_only', 'no_follow', 'num_comments', 'num_crossposts', 'over_18', 'parent_whitelist_status', 'permalink', 'pinned', 'pwls', 'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler', 'stickied', 'subreddit', 'subreddit_id', 'subreddit_subscribers', 'subreddit_type', 'thumbnail', 'title', 'total_awards_received', 'updated_utc', 'url', 'whitelist_status', 'wls'])

## Data Cleaning

In [9]:
def isEnglish(s):
    """Label a string "English" if it is pure ASCII, else "Non English".

    This is a character-set heuristic, not language detection: any
    non-ASCII character (accents, emoji, CJK text, ...) yields
    "Non English".
    """
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        # Decoding as ASCII fails as soon as one byte is outside 0-127.
        utf8_bytes.decode('ascii')
    except UnicodeDecodeError:
        return "Non English"
    return "English"

In [10]:
# For each subreddit, collect title / subreddit / language label per post and
# keep only the posts whose title is labelled "English".
# The frame is built once per country, after the lists are complete; rebuilding
# and re-filtering it on every post is quadratic work and leaves `data_posts`
# undefined when a country returned no posts.
for country in countries:
    posts = country.json["data"][:country.len]
    country.title_data = [post["title"] for post in posts]
    country.subreddit_data = [post["subreddit"] for post in posts]
    country.type_data = [isEnglish(title) for title in country.title_data]

    all_posts = pd.DataFrame(
        {
            "title": country.title_data,
            "subreddit": country.subreddit_data,
            "type": country.type_data,
        },
        columns=["title", "subreddit", "type"],
    )
    # The boolean filter keeps the original row labels, so the index has gaps
    # where non-English titles were dropped.
    country.data_posts = all_posts[all_posts["type"] != 'Non English']

In [11]:
# Rows x columns left per subreddit after the English-only filter
# (Japan first, then Singapore).
for posts in (jp.data_posts, sg.data_posts):
    print(posts.shape)

(896, 3)
(923, 3)


In [12]:
# Stack the Japan posts on top of the Singapore posts; each frame keeps its
# own row labels here, so the combined index contains duplicates.
frames = [scraper.data_posts for scraper in (jp, sg)]
data_finalposts = pd.concat(frames)

In [13]:
# Preview only the first rows instead of rendering the whole frame.
data_finalposts.head(10)

Unnamed: 0,title,subreddit,type
0,This coin laundry in Tokyo really doesn't want...,japan,English
1,Pedestrians walk on bustling Dotombori Street ...,japan,English
2,"looking for ""mouko tanmen nakamoto spicy ramen...",japan,English
3,"5 days in Tokyo, leave suggestions below please!!",japan,English
4,"Glen Wood: ""Paternity Harassment: Glen Wood vs...",japan,English
5,School issues are No. 1 reason behind youth su...,japan,English
6,Japan's defense chief has 'no plan' to dispatc...,japan,English
8,Talent at Japan's Biggest Agency to Pledge Not...,japan,English
9,An American baseball player of Fukuoka Softban...,japan,English
10,"Hello fellow Japanese redditors, I'm lookin fo...",japan,English


In [14]:
# Replace the duplicated / gappy row labels with a clean 0..n-1 index.
data_finalposts = data_finalposts.reset_index(drop=True)

## Classification Models

#### Engineer a binary `source` target from the `subreddit` column, where 1 indicates `japan` and 0 indicates `singapore`.

In [15]:
# 1 for r/japan posts, 0 otherwise. The comparison is vectorised, so it does
# not depend on the row labels being exactly 0..n-1 (a per-row `[i]` lookup
# breaks on a duplicated or gappy index) and avoids n scalar lookups.
data_finalposts["source"] = (data_finalposts["subreddit"] == "japan").astype("int64")

In [16]:
# Features: the raw post titles. Target: the 0/1 `source` column as a 1-D
# NumPy array. `Series.ravel()` is deprecated in pandas 2.2; `to_numpy()` is
# the documented replacement and returns the same 1-D array.
X = data_finalposts["title"]
y = data_finalposts["source"].to_numpy()

In [17]:
# Hold out 25% of the posts for testing. `stratify=y` keeps the japan/singapore
# ratio the same in both splits and `random_state` makes the split repeatable.
# (`train_test_split` is already imported in the imports cell at the top, so
# it is not re-imported here.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [18]:
class model_evaluation:
    """Binary-classification report built from true and predicted labels.

    Attributes:
        y_test: the true labels.
        predicted_value: the labels predicted by a model.
        precision, recall, score: set by ``confusion_matrix()``; ``score``
            holds the F1 score.
    """

    def __init__(self, y_test, predicted_value):
        self.y_test = y_test
        self.predicted_value = predicted_value

    def confusion_matrix(self):
        """Print confusion-matrix counts, precision, recall and F1.

        The F1 score is also stored on ``self.score`` (precision and recall
        on ``self.precision`` / ``self.recall``). A metric whose denominator
        is zero is reported as 0.0 instead of dividing by zero.

        Raises:
            ValueError: if the confusion matrix is not 2x2, i.e. the labels
                do not contain exactly two classes.
        """
        # Inside the method body the bare name resolves to the module-level
        # function imported from sklearn.metrics, not to this method.
        cells = confusion_matrix(self.y_test, self.predicted_value).ravel()
        if len(cells) != 4:
            raise ValueError(
                "expected a 2x2 confusion matrix (two classes), got %d cells"
                % len(cells)
            )
        tn, fp, fn, tp = (int(cell) for cell in cells)

        print("True Negatives: %s" % tn)
        print("False Positives: %s" % fp)
        print("False Negatives: %s" % fn)
        print("True Positives: %s" % tp)

        # Guard each ratio: no predicted positives, no actual positives, or
        # precision + recall == 0 would otherwise divide by zero.
        precision = tp/(tp+fp) if (tp+fp) else 0.0
        recall = tp/(tp+fn) if (tp+fn) else 0.0
        print("Precision: {}" .format(precision))
        print("Recall: {}" .format(recall))

        if precision + recall:
            F1_score = 2*((precision*recall)/(precision+recall))
        else:
            F1_score = 0.0
        print("F1 score: {}" .format(F1_score))

        self.precision = precision
        self.recall = recall
        self.score = F1_score

#### Opt 1: Evaluate the performance of a Logistic Regression on the features extracted by the CountVectorizer

In [19]:
# Bag-of-words counts feeding a logistic regression.
lrtext_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('lr', LogisticRegression()),
])

# Grid to search. `ngram_range` is (min_n, max_n): every n with
# min_n <= n <= max_n is extracted, so the options are unigrams only,
# unigrams + bigrams, and bigrams only.
lrtuned_parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (2, 2)],
    lr__solver=["lbfgs"],
)

In [20]:
# 10-fold cross-validated grid search on the training split, then evaluate
# the fitted search object's predictions on the held-out test split.
lr = GridSearchCV(estimator=lrtext_clf, param_grid=lrtuned_parameters, cv=10)
lr.fit(X_train, y_train)
predicted_lr = lr.predict(X_test)
lrg = model_evaluation(y_test=y_test, predicted_value=predicted_lr)
lrg.confusion_matrix()

True Negatives: 204
False Positives: 27
False Negatives: 61
True Positives: 163
Precision: 0.8578947368421053
Recall: 0.7276785714285714
F1 score: 0.78743961352657


#### Opt 2: Fit a Multinomial Naive Bayes model with CountVectorizer

In [21]:
# Bag-of-words counts feeding a multinomial Naive Bayes classifier.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Grid to search. `ngram_range` is (min_n, max_n): every n with
# min_n <= n <= max_n is extracted. `alpha` is the Naive Bayes smoothing
# parameter.
tuned_parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (2, 2)],
    clf__alpha=[1, 1e-1, 1e-2],
)

In [22]:
# 10-fold cross-validated grid search for the count-vector Naive Bayes
# pipeline, followed by evaluation on the held-out test split.
clf = GridSearchCV(estimator=text_clf, param_grid=tuned_parameters, cv=10)
clf.fit(X_train, y_train)
predicted_nbcvec = clf.predict(X_test)
nbcvec = model_evaluation(y_test=y_test, predicted_value=predicted_nbcvec)
nbcvec.confusion_matrix()

True Negatives: 182
False Positives: 49
False Negatives: 34
True Positives: 190
Precision: 0.7949790794979079
Recall: 0.8482142857142857
F1 score: 0.8207343412526998


#### Opt 3: Fit a Multinomial Naive Bayes model with TfidfVectorizer

In [23]:
# TF-IDF weighted term features feeding a multinomial Naive Bayes classifier.
text_tclf = Pipeline(steps=[
    ('Tvect', TfidfVectorizer()),
    ('tclf', MultinomialNB()),
])

# Grid to search. `ngram_range` is (min_n, max_n): every n with
# min_n <= n <= max_n is extracted. `alpha` is the Naive Bayes smoothing
# parameter.
tuned_tparameters = dict(
    Tvect__ngram_range=[(1, 1), (1, 2), (2, 2)],
    tclf__alpha=[1, 1e-1, 1e-2],
)

In [24]:
# 10-fold cross-validated grid search for the TF-IDF Naive Bayes pipeline,
# followed by evaluation on the held-out test split.
tclf = GridSearchCV(estimator=text_tclf, param_grid=tuned_tparameters, cv=10)
tclf.fit(X_train, y_train)
predicted_nbtvec = tclf.predict(X_test)
nbtvec = model_evaluation(y_test=y_test, predicted_value=predicted_nbtvec)
nbtvec.confusion_matrix()

True Negatives: 186
False Positives: 45
False Negatives: 37
True Positives: 187
Precision: 0.8060344827586207
Recall: 0.8348214285714286
F1 score: 0.8201754385964911


#### Summary

In [25]:
# Side-by-side F1 scores of the three models, in the order they were fitted.
for template, evaluation in (
    ("F1 score - Logistic regression with cvec {}", lrg),
    ("F1 score - Multinomial Naive Bayes model wtih cvec {}", nbcvec),
    ("F1 score - Multinomial Naive Bayes model with Tfid  {}", nbtvec),
):
    print(template.format(evaluation.score))

F1 score - Logistic regression with cvec 0.78743961352657
F1 score - Multinomial Naive Bayes model wtih cvec 0.8207343412526998
F1 score - Multinomial Naive Bayes model with Tfid  0.8201754385964911
