# Project 3: Web APIs & Classification

## 1. Problem Statement

Using Reddit's API, we have collected posts from two subreddits:
* Casual Conversations
* Board Games

Then we will use NLP to train a classifier on which subreddit a given post came from (A binary classification problem).

## Conclusion:

Multinomial Naive Bayes with TFIDF gave the best accuracy score of 96.03%

The other techniques used include:
1. Logistic Regression
2. Multinomial Naive Bayes:
3. RandomForest methods.
4. Ensemble techniques to optimize the models.

## 2. Import Libraries

In [101]:
import requests
import pandas as pd
import time
import random

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# 3. Get the Reddit posts

Initialize the URLs of the selected Reddits and specify the counts of iterations to be done for each reddit.

In [2]:
#The subreddits URLs
url = pd.DataFrame([
    ['casualconversation', 'https://www.reddit.com/r/casualconversation.json', 32],
    ['boardgames', 'https://www.reddit.com/r/boardgames.json', 38],
])

## 3.1 The dictionary structure of reddit_dict

reddit_dict = {
    kind:,
    data: {
        modhash:, 
        dist:, 
        children:{ # has 26 elements
            approved_at_utc:, 
            subreddit:,  #The cell directly above gives you the class label, aka your target.
            selftext:,   #Mapping to the first post
            author_fullname:, 
            saved:, 
            mod_reason_title:, 
            gilded:, 
            clicked:, 
            title:,  # The title of the post.
            link_flair_richtext:, 
            subreddit_name_prefixed:, 
            hidden:, 
            pwls:, 
            link_flair_css_class:, 
            downs:, 
            thumbnail_height:, 
            hide_score:, 
            name:, 
            quarantine:, 
            link_flair_text_color:, 
            author_flair_background_color:, 
            subreddit_type:, 
            ups:, 
            total_awards_received:, 
            media_embed:, 
            thumbnail_width:, 
            author_flair_template_id:, 
            is_original_content:, 
            user_reports:, 
            secure_media:, 
            is_reddit_media_domain:, 
            is_meta:, 
            category:, 
            secure_media_embed:, 
            link_flair_text:, 
            can_mod_post:, 
            score:, 
            approved_by:, 
            thumbnail:, 
            edited:, 
            author_flair_css_class:, 
            author_flair_richtext:, 
            gildings:, 
            post_hint:, 
            content_categories:, 
            is_self:, 
            mod_note:, 
            created:, 
            link_flair_type:, 
            wls:, 
            banned_by:, 
            author_flair_type:, 
            domain:, 
            selftext_html:, 
            likes:, 
            suggested_sort:, 
            banned_at_utc:, 
            view_count:, 
            archived:, 
            no_follow:, 
            is_crosspostable:, 
            pinned:, 
            over_18:, 
            preview:, 
            all_awardings:, 
            media_only:, 
            can_gild:, 
            spoiler:, 
            locked:, 
            author_flair_text:, 
            visited:, 
            num_reports:, 
            distinguished:, 
            subreddit_id:, 
            mod_reason_by:, 
            removal_reason:, 
            link_flair_background_color:, 
            id:, 
            is_robot_indexable:, 
            report_reasons:, 
            author:, 
            num_crossposts:, 
            num_comments:, 
            send_replies:, 
            whitelist_status:, 
            contest_mode:, 
            mod_reports:, 
            author_patreon_flair:, 
            author_flair_text_color:, 
            permalink:, 
            parent_whitelist_status:, 
            stickied:, 
            url:, 
            subreddit_subscribers:, 
            created_utc:, 
            media:, 
            is_video:            
        },
        after:, 
        before:
    }
}
        

## 3.2 Function to read through a given reddit and return the number of posts based on the specified iteration counts

In [19]:
# A function to read the reddit posts
def read_reddit(url1, subreddit_name, iterations_range):

    posts = []
    after = None

    for a in range(iterations_range):
        if after == None:
            current_url = url1
        else:
            current_url = url1 + '?after=' + after
       
        res = requests.get(current_url, headers={'User-agent': 'Pink Inc 1.0'})

        if res.status_code != 200:
            print('Status error', res.status_code)
            break

        current_dict = res.json()
        current_posts = [p['data'] for p in current_dict['data']['children']]
        posts.extend(current_posts)
        after = current_dict['data']['after']

        if a > 0:
            prev_posts = pd.read_csv(subreddit_name + '.csv')
            current_df = pd.DataFrame()
            pd.DataFrame(posts).to_csv(subreddit_name + '.csv', index = False)
        else:
            pd.DataFrame(posts).to_csv(subreddit_name + '.csv', index = False)

        if a % 10 == 0:
            print(str(a) + " iterations done")

        # Be a good internet citizen and generate a random sleep duration and not overload the Reddit servers
        time.sleep(1)

    return posts 

## 3.2 Save reddit posts

Save the dataframe of reddit posts into a .csv file as per the name given.

In [20]:
def store_reddit(posts, subreddit_name):
    pd.DataFrame(posts).to_csv(subreddit_name + '.csv', index = False)

## 3.3 Call the functions to read and save the reddit posts

Also check if the post entries are unique based on the 'name' feature.

In [21]:
for i in range(2):
    reddit_posts = read_reddit(url.iloc[i][1], url.iloc[i][0], url.iloc[i][2])
    
    if len(set(pd.DataFrame(reddit_posts)['name'])) == len(reddit_posts):
        store_reddit( reddit_posts, url.iloc[i][0])
    
        print("\n" + str(len(reddit_posts)) + " Unique posts stored for reddit: "+ url.iloc[i][0])
    
    else:
        print("\n" + str(len(set(pd.DataFrame(reddit_posts)['name']))) + " Unique posts stored for reddit: "+ url.iloc[i][0])
        print(str(len(reddit_posts)) + " Total posts stored for reddit: "+ url.iloc[i][0])


0 iterations done
10 iterations done
20 iterations done
30 iterations done

797 Unique posts stored for reddit: casualconversation
0 iterations done
10 iterations done
20 iterations done
30 iterations done

946 Unique posts stored for reddit: boardgames


## 3.4 Convert the reddit posts into dataframes and do a train test split

In [114]:
# Read from both subreddits
reddit_posts1 = pd.read_csv(url.iloc[0][0] + '.csv')
reddit_posts2 = pd.read_csv(url.iloc[1][0] + '.csv')

reddit_posts1.dropna(subset=['title','selftext'],inplace=True)
reddit_posts2.dropna(subset=['title','selftext'],inplace=True)

# Create X data frame with data from both the subreddits
X = pd.DataFrame(reddit_posts1[['title','selftext']])
X = X.append(reddit_posts2[['title','selftext']])

# Create feature "text" in X which contains lowercase text from the reddit title and posts. 
X['text'] = X['title'] + X['selftext']
X['text'].str.lower()
X = X['text'].str.lower()

# Create y data frame with data from both the subreddits
y = pd.DataFrame(reddit_posts1[["subreddit"]])
y = y.append(reddit_posts2[["subreddit"]])

# Create feature "target" in Y which is the target value
y['subreddit'] = y['subreddit'].apply(lambda X:1 if X==url.iloc[1][0] else 0)


# perform train/test split:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, stratify=y['subreddit'], random_state=42, test_size=.2)


# vect = CountVectorizer(stop_words="english")
# vect.fit(X['text'].values.astype('U'))

# # Instantiate tokenizer.
# tokenizer = RegexpTokenizer('[a-z]\w+')


# # Run tokenizer.
# tokenizer.tokenize(s)

# dicti = vect.vocabulary_

# with open('file10.txt', 'bx') as file:
#     file.write(str(dicti).encode("utf-8"))
    
# # Read the file
# f = open('file10.txt','r', encoding="utf8")

# # Read file into string
# s = f.read()

# # Split string using delimiter - in my case it was comma. Change as needed
# my_list = s.split(',')

# print(len(my_list))
# # Convert list into dataframe and write it into a csv file

# pd.DataFrame(my_list).to_csv("file11.csv")

# #pd.DataFrame(vect.vocabulary_).to_csv("vect" + '.csv', index = False)



# 4. Text processing and EDA

## 4.1 Summary

NA rows deleted in Casual Conversations = 1
NA rows deleted in boardgames =  106
Baseline word count = 16521
After removal of stop words = 16228
After changing all to lowercase = 16228
After retaining only the alphabetic text = 13049

## 4.2 Function to Display Wordcloud

In [None]:
def display_wordcount(subreddit_df, iterations_range):
    corpus_string = ''

    for i in range(iterations_range):
    corpus_string = corpus_string + subreddit_df.iloc[i]['text']


    mycloud_2 = WordCloud(width=1000, height=1000,
                    collocations=False, #get rid of duplicates
                   normalize_plurals=True).generate_from_text(corpus_string)

    plt.figure(figsize = (10, 10), facecolor = None)
    plt.imshow(mycloud_2)
    plt.axis("off")
    plt.tight_layout(pad = 0)

## 4.3 CountVectorize and transform the train and the test data 



In [57]:
cvec = CountVectorizer(lowercase=True, token_pattern='[a-z]\w+', stop_words=ENGLISH_STOP_WORDS)
cvec.fit(X_train)
print(len(cvec.get_feature_names()))
#print(cvec.get_feature_names())
X_train_cv = pd.DataFrame(cvec.transform(X_train).todense(),columns=cvec.get_feature_names())
X_test_cv = pd.DataFrame(cvec.transform(X_test).todense(),columns=cvec.get_feature_names())

print(X_train_cv.shape)
print(X_test_cv.shape)


13049
(1308, 13049)
(328, 13049)


# 5 Modeling

## 5.1 Logistic Regression

In [131]:
lr = LogisticRegression()

lr.fit(X_train_cv, y_train)

y_pred = lr.predict(X_test_cv)

print('accuracy score',accuracy_score(y_test, y_pred))
print('accuracy score',lr.score(X_test_cv, y_test))
lr = LogisticRegression()
lr.fit(X_train_cv, y_train)
y_pred = lr.predict(X_test_cv)
print('accuracy score',accuracy_score(y_test, y_pred))
print('accuracy score',lr.score(X_test_cv, y_test))

#columns
columns=cvec.get_feature_names()

  y = column_or_1d(y, warn=True)


accuracy score 0.9542682926829268
accuracy score 0.9542682926829268


  y = column_or_1d(y, warn=True)


accuracy score 0.9542682926829268
accuracy score 0.9542682926829268


### a. Analyzing Keywords via Beta Coefficients

In [72]:
# Analyzing Keywords via Beta Coefficients
lr_coef = pd.DataFrame(lr.coef_, columns = columns)
df_coef = lr_coef.T.sort_values(by = 0, ascending = False)
df_coef

# key word for boardgames:
#   game; boardgame, games, baord, play, copy cards, tabletop, kickstarter, buying, box, player

# key words for Casualconversations:
#   school, reddit, life, talk, think, feel, phone, job, video, movies, birthday, work, tell, watching, 

Unnamed: 0,0
game,1.968815
boardgame,1.110741
games,1.039157
board,0.962355
play,0.948913
copy,0.888411
cards,0.854315
tabletop,0.821358
kickstarter,0.765846
buying,0.735457


### b. Confusion Matrix

In [92]:
# confusion matrix:

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data=cm, columns=['predicted casualconversation', 'predicted boardgames'], 
                     index=['actual casualconversation', 'actual boardgames'])
cm_df

Unnamed: 0,predicted casualconversation,predicted boardgames
actual casualconversation,152,8
actual boardgames,7,161


In [86]:
y_test['subreddit'].value_counts()

1    168
0    160
Name: subreddit, dtype: int64

## 5.2 TFIDF Vectorizer & Logistic Regression:


In [132]:

model = make_pipeline(TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS),
                      LogisticRegression(),
                      )
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('accuracy score',accuracy_score(y_test, y_pred))

accuracy score 0.9542682926829268


  y = column_or_1d(y, warn=True)


### a. Confusion matrix:


In [78]:
# confusion matrix: 

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data=cm, columns=['predicted casualconversation', 'predicted boardgames'], 
                     index=['actual casualconversation', 'actual boardgames'])
cm_df

Unnamed: 0,predicted casualconversation,predicted boardgames
actual casualconversation,154,6
actual boardgames,9,159


In [85]:
y_test['subreddit'].value_counts()

1    168
0    160
Name: subreddit, dtype: int64

## 5.3 Multinomial Naive Bayes:

In [119]:
nb = MultinomialNB()
nb.fit(X_train_cv, y_train)

  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [120]:
y_pred = nb.predict(X_test_cv)

In [121]:
accuracy_score(y_test, y_pred)

0.9542682926829268

### a. Confusion matrix:

In [122]:
# confusion matrix: 

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data=cm, columns=['predicted casualconversation', 'predicted boardgames'], 
                     index=['actual casualconversation', 'actual boardgames'])
cm_df

Unnamed: 0,predicted casualconversation,predicted boardgames
actual casualconversation,152,8
actual boardgames,7,161


In [123]:
y_test['subreddit'].value_counts()

1    168
0    160
Name: subreddit, dtype: int64

## 5.4 Multinomial Naive Bayes with TFIDF:

In [124]:

nb_tfidf = make_pipeline(TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS),
                      nb)
nb_tfidf.fit(X_train, y_train)
y_pred = nb_tfidf.predict(X_test)

  y = column_or_1d(y, warn=True)


In [125]:
accuracy_score(y_test, y_pred)

0.9603658536585366

### a. Confusion Matrix

In [126]:
# confusion matrix: 

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data=cm, columns=['predicted casualconversation', 'predicted boardgames'], 
                     index=['actual casualconversation', 'actual boardgames'])
cm_df

Unnamed: 0,predicted casualconversation,predicted boardgames
actual casualconversation,151,9
actual boardgames,4,164


## 5.5 Pipeline - Count Vectorizer & Logistic Regression:

In [127]:
# same as above but with a Pipeline:
model = make_pipeline(CountVectorizer(stop_words=ENGLISH_STOP_WORDS),LogisticRegression())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('accuracy score',accuracy_score(y_test, y_pred))

accuracy score 0.9542682926829268


  y = column_or_1d(y, warn=True)


## 5.6 RandomForest with CountVectorizer - Gridsearch Params + Pipeline

In [110]:
# same as above but with Gridsearch & pipeline:

rf_model = make_pipeline(CountVectorizer(stop_words=ENGLISH_STOP_WORDS),
                      RandomForestClassifier(n_estimators= 7, random_state = 42))
#params={'n_estimators' : [5, 7, 10]}
params={}
gs= GridSearchCV(rf_model, param_grid=params)
gs.fit(X_train, y_train)
y_pred = gs.predict(X_test)
#print('best params', gs.best_params_)
print('accuracy score',accuracy_score(y_test, y_pred))
print('best cv score', gs.best_score_)
print('test score', gs.score(X_test, y_test))

  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)


accuracy score 0.9115853658536586
best cv score 0.8983180428134556
test score 0.9115853658536586


  self._final_estimator.fit(Xt, y, **fit_params)


In [105]:
# confusion matrix: 

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data=cm, columns=['predicted casualconversation', 'predicted boardgames'], 
                     index=['actual casualconversation', 'actual boardgames'])
cm_df

Unnamed: 0,predicted casualconversation,predicted boardgames
actual casualconversation,148,12
actual boardgames,17,151


### a. Pipeline - RandomForest with TFIDFVectorizer & Gridsearch Params:

In [109]:

rf_model_2 = make_pipeline(TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS),
                      RandomForestClassifier(n_estimators=7, random_state=42))
#params={'n_estimators' : [5, 7, 10]}
gs1= GridSearchCV(rf_model_2, param_grid=params)
gs1.fit(X_train, y_train)
y_pred = gs1.predict(X_test)
print('accuracy score',accuracy_score(y_test, y_pred))
print('best cv score', gs1.best_score_)
print('test score', gs1.score(X_test, y_test))

  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)
  self._final_estimator.fit(Xt, y, **fit_params)


accuracy score 0.8810975609756098
best cv score 0.8983180428134556
test score 0.8810975609756098


  self._final_estimator.fit(Xt, y, **fit_params)


In [108]:
# confusion matrix: 

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data=cm, columns=['predicted casualconversation', 'predicted boardgames'], 
                     index=['actual casualconversation', 'actual boardgames'])
cm_df

Unnamed: 0,predicted casualconversation,predicted boardgames
actual casualconversation,135,25
actual boardgames,14,154
