## Fetch subreddit data using the Pushshift API

<img src="Images/WordCloud-Black.png"
     alt="Word cloud"
     style="float: left; margin-right: 10px;" />

In [1]:
import numpy as np
import pandas as pd
import requests
import datetime
from json.decoder import JSONDecodeError

In [2]:
def importFoodComments(subreddit, size, session=None):
    """Download comments from a subreddit through the Pushshift API.

    The first request fetches one page of up to 100 comments. Up to
    ``size // 100`` further requests then page backwards in time, each one
    asking for rows created before the ``created_utc`` of the last row
    collected so far (this presumes the API lists newest rows first).

    Paging stops as soon as the API returns an empty page, so a subreddit
    holding fewer comments than requested yields what exists instead of
    raising ``IndexError``.

    The first response's HTTP status code, periodic progress lines and the
    final row count are printed.

    Args:
        subreddit: Subreddit name sent as the ``subreddit`` query parameter.
        size: Number of rows to request in addition to the first page;
            floored to a multiple of 100.
        session: Optional object exposing ``get(url, params=...)`` whose
            result provides ``status_code`` and ``json()``, for example a
            ``requests.Session``. The ``requests`` module is used when
            omitted.

    Returns:
        list: The comment dicts taken from each response's ``data`` field,
        in the order received (at most ``100 + size // 100 * 100`` rows).
    """
    url = 'https://api.pushshift.io/reddit/search/comment'
    page_size = 100  # rows requested per call
    http = requests if session is None else session

    ## First page
    res = http.get(url, params={'subreddit': subreddit, 'size': page_size})
    print(res.status_code)  ## Test Connection

    rows = list(res.json()['data'])

    ## Without a first page there is no timestamp to page back from
    extra_pages = size // page_size if rows else 0

    for i in range(extra_pages):
        query = {
            'subreddit': subreddit,
            'size': page_size,
            'before': rows[-1]['created_utc'],  ## only rows older than the last one held
        }
        new_rows = http.get(url, params=query).json()['data']

        ## An empty page means the subreddit has no older rows left
        if not new_rows:
            break

        rows.extend(new_rows)

        if i % 5 == 0:
            print(f'{100 + (i*100)} of {size + 100}...')

    print(len(rows))
    print('DONE.')

    return rows

In [3]:
def importFoodSubmission(subreddit, size, session=None):
    """Download submissions from a subreddit through the Pushshift API.

    The first request fetches one page of up to 100 submissions. Up to
    ``size // 100`` further requests then page backwards in time, each one
    asking for rows created before the ``created_utc`` of the last row
    collected so far (this presumes the API lists newest rows first).

    Paging stops as soon as the API returns an empty page, so a subreddit
    holding fewer submissions than requested yields what exists instead of
    raising ``IndexError``.

    The first response's HTTP status code, periodic progress lines and the
    final row count are printed.

    Args:
        subreddit: Subreddit name sent as the ``subreddit`` query parameter.
        size: Number of rows to request in addition to the first page;
            floored to a multiple of 100.
        session: Optional object exposing ``get(url, params=...)`` whose
            result provides ``status_code`` and ``json()``, for example a
            ``requests.Session``. The ``requests`` module is used when
            omitted.

    Returns:
        list: The submission dicts taken from each response's ``data``
        field, in the order received (at most ``100 + size // 100 * 100``
        rows).
    """
    url = 'https://api.pushshift.io/reddit/search/submission'
    page_size = 100  # rows requested per call
    http = requests if session is None else session

    ## First page
    res = http.get(url, params={'subreddit': subreddit, 'size': page_size})
    print(res.status_code)  ## Test Connection

    rows = list(res.json()['data'])

    ## Without a first page there is no timestamp to page back from
    extra_pages = size // page_size if rows else 0

    for i in range(extra_pages):
        query = {
            'subreddit': subreddit,
            'size': page_size,
            'before': rows[-1]['created_utc'],  ## only rows older than the last one held
        }
        new_rows = http.get(url, params=query).json()['data']

        ## An empty page means the subreddit has no older rows left
        if not new_rows:
            break

        rows.extend(new_rows)

        if i % 5 == 0:
            print(f'{100 + (i*100)} of {size + 100}...')

    print(len(rows))
    print('DONE.')

    return rows

In [6]:
#japfoodcom2 = importFoodSubmission('JapaneseFood', 5000)

200
100 of 5100...
600 of 5100...
1100 of 5100...
1600 of 5100...
2100 of 5100...
2600 of 5100...
3100 of 5100...
3600 of 5100...
4100 of 5100...
4600 of 5100...
5100
DONE.


In [7]:
#japfoodcom = importFoodComments('JapaneseFood', 5000)

200
100 of 5100...
600 of 5100...
1100 of 5100...
1600 of 5100...
2100 of 5100...
2600 of 5100...
3100 of 5100...
3600 of 5100...
4100 of 5100...
4600 of 5100...
5100
DONE.


In [8]:
#chifoodcom = importFoodSubmission('chinesefood', 5000)

200
100 of 5100...
600 of 5100...
1100 of 5100...
1600 of 5100...
2100 of 5100...
2600 of 5100...
3100 of 5100...
3600 of 5100...
4100 of 5100...
4600 of 5100...
5100
DONE.


In [19]:
#chifoodcom2 = importFoodComments('chinesefood', 5000)

200
100 of 5100...
600 of 5100...
1100 of 5100...
1600 of 5100...
2100 of 5100...
2600 of 5100...
3100 of 5100...
3600 of 5100...
4100 of 5100...
4600 of 5100...
5100
DONE.


In [26]:
#mexfoodcom = importFoodComments('mexicanfood', 5000)

200
100 of 1100...
600 of 1100...
1100
DONE.


In [22]:
#mexfoodcom2 = importFoodSubmission('mexicanfood', 5000)

200
100 of 1100...
600 of 1100...
1100
DONE.


In [27]:
## Japanese Food - one 'title' column: the 'title' field of japfoodcom2 followed by the 'body' field of japfoodcom
jap_header = pd.DataFrame(japfoodcom2)[['title']]
## Rename 'body' to 'title' so both frames share a single column when stacked
jap_commmm = pd.DataFrame(japfoodcom)[['body']].rename(columns={'body': 'title'})
jap_food = pd.concat([jap_header, jap_commmm])
jap_food

Unnamed: 0,title
0,Broccoli Cheddar Soup | Broccoli Soup | Homema...
1,Kaisendon (with ikura and maguro). Super tasty!
2,I made oyakodon for the first time! Very tasty
3,"Kaisendon, yumminess"
4,Homemade mochi I made today (flavours in comme...
...,...
5095,Use it in essentially every Japanese recipe fo...
5096,"You’re doing good work, there is no comparison..."
5097,Just want to say this is beautiful and so well...
5098,Even more amazing - they look absolutely gorge...


In [28]:
## Chinese Food - one 'title' column: the 'title' field of chifoodcom followed by the 'body' field of chifoodcom2
chi_header = pd.DataFrame(chifoodcom)[['title']]
## Rename 'body' to 'title' so both frames share a single column when stacked
chi_commmm = pd.DataFrame(chifoodcom2)[['body']].rename(columns={'body': 'title'})
chi_food = pd.concat([chi_header, chi_commmm])
chi_food

Unnamed: 0,title
0,"Chengdu delicious food！“Maocai Roasted Duck”, ..."
1,pumpkin pot
2,Stir fried tomato with scrambled egg
3,Supper: barbecue.
4,Spice mix identification
...,...
5095,That looks amazing.
5096,"Wanna try making steamed egg? It’s super easy,..."
5097,Hungry
5098,Haha thanks!!


In [32]:
## Mexican Food - one 'title' column: the 'title' field of mexfoodcom2 followed by the 'body' field of mexfoodcom
mex_header = pd.DataFrame(mexfoodcom2)[['title']]
## Rename 'body' to 'title' so both frames share a single column when stacked
mex_commmm = pd.DataFrame(mexfoodcom)[['body']].rename(columns={'body': 'title'})
mex_food = pd.concat([mex_header, mex_commmm])

In [30]:
## Add Labels (Define Categories): 0 = jap_food, 1 = chi_food, 2 = mex_food
for _label, _frame in enumerate((jap_food, chi_food, mex_food)):
    _frame['label'] = _label

In [31]:
## Split into 'train' and 'test' for final evaluation
import os

## to_csv does not create missing directories, so make sure the target exists
os.makedirs('FoodData', exist_ok=True)

## NOTE(review): the split is positional and unshuffled. Each frame holds
## titles followed by comment bodies, so the test slice is only the tail of
## the comments. The recorded outputs show 1100 + 1100 rows for mexicanfood,
## which would leave mex_test.csv with no rows - confirm the intended split.
_TRAIN_ROWS = 10000

for _prefix, _frame in (('jap', jap_food), ('chi', chi_food), ('mex', mex_food)):
    ## Train Data: the first _TRAIN_ROWS rows
    _frame[:_TRAIN_ROWS].to_csv(f'FoodData/{_prefix}_train.csv', index=False)

    ## Test Data: everything after the first _TRAIN_ROWS rows
    _frame[_TRAIN_ROWS:].to_csv(f'FoodData/{_prefix}_test.csv', index=False)

## Further Data Exploration under separate file

[Click here for EDA file](./DataPreprocessingandModelBuilding.ipynb#Comparing-scores-for-all-Models)