### Import dependencies

In [50]:
from bs4 import BeautifulSoup as bs
import re
import sqlite3
import pandas as pd
import fasttext
import json
import numpy as np

### Import data

In [51]:
# Read the whole feed table from the local SQLite database into a DataFrame.
conn = sqlite3.connect("gfm.db")
try:
    feed = pd.read_sql_query("SELECT * FROM feed_tb", conn)
finally:
    # The query result is fully loaded into `feed`, so release the database
    # handle even if the read fails.
    conn.close()

### Clean data

In [52]:
# Keep a single row per campaign url.
feed = feed.drop_duplicates('url')

# Drop descriptions that are missing or shorter than 100 characters before the
# HTML is parsed. .str.len() yields NaN for null entries, which then fail the
# comparison, instead of raising the way len(None) would.
feed['nchar'] = feed['fund_description'].str.len()
# .copy() detaches the filtered frame so the column assignments further down
# do not trigger pandas' SettingWithCopyWarning.
feed = feed[feed['nchar'] >= 100].copy()

#regular expression to remove url
url_reg = r'(?:(?:http|https):\/\/)?([-a-zA-Z0-9.]{2,256}\.[a-z]{2,4})\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?'

#regular expression to remove emojis
# NOTE(review): the range U+24C2-U+1F251 below spans every code point between
# those two values (CJK ideographs included), not only emoji -- confirm that
# this is intended.
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0001F1F2-\U0001F1F4"  # Macau flag
        u"\U0001F1E6-\U0001F1FF"  # flags
        u"\U0001F600-\U0001F64F"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U0001F1F2"
        u"\U0001F1F4"
        u"\U0001F620"
        u"\u200d"
        u"\u2640-\u2642"
        "]+", flags=re.UNICODE)


def _clean_description(raw_html):
    """Return one campaign description with markup, urls and emoji removed.

    Steps, in order: extract the text from the HTML, delete newlines, turn
    non-breaking spaces into plain spaces, delete url matches, delete emoji
    matches, and strip surrounding whitespace.
    """
    # Naming the parser keeps the result independent of which optional parser
    # libraries are installed and avoids bs4's guessed-parser warning.
    text = bs(raw_html, "html.parser").get_text()
    text = text.replace("\n", "").replace(u'\xa0', u' ')
    text = re.sub(url_reg, "", text)
    text = emoji_pattern.sub(r'', text)
    return text.strip()


feed['fund_description'] = feed['fund_description'].map(_clean_description)

# Re-apply the length cutoff: cleaning can shrink a text below 100 characters.
feed['nchar'] = feed['fund_description'].str.len()
feed = feed[feed['nchar'] >= 100].copy()

### Exclude non-English campaigns

In [53]:
def GetLanguage(df, text_col='fund_description',
                path_to_pretrained_model='fasttext_models/lid.176.bin'):
    """Predict the language of every text in ``df[text_col]`` with fastText.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts to classify.
    text_col : str, optional
        Name of the column containing the texts.
    path_to_pretrained_model : str, optional
        Path of the fastText model file to load.

    Returns
    -------
    labels : list of str
        Top predicted label for each row, reduced to the part after the
        ``__label__`` prefix (for example ``'en'``).
    scores : list
        The score fastText reports for that top label, one per row.
    """
    fmodel = fasttext.load_model(path_to_pretrained_model)

    # Classify the frame that was passed in, not a notebook-level variable, so
    # the function works on whatever the caller supplies.
    # fastText's predict() rejects strings that contain a newline, so flatten
    # them defensively.
    texts = [text.replace('\n', ' ') for text in df[text_col].to_list()]

    raw_labels, raw_scores = fmodel.predict(texts)

    # Each entry holds the predictions for one text; keep only the first one.
    labels = [top[0].split('__')[2] for top in raw_labels]
    scores = [top[0] for top in raw_scores]
    return labels, scores

In [54]:
# Run language detection once and attach both outputs as new columns.
lang_labels, lang_scores = GetLanguage(feed)
feed['language'] = lang_labels
feed['lang_score'] = lang_scores



In [55]:
# Retain only the campaigns whose detected language is English.
is_english = feed['language'].eq('en')
feed = feed[is_english]

In [56]:
# Number of campaigns left after the language filter.
feed.shape[0]

89493

### De-identify URLs

In [57]:
# Keep only the identifier and the cleaned text. .copy() makes this an
# independent frame, so adding a column below does not trigger pandas'
# SettingWithCopyWarning.
feed = feed[['url', 'fund_description']].copy()
# Surrogate id for each campaign: its 0-based position in the current row order.
feed['url_deid'] = range(len(feed))

In [58]:
# Save the lookup between each original url and its surrogate id.
url_deid_map = feed[['url', 'url_deid']]
url_deid_map.to_csv('spark-jsl-ccsr/deid/url_deid_map.csv', index=False)

### Export data

In [59]:
# Shape the frame for export: remove the original url, then expose the
# surrogate id under the name 'url' and the description under the name 'text'.
feed = (
    feed
    .drop(columns=['url'])
    .rename(columns={'url_deid': 'url', 'fund_description': 'text'})
)

In [60]:
#algorithm takes approximately 2 seconds per campaign
#let's say we want each chunk to run 12 hours
#12*60*60 = 43200 seconds / 2 seconds per campaign = ~22,000 campaigns
#so we will split data into 4 chunks
n_chunks = 4

# Split the row positions rather than the DataFrame itself: calling
# np.array_split directly on a DataFrame goes through DataFrame.swapaxes,
# which pandas has deprecated. Splitting positions gives the same chunk
# boundaries, and every chunk is still a DataFrame.
row_positions = np.array_split(np.arange(len(feed)), n_chunks)
dfs = [feed.iloc[positions] for positions in row_positions]

In [61]:
# Write each chunk to its own JSON file (feed_chunk_1.json, feed_chunk_2.json, ...)
# as a list of records. Iterating over `dfs` keeps the number of files in step
# with the number of chunks instead of repeating the block once per file.
for chunk_no, chunk in enumerate(dfs, start=1):
    chunk_path = f'spark-jsl-ccsr/input_data/feed_chunk_{chunk_no}.json'
    with open(chunk_path, 'w', encoding='utf-8') as file:
        # force_ascii=False writes non-ASCII characters as-is rather than as \u escapes.
        chunk.to_json(file, orient="records", force_ascii=False)