## 2. Load JSON files into the database
This Jupyter notebook loads JSON files into the SQLite database `core.db`, which contains the `twitter` and `user` tables.

**input** : JSON files in the folder `./extract`  
**output**: database `core.db` in the folder `./`

In [None]:
import pandas as pd
import json
import sqlite3
import os
import time
import re

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pickle

# create db file and conn
# Opening with 'w+' creates core.db if it is missing and truncates any
# previous contents, so every run starts from an empty database. The handle
# is closed straight away so that only sqlite3 keeps the file open.
with open('core.db', 'w+') as db_file:
    pass
# isolation_level=None puts the connection in autocommit mode; the loading
# cell below issues its own BEGIN TRANSACTION / COMMIT statements.
conn = sqlite3.connect('core.db', isolation_level=None)
cur = conn.cursor()

# change working dir
# NOTE: the connection above was opened before the chdir, so core.db stays in
# the starting directory while the JSON files are read from ./extract.
os.chdir('./extract')

# get json file name list (every entry whose extension is exactly "json")
file_list = [
    name for name in os.listdir()
    if os.path.splitext(name)[1][1:] == "json"
]
print('amount of json file : ', len(file_list))

In [None]:
# sentiment scoring via analyzer.polarity_scores

# One analyzer instance is built up front and shared by every call.
analyzer = SentimentIntensityAnalyzer()


def get_score(sentence):
    """Return the 'compound' entry of the analyzer's polarity scores.

    Args:
        sentence: Text handed unchanged to ``analyzer.polarity_scores``.

    Returns:
        The value stored under the ``'compound'`` key of the result.
    """
    return analyzer.polarity_scores(sentence)['compound']

In [None]:
# create twitter table
#
# IF NOT EXISTS makes this cell safe to re-run: without it a second execution
# fails with "table twitter already exists".
# The primary key uses ON CONFLICT IGNORE, so inserting a tweet id that is
# already stored is silently skipped instead of raising an error.
query = '''
CREATE TABLE IF NOT EXISTS twitter(
    'created_at' TEXT,
    'id' INTEGER,
    'text' TEXT,
    'in_reply_to_status_id' INTEGER,
    'in_reply_to_user_id' INTEGER,
    'user_mention_id' TEXT,
    'hashtags' TEXT,
    'user_id' INTEGER,
    'timestamp_ms' INT,
    'sentiment_score' REAL,
    PRIMARY KEY (id) ON CONFLICT IGNORE
);'''

cur.execute(query)
conn.commit()

In [None]:
# create user table
#
# IF NOT EXISTS makes this cell safe to re-run: without it a second execution
# fails with "table user already exists".
# The primary key uses ON CONFLICT IGNORE, so a user that is already stored
# is silently skipped when the same user_id is inserted again.
query = '''
CREATE TABLE IF NOT EXISTS user(
    'user_id' INTEGER,
    'user_name' TEXT,
    PRIMARY KEY (user_id) ON CONFLICT IGNORE
);'''

cur.execute(query)
conn.commit()

In [None]:
# extract information from json file to query string
def load_user_cursor(dict_0):
    """Build the INSERT statement that stores a tweet's author in ``user``.

    Args:
        dict_0: One decoded tweet (a dict). Its ``'user'`` entry is read for
            the ``'id'`` and ``'name'`` fields.

    Returns:
        str: A complete ``INSERT INTO user ...`` statement. A missing user
        object, a missing field, or a ``None`` name is written as an empty
        string.
    """
    user = dict_0.get('user')
    if not isinstance(user, dict):
        # Missing or null user object: fall back to empty values for both
        # columns instead of raising.
        user = {}

    user_id = user.get('id', '')
    user_name = user.get('name', '')

    if user_name is None:
        user_name = ''
    elif isinstance(user_name, str):
        # The value is embedded in a single-quoted SQL literal, so a quote is
        # escaped by doubling it. This keeps the stored name identical to the
        # source (O'Brien stays O'Brien) rather than rewriting it.
        user_name = user_name.replace("'", "''")

    return "INSERT INTO user('user_id', 'user_name') VALUES ('{}','{}');".format(user_id, user_name)

In [None]:
# extract information from json file to query string
def _sql_escape(value):
    """Prepare *value* for embedding inside a single-quoted SQL literal."""
    if value is None:
        return ''
    if isinstance(value, str):
        # A single quote inside a SQL string literal is escaped by doubling it.
        return value.replace("'", "''")
    return value


def _entity_list_text(dict_0, entity, field):
    """Return entities[entity][*][field] as JSON list text, or '' if absent."""
    try:
        values = [str(item[field]) for item in dict_0['entities'][entity]]
    except (KeyError, TypeError):
        # No entities section, or an entry that is not shaped as expected.
        return ''
    if not values:
        return ''
    # Double-quoted list such as ["a", "b"]; non-ASCII characters are kept
    # readable instead of being turned into \uXXXX escapes.
    return json.dumps(values, ensure_ascii=False)


def load_tweet_cursor(dict_0):
    """Build the INSERT statement that stores one tweet in ``twitter``.

    Args:
        dict_0: One decoded tweet (a dict).

    Returns:
        str: A complete ``INSERT INTO twitter ...`` statement. Missing or
        ``None`` fields are written as empty strings. Hashtag texts and
        mentioned user ids are stored as double-quoted list text such as
        ``["a", "b"]``, or as an empty string when there are none.
    """
    key_list = ['created_at', 'id', 'text',
                'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp_ms']

    # Copy the plain top-level fields; absent and None values become ''.
    dict_tw = {key: _sql_escape(dict_0.get(key)) for key in key_list}

    # Author id, or '' when the user object or its id is missing.
    user = dict_0.get('user')
    dict_tw['user_id'] = user.get('id', '') if isinstance(user, dict) else ''

    hashtags_str = _sql_escape(_entity_list_text(dict_0, 'hashtags', 'text'))
    user_mention_str = _sql_escape(_entity_list_text(dict_0, 'user_mentions', 'id'))

    # Score the text as it appears in the tweet, not the SQL-escaped copy, so
    # the analyzer sees the real apostrophes (presumably relevant for
    # contractions such as "don't" -- confirm against the analyzer's docs).
    raw_text = dict_0.get('text')
    if raw_text is None:
        raw_text = ''
    try:
        sentiment_score = get_score(raw_text)
    except Exception:
        # Best effort: a tweet is still stored when scoring fails.
        sentiment_score = ''

    query = (
        "INSERT INTO twitter('created_at', 'id',  'text', 'in_reply_to_status_id', "
        "'in_reply_to_user_id', 'user_mention_id','hashtags','user_id','timestamp_ms',"
        "'sentiment_score') VALUES ('{}','{}','{}','{}','{}','{}','{}','{}','{}','{}');"
    ).format(
        dict_tw['created_at'], dict_tw['id'], dict_tw['text'],
        dict_tw['in_reply_to_status_id'], dict_tw['in_reply_to_user_id'],
        user_mention_str, hashtags_str,
        dict_tw['user_id'], dict_tw['timestamp_ms'], sentiment_score,
    )

    return query


In [None]:
# convert json to dictionary
def ReadJson(json_file):
    """Parse a line-delimited JSON file into a list of decoded objects.

    Every line is decoded on its own with ``json.loads``. Lines that cannot
    be decoded are skipped; their zero-based line numbers are collected and
    reported in a single message printed after the whole file has been read.

    Args:
        json_file: Path of the file to read.

    Returns:
        list: The decoded objects, in file order.
    """
    data = []
    error_list = []
    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(json_file, encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            try:
                data.append(json.loads(line))
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass.
                error_list.append(line_number)

    if error_list:
        print('ReadJson {} has {} errors. Errors in line {}.'.format(json_file,len(error_list),error_list))
    return data

In [None]:
# execute loading : only english
#
# For every JSON file: decode it, build the INSERT statements for the tweets
# whose 'lang' is 'en', and run them inside one explicit transaction.
print('amount of json file : ', len(file_list))

total_time_start = time.time()                      # total time counter start

counter = 0                                         # files finished so far

for counter, json_file_name in enumerate(file_list, start=1):
    starttime = time.time()                         # per-file time counter start

    # ---------------------------- read json to list ---------------------------- #
    tweets_list = ReadJson(json_file_name)

    tweet_query_lst = []                            # tweet INSERTs of this file
    user_query_lst = []                             # user INSERTs of this file
    for tweet in tweets_list:
        # A decodable line is not necessarily a tweet object; anything that
        # is not a dict, or has no 'lang' equal to 'en', is skipped.
        if not isinstance(tweet, dict) or tweet.get('lang') != 'en':
            continue
        tweet_query_lst.append(load_tweet_cursor(tweet))
        user_query_lst.append(load_user_cursor(tweet))

    # ------------------------------- start insert ------------------------------ #
    tweet_all = tweet_query_lst + user_query_lst    # tweets first, then users

    conn.execute("BEGIN TRANSACTION")
    try:
        for query in tweet_all:
            try:
                conn.execute(query)
            except (ValueError, sqlite3.Error) as err:
                # Database errors are sqlite3.Error subclasses, not
                # ValueError. Report the statement and carry on so one bad
                # row does not abort the whole file.
                print('Insert error :', query)
                print('Insert error reason :', err)
    except BaseException:
        # Interrupted mid-file: close the transaction so the cell can be
        # re-run without "cannot start a transaction within a transaction".
        conn.execute("ROLLBACK")
        raise
    conn.execute("COMMIT")
    # -------------------------------- end insert ------------------------------- #

    endtime = time.time()                           # this json file is done

    # print out load time and progress
    print('load ' + json_file_name + ' in ' + str(round((endtime - starttime), 2)) + ' sec.')
    print(counter, '/', len(file_list))


total_time_end = time.time()
print('total time ' + str(round((total_time_end - total_time_start) / 60, 2)) + ' mins.')
print('Done')