In [11]:
import json
import os
import glob
import re
import pandas as pd
from os import makedirs
from os.path import join, exists
from datetime import date, timedelta
from pathlib import Path

In [12]:
def get_election_data(path):
    df = pd.read_csv(path, lineterminator='\n')
    text_data = df['text'].to_list()
    dt_data = df['created_at'].to_list()
    
    return text_data, dt_data

In [13]:
def position_aspects(data, dt_data, aspect):
    asp_match = []
    pos = []
    lbls = []
    hdls = []
    dates = []
    
    for idx, txtd in enumerate(data):
        if isinstance(txtd, str):
            txtd = txtd.replace('\n','')
            txtd = txtd.replace('\t','')
            txtd = txtd.replace('\r','')
            txtd = txtd.rstrip()
            txtd = txtd.lstrip()
            
            txtd = re.sub(r"http\S+", "", txtd)
            txtd = re.sub(' +', ' ', txtd)
            
            
            if len(txtd) < 1:
                continue
            
            txtd = txtd.lower()
            
            s_idx = txtd.find(aspect)
            
            if s_idx > -1:
                asp_match.append(aspect)
                pos.append(str(s_idx) + ',' + str(s_idx + len(aspect)))
                lbls.append('positive')
                hdls.append(txtd)
                dates.append(dt_data[idx])
        else:
            print(txtd)
        #except:
        #    continue
            
    return hdls, asp_match, lbls, pos, dates

In [14]:
def write_data(txtd, pos, asp, lbl, dates, path):
    
    df = pd.DataFrame(
    {'headlines': txtd,
     'terms': asp,
     'dates': dates
    })
    
    
    os.makedirs(path, exist_ok=True)
    
    df.to_csv(path + '/hds.csv', index=False)
    
    
    with open(path + '/review.txt', "w") as output:
        for row in txtd:
            #str_d = str(row.rstrip('\r\n'))
            output.write(row + '\n')

    with open(path + '/position.txt', "w") as output:
        for row in pos:
            output.write(str(row) + '\n')

    with open(path + '/term.txt', "w") as output:
        for row in asp:
            output.write(str(row) + '\n')

    with open(path + '/label.txt', "w") as output:
        for row in lbl:
            output.write(str(row) + '\n')
            
    with open(path + '/dates.txt', "w") as output:
        for row in dates:
            output.write(str(row) + '\n')

In [15]:
def get_cand(yr):
    cand1 = ''
    cand2 = ''    
        
    if yr == '2012':
        cand1 = 'obama'
        cand2 = 'romney'
    elif yr == '2016':
        cand1 = 'hillary'
        cand2 = 'trump'
    else:
        cand1 = 'biden'
        cand2 = 'trump'
        
    return cand1, cand2

In [16]:
def generate_training_data(src_path, t_path, cand):
    text_data, dt_data = get_election_data(src_path)
    txtdata, aspects, labels, positions, dates = position_aspects(text_data, dt_data, cand)
    write_data(txtdata, positions, aspects, labels, dates, t_path)

In [17]:
def generate_and_transfer_all():
    
    for yr in ['2012', '2016', '2020']:
        
        cand1, cand2 = get_cand(yr)
        
        for tp in ['candidate', 'economy', 'health', 'immigration', 'environment']:
            
            path = join('twtdata','completedata', tp, yr)
            
            cwd = Path.cwd()
            
            s_path = join(path, cand1 + '.csv')
            t_path = join(cwd.parent, 'finalproject', 'data', 'qb', 'twt', tp, yr, cand1)
            
            generate_training_data(s_path, t_path, cand1)
            
            s_path = join(path, cand2 + '.csv')
            t_path = join(cwd.parent, 'finalproject', 'data', 'qb', 'twt', tp, yr, cand2)
            generate_training_data(s_path, t_path, cand2)
    

In [18]:
generate_and_transfer_all()

In [8]:
path = join('twtdata', 'input', 'candidate', '2012', 'romney')
text_data, dt_data = get_election_data(join(path, '2012_romney.csv'))
txtdata, aspects, labels, positions, dates = position_aspects(text_data, dt_data, 'romney')
print(len(txtdata))
print(len(aspects))
print(len(labels))
print(len(positions))
print(len(dates))
write_data(txtdata, positions, aspects, labels, dates, path)

8003
7659
7659
7659
7659
7659
