In [203]:
import gzip
import json
import pandas as pd
import os.path
import numpy as np
from sklearn.model_selection import train_test_split

In [204]:
def read_data(path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location handed straight to ``pd.read_json``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file.
    """
    return pd.read_json(path, lines=True)

def write_file(path, file, df):
    """Write a DataFrame to ``path/file`` as JSON Lines.

    Each row becomes one JSON object on its own line, keyed by column name.

    Parameters
    ----------
    path : str
        Directory the file is written into (it must already exist).
    file : str
        File name, joined onto ``path``.
    df : pandas.DataFrame
        Frame to serialise; it is not modified.
    """
    # 'records' is the documented spelling; the 'record' abbreviation is
    # rejected by current pandas releases.
    records = df.to_dict(orient='records')
    pathname = os.path.join(path, file)
    # The context manager closes the handle even if serialisation fails
    # part-way through.
    with open(pathname, 'w') as outfile:
        for instance in records:
            outfile.write(json.dumps(instance) + '\n')

def sampling_elements(group, k, random_state=None):
    """Return at most ``k`` randomly chosen rows of ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to sample from.
    k : int
        Number of rows to draw.
    random_state : int or numpy random generator, optional
        Forwarded to ``DataFrame.sample`` so a draw can be reproduced.
        The default (``None``) keeps the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        ``group`` itself (same object, original order) when it has fewer
        than ``k`` rows, otherwise a random sample of exactly ``k`` rows.
    """
    if len(group) < k:
        # Too few rows to sample from: hand everything back untouched.
        return group
    return group.sample(k, random_state=random_state)

In [205]:
def process(df):
    """Turn raw review rows into a two-column ``text`` / ``sentiment`` frame.

    ``text`` is ``reviewText`` and ``summary`` joined by a single space.
    ``sentiment`` is derived from the ``overall`` rating after truncating it
    with ``int``: ratings 4 and 5 become 1, ratings 1 and 2 become 0, and
    rows rated 3 are removed. Any other rating value is passed through
    unchanged. Rows whose ``text`` is null (either part missing) are removed
    as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``reviewText``, ``summary`` and ``overall``.
        The frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``sentiment``; surviving rows keep their
        original index labels.

    Raises
    ------
    ValueError
        If an ``overall`` value cannot be converted by ``int`` (e.g. NaN).
    """
    text = df['reviewText'] + ' ' + df['summary']
    rating = pd.Series([int(value) for value in df['overall']], index=df.index)

    # Both masks are computed on the untouched ratings, so a freshly written
    # label 1 can never be mistaken for an original rating of 1.
    positive = rating.isin([4, 5])
    negative = rating.isin([1, 2])
    sentiment = rating.mask(positive, 1).mask(negative, 0)

    # Neutral (3-star) reviews carry no label; null text cannot be used.
    keep = text.notna() & (rating != 3)
    return pd.DataFrame({'text': text[keep], 'sentiment': sentiment[keep]})

def get_labelled_data(df, k):
    """Build a labelled sample with at most ``k`` rows per sentiment value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw review rows in the form accepted by ``process``.
    k : int
        Maximum number of rows kept for each distinct ``sentiment``.

    Returns
    -------
    pandas.DataFrame
        The processed rows, sampled per sentiment class via
        ``sampling_elements`` and re-indexed from 0. Classes appear in the
        sorted order produced by ``groupby``.
    """
    labelled = process(df).dropna().reset_index(drop=True)

    # Iterating over the groups keeps the 'sentiment' column inside every
    # piece. groupby(...).apply is avoided because passing the grouping
    # column to the applied function is deprecated in recent pandas.
    per_class = [
        sampling_elements(group, k)
        for _, group in labelled.groupby('sentiment')
    ]
    if not per_class:
        # Nothing survived processing; pd.concat would reject an empty list.
        return labelled
    return pd.concat(per_class).reset_index(drop=True)

def get_unlabelled_data(df, k):
    """Build an unlabelled sample of at most ``k`` rows.

    The rows go through ``process`` and then have their ``sentiment``
    overwritten with the placeholder ``'-'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw review rows in the form accepted by ``process``.
    k : int
        Maximum number of rows to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``sentiment`` (always ``'-'``), indexed from 0.
    """
    # NOTE(review): process() also removes rows rated 3, so those reviews
    # never reach the unlabelled pool - confirm that this is intended.
    unlabelled = process(df)
    unlabelled['sentiment'] = '-'
    unlabelled = unlabelled.dropna().reset_index(drop=True)
    # Every row carries the same placeholder label, so there is exactly one
    # group; sampling the frame directly is equivalent to grouping first.
    return sampling_elements(unlabelled, k).reset_index(drop=True)

In [206]:
def get_one_source(path1, path2, unlabelled_music):
    """Assemble a training set drawn from a single source domain.

    Parameters
    ----------
    path1 : str
        JSON Lines file with the labelled source reviews.
    path2 : str
        JSON Lines file with the unlabelled source reviews.
    unlabelled_music : pandas.DataFrame
        Frame appended after the source data.

    Returns
    -------
    pandas.DataFrame
        Labelled sample (up to 1050 rows per sentiment), then the unlabelled
        source sample (up to 4000 rows), then ``unlabelled_music``. Index
        labels of the parts are kept as they are.
    """
    labelled_raw = read_data(path1)
    unlabelled_raw = read_data(path2)
    parts = [
        get_labelled_data(labelled_raw, 1050),
        get_unlabelled_data(unlabelled_raw, 4000),
        unlabelled_music,
    ]
    return pd.concat(parts)

def get_two_sources(path1, path2, path3, path4, unlabelled_music):
    """Assemble a training set drawn from two source domains.

    Parameters
    ----------
    path1, path2 : str
        JSON Lines files with the labelled reviews of the two sources.
    path3, path4 : str
        JSON Lines files with the unlabelled reviews of the two sources.
    unlabelled_music : pandas.DataFrame
        Frame appended after the source data.

    Returns
    -------
    pandas.DataFrame
        Both labelled samples (up to 525 rows per sentiment each), then both
        unlabelled samples (up to 4000 rows each), then ``unlabelled_music``.
    """
    labelled_raw = [read_data(p) for p in (path1, path2)]
    unlabelled_raw = [read_data(p) for p in (path3, path4)]

    labelled_parts = [get_labelled_data(frame, 525) for frame in labelled_raw]
    unlabelled_parts = [get_unlabelled_data(frame, 4000) for frame in unlabelled_raw]

    return pd.concat(labelled_parts + unlabelled_parts + [unlabelled_music])

def get_three_sources(path1, path2, path3, path4, path5, path6, unlabelled_target):
    """Assemble a training set drawn from three source domains.

    Parameters
    ----------
    path1, path2, path3 : str
        JSON Lines files with the labelled reviews of the three sources.
    path4, path5, path6 : str
        JSON Lines files with the unlabelled reviews of the three sources,
        in the same order.
    unlabelled_target : pandas.DataFrame
        Frame appended after the source data.

    Returns
    -------
    pandas.DataFrame
        The three labelled samples (up to 350 rows per sentiment each), then
        the three unlabelled samples (up to 4000 rows each), then
        ``unlabelled_target``.
    """
    df1 = read_data(path1)
    df2 = read_data(path2)
    df3 = read_data(path3)
    unlabelled1 = read_data(path4)
    unlabelled2 = read_data(path5)
    unlabelled3 = read_data(path6)

    first_source = get_labelled_data(df1, 350)
    second_source = get_labelled_data(df2, 350)
    third_source = get_labelled_data(df3, 350)

    unlabelled_first = get_unlabelled_data(unlabelled1, 4000)
    unlabelled_second = get_unlabelled_data(unlabelled2, 4000)
    # The third pool must come from the third unlabelled file (path6);
    # sampling unlabelled2 again would duplicate the second domain and leave
    # the third one unrepresented.
    unlabelled_third = get_unlabelled_data(unlabelled3, 4000)

    three_sources = pd.concat([first_source, second_source, third_source, unlabelled_first,
                               unlabelled_second, unlabelled_third, unlabelled_target])
    return three_sources

def get_target(path1, path2):
    """Load the labelled and unlabelled data of the target domain.

    Parameters
    ----------
    path1 : str
        JSON Lines file with the labelled rows; it must contain a
        ``sentiment`` column.
    path2 : str
        JSON Lines file with raw unlabelled reviews; it must contain
        ``reviewText`` and ``summary`` columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``target``: rows of ``path1`` without nulls, at most 2100 per
        sentiment value, indexed from 0 (all columns of the file are kept).
        ``unlabelled``: columns ``text`` and ``sentiment`` (always ``'-'``),
        at most 10000 rows, indexed from 0.
    """
    target = read_data(path1).dropna()
    # Iterate over the groups instead of groupby(...).apply so that the
    # 'sentiment' column stays inside every sampled piece; recent pandas
    # deprecates handing the grouping column to the applied function.
    per_class = [
        sampling_elements(group, 2100)
        for _, group in target.groupby('sentiment')
    ]
    if per_class:
        target = pd.concat(per_class)
    target = target.reset_index(drop=True)

    raw = read_data(path2)
    text = raw['reviewText'] + ' ' + raw['summary']
    # Actually discard rows whose text is null (either part missing);
    # nothing downstream of this function removes them.
    unlabelled = text[text.notna()].to_frame(name='text').assign(sentiment='-')
    # A single constant label means a single group: sample the frame directly.
    unlabelled = sampling_elements(unlabelled, 10000).reset_index(drop=True)
    return target, unlabelled
    
    

In [207]:
# Load the music domain: `labelled_music` is the per-sentiment sample of the
# labelled file, `unlabelled_music` the '-'-labelled sample of raw reviews.
labelled_music, unlabelled_music = get_target('data/raw/music.json', 'data/raw/music_unlabelled.json')

In [208]:
# Split the labelled music rows in two by alternating rows:
# even positions go to `target_music`, odd positions to `source_music`.
target_music = pd.DataFrame({'text': labelled_music['text'].iloc[::2].values,
                             'sentiment': labelled_music['sentiment'].iloc[::2].values})

source_music = pd.DataFrame({'text': labelled_music['text'].iloc[1::2].values,
                             'sentiment': labelled_music['sentiment'].iloc[1::2].values})

# Keep at most 110 target rows per sentiment value. The groups are iterated
# directly (rather than via groupby(...).apply) so the 'sentiment' column
# stays in every piece; `or [target_music]` keeps an empty frame working,
# because pd.concat rejects an empty list.
target_music = pd.concat(
    [sampling_elements(group, 110) for _, group in target_music.groupby('sentiment')]
    or [target_music]
).reset_index(drop=True)

In [209]:
# Unlabelled pools (at most 4000 rows each) for the three source domains.
# NOTE(review): none of the later cells shown here reference these three
# frames; the get_*_source helpers read the files again themselves.
unlabelled_books = get_unlabelled_data(read_data('data/raw/books_unlabelled.json'), 4000)

unlabelled_electronics = get_unlabelled_data(read_data('data/raw/electronics_unlabelled.json'), 4000)

unlabelled_pet = get_unlabelled_data(read_data('data/raw/pet_supplies_unlabelled.json'), 4000)

# Cut the unlabelled music pool down to at most 4000 rows. Every row carries
# the same '-' label, so sampling the frame directly is equivalent to
# grouping by 'sentiment' first.
unlabelled_music = sampling_elements(unlabelled_music, 4000).reset_index(drop=True)

In [210]:
# Single-source training sets: books (B), electronics (E) and pet supplies (P),
# each followed by the shared unlabelled music rows.
source_B = get_one_source('data/raw/books.json', 'data/raw/books_unlabelled.json', unlabelled_music)
source_E = get_one_source('data/raw/electronics.json', 'data/raw/electronics_unlabelled.json', unlabelled_music)
source_P = get_one_source('data/raw/pet_supplies.json', 'data/raw/pet_supplies_unlabelled.json', unlabelled_music)

In [211]:
# Two-source training sets for every pair of source domains:
# books + pet supplies (BP), electronics + pet supplies (EP),
# books + electronics (BE).
source_BP = get_two_sources('data/raw/books.json', 'data/raw/pet_supplies.json', 'data/raw/books_unlabelled.json', 
                            'data/raw/pet_supplies_unlabelled.json', unlabelled_music)
source_EP = get_two_sources('data/raw/electronics.json', 'data/raw/pet_supplies.json', 
                            'data/raw/electronics_unlabelled.json',
                            'data/raw/pet_supplies_unlabelled.json', unlabelled_music)
source_BE = get_two_sources('data/raw/books.json', 'data/raw/electronics.json', 'data/raw/books_unlabelled.json', 
                            'data/raw/electronics_unlabelled.json', unlabelled_music)

In [212]:
# Three-source training set: books, electronics and pet supplies together,
# followed by the shared unlabelled music rows.
source_BEP = get_three_sources('data/raw/books.json', 'data/raw/electronics.json', 'data/raw/pet_supplies.json', 
                               'data/raw/books_unlabelled.json', 'data/raw/electronics_unlabelled.json', 
                               'data/raw/pet_supplies_unlabelled.json', unlabelled_music)

In [213]:
# Persist every source combination as a JSON Lines file under data/extracted/,
# in the order single-source, two-source, three-source.
for out_name, out_frame in [
    ('source_B.json', source_B),
    ('source_E.json', source_E),
    ('source_P.json', source_P),
    ('source_BP.json', source_BP),
    ('source_EP.json', source_EP),
    ('source_BE.json', source_BE),
    ('source_BEP.json', source_BEP),
]:
    write_file('data/extracted/', out_name, out_frame)

In [214]:
# Persist the two halves of the labelled music data next to the source sets.
write_file('data/extracted/', 'target_music.json', target_music)
write_file('data/extracted/', 'source_music.json', source_music)

-----------------