In [1]:
import pandas as pd
import numpy as np
import json
import random
import re

In [2]:
#read data
df_position_sentiment = pd.read_csv(r'../data/qb/processed/combine_aspect_position_0325_clean.csv')

In [3]:
df_position_sentiment['Asp Sentiment'].replace('0', np.nan, inplace=True)
df_position_sentiment.dropna(subset=['Asp Sentiment'], inplace=True)

In [4]:
df_position_sentiment.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Review ID,Country,Version,Rating,Date,Doc Sentiment,Asp Sentiment,Review,Aspects,AspectText,Positions
0,0,0,5464235433,Canada,20.01.5,1,1/30/20,negative,neutral,latest app update . this update is terrible ba...,app,app,79
1,1,1,5464235433,Canada,20.01.5,1,1/30/20,negative,negative,latest app update . this update is terrible ba...,updates,update,1116
2,2,2,5466985498,USA,20.01.5,5,1/30/20,positive,positive,life saver . one my favorite apps to use while...,app,apps,2831
3,3,3,5460552711,USA,20.01.4,1,1/29/20,negative,negative,worst update yet . just deleted . crashed . di...,updates,update,611
4,4,4,5460552711,USA,20.01.4,1,1/29/20,negative,negative,worst update yet . just deleted . crashed . di...,experience,deleted,2329


In [6]:
#drop reviews without aspect text
print(len(df_position_sentiment))
df_position_sentiment.dropna(subset=['AspectText'], inplace=True)
print(len(df_position_sentiment))

14821
14821


In [7]:
#Check allowed category of sentiments
df_position_sentiment['Asp Sentiment'].unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [9]:
df_positive = df_position_sentiment[df_position_sentiment['Rating'] == 5.0]

In [10]:
len(df_positive)

6778

In [11]:
df_negative = df_position_sentiment[df_position_sentiment['Rating'] == 1.0]

In [12]:
len(df_negative)

4657

In [13]:
(df_positive['Review'].str.len() < 250).sum()

5839

In [14]:
positive_idx = []

for index, row in df_positive.iterrows():
    positive_idx.append(index)

In [15]:
positive_aspect = df_positive.loc[positive_idx[5], :]['Asp Sentiment']
positive_aspect

'neutral'

In [16]:
#Append positive and negative reviews with various aspects 

df_mix = pd.DataFrame(columns=['Review ID', 
                                       'Country', 'Version', 
                                       'Rating', 'Date', 'Doc Sentiment', 'Asp Sentiment', 
                                       'Review', 'Aspects', 'AspectText', 'Positions'])

idx_counter = 0

for index, row in df_negative.iterrows():

    positive_aspect = df_positive.loc[positive_idx[idx_counter], :]['Asp Sentiment']
    review = str(df_positive.loc[positive_idx[idx_counter], :]['Review']) + row['Review']
    
    start_idx = review.find(str(df_positive.loc[positive_idx[idx_counter], :]['AspectText']))
          
    position = str(start_idx) + ',' + str(start_idx + len(df_positive.loc[positive_idx[idx_counter], :]['AspectText']) - 1)
    
    #positive
    df_mix = df_mix.append({
        'Review ID': row['Review ID'], 
        'Country': row['Country'], 
        'Version': row['Version'],
        'Rating': row['Rating'],
        'Date': row['Date'],            
        'Doc Sentiment': row['Doc Sentiment'],
        'Asp Sentiment' : row['Asp Sentiment'],
        'Review': review,
        'AspectText': df_positive.loc[positive_idx[idx_counter], :]['AspectText'],
        'Aspects': df_positive.loc[positive_idx[idx_counter], :]['Aspects'],
        'Positions': position
    }, ignore_index=True)
    
    start_idx = review.find(row['AspectText'])          
    position = str(start_idx) + ',' + str(start_idx + len(row['AspectText']) - 1)
    
    df_mix = df_mix.append({
        'Review ID': row['Review ID'], 
        'Country': row['Country'], 
        'Version': row['Version'],
        'Rating': row['Rating'],
        'Date': row['Date'],            
        'Doc Sentiment': row['Doc Sentiment'],
        'Asp Sentiment' : row['Asp Sentiment'],
        'Review': review,
        'AspectText': row['AspectText'],
        'Aspects': row['Aspects'],
        'Positions': position
    }, ignore_index=True)
    
    idx_counter += 1

In [17]:
df_mix.Review.map(len).max()

948

In [18]:
(df_mix['Review'].str.len() < 300).sum()

3492

In [19]:
len(df_mix)

9314

In [20]:
df_mix.head()

Unnamed: 0,Review ID,Country,Version,Rating,Date,Doc Sentiment,Asp Sentiment,Review,Aspects,AspectText,Positions
0,5464235433,Canada,20.01.5,1,1/30/20,negative,neutral,life saver . one my favorite apps to use while...,app,apps,2932
1,5464235433,Canada,20.01.5,1,1/30/20,negative,neutral,life saver . one my favorite apps to use while...,app,app,2931
2,5464235433,Canada,20.01.5,1,1/30/20,negative,negative,great app for small business owner ! . i have ...,app,app,68
3,5464235433,Canada,20.01.5,1,1/30/20,negative,negative,great app for small business owner ! . i have ...,updates,update,161166
4,5460552711,USA,20.01.4,1,1/29/20,negative,negative,"it just work . see above . in the office , on ...",app,software,196203


In [21]:
#split data in - train test and dev
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    
    perm = np.random.permutation(np.arange(len(df)))
    m = len(df)
    
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end - 1] - 1]
    validate = df.iloc[perm[train_end:validate_end - 1] - 1]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [22]:
train, validate, test = train_validate_test_split(df_mix)

In [23]:
train.head()

Unnamed: 0,Review ID,Country,Version,Rating,Date,Doc Sentiment,Asp Sentiment,Review,Aspects,AspectText,Positions
2295,1696376299,USA,5.4.3,1,7/24/17,negative,negative,love it . my entire business is conducted thro...,quality,fix asap.,-17
2430,1630708616,Canada,5.4,1,6/1/17,negative,negative,great tool for self employed small business ow...,experience,navigate,7683
6985,gp:AOqpTOFher_1rklTZnN2BKwHo6ZQ-EmST6lHeEbZEk2...,English,18.10.2,1,11/29/18,negative,negative,. quickbooks is amazing . i use it for my comp...,customer service,customer service,243258
5797,gp:AOqpTOElmPzTfQonG-5L_GVPvEvrJ2NoBf4Sj17r4Gw...,English,19.8.0.2,1,8/26/19,negative,negative,work great . business on the go ! i wish there...,quality,reinstalled,125135
824,4502852112,USA,19.7,1,7/22/19,negative,negative,nice app . work better than the online version...,app,app,57


In [24]:
len(train)

5587

### Generate files

In [25]:
def generate_input_files(d_type, data):

    #save reviews
    df_review = data[['Review']]
    df_review.to_csv(r'../data/qb/{}/review.txt'.format(d_type), header=None, index=None, sep='\n', mode='a')

    #save aspects
    df_review = data[['AspectText']]
    df_review.to_csv(r'../data/qb/{}/term.txt'.format(d_type), header=None, index=None, sep='\n', mode='a')

    #save positions
    df_review = data[['Positions']]
    df_review.to_csv(r'../data/qb/{}/position.txt'.format(d_type), header=None, index=None, sep='\n', mode='a')

    #save label
    df_review = data[['Asp Sentiment']]
    df_review.to_csv(r'../data/qb/{}/label.txt'.format(d_type), header=None, index=None, sep='\n', mode='a')

In [35]:
generate_input_files('train', train)
generate_input_files('test', test)
generate_input_files('dev', validate)