In [7]:
import csv
import pandas as pd

In [5]:
# Collect one row per sentence from the phrase-level training file: a row is
# kept whenever its second field (the sentence id) differs from the sentence
# id of the previously kept row, i.e. the first row of every consecutive run
# of equal ids.
# NOTE(review): presumably that first row holds the complete sentence (the
# file written from this data is named "full_phrases_only") -- confirm.
with open('extra_data/sentiment-analysis-on-movie-reviews/train.tsv', 'r', encoding="utf-8-sig") as tsv_file:
    lines = []
    last_sid = '0'  # sentinel: a leading run with sentence id '0' would be skipped
    tsv_rows = csv.reader(tsv_file, delimiter="\t")
    next(tsv_rows)  # discard the header row
    for row in tsv_rows:
        if row[1] == last_sid:
            continue
        last_sid = row[1]
        lines.append(row)

In [8]:
# Wrap the collected rows in a DataFrame. Every value is still a string here,
# exactly as csv.reader produced it.
df = pd.DataFrame(
    data=lines,
    columns=['pid', 'sid', 'text', 'sentiment'],
)

In [9]:
# Preview the first five sentence-level rows.
df.head(n=5)

Unnamed: 0,pid,sid,text,sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,64,2,"This quiet , introspective and entertaining in...",4
2,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1
3,117,4,A positively thrilling combination of ethnogra...,3
4,157,5,Aggressive self-glorification and a manipulati...,1


In [10]:
# Persist the sentence-level rows as a tab-separated file without the row index.
df.to_csv(
    'extra_data/sentiment-analysis-on-movie-reviews/train_full_phrases_only.csv',
    index=False,
    sep='\t',
)

In [15]:
# The ids and the label were read as strings; convert them to integers so the
# label can be used as a key of the integer-keyed mapping below.
df = df.astype(dict(pid='int64', sid='int64', sentiment='int32'))

In [17]:
# Two label schemes are reconciled here.
#
# Kaggle (source):
#   0 - negative
#   1 - somewhat negative
#   2 - neutral
#   3 - somewhat positive
#   4 - positive
#
# Innoplexus (target):
#   0 - positive
#   1 - negative
#   2 - neutral
#
# Both positive grades collapse to 0, both negative grades collapse to 1,
# and neutral keeps the value 2.
mapping = {
    4: 0,  # positive          -> positive
    3: 0,  # somewhat positive -> positive
    2: 2,  # neutral           -> neutral
    1: 1,  # somewhat negative -> negative
    0: 1,  # negative          -> negative
}

In [18]:
def _to_inno_label(kaggle_label):
    """Return the Innoplexus label for a Kaggle label; raises KeyError if unmapped."""
    return mapping[kaggle_label]


# Add the translated label next to the original one.
df['inno_sentiment'] = df['sentiment'].apply(_to_inno_label)

In [19]:
# Check the translated labels against the original ones on the first five rows.
df.head(n=5)

Unnamed: 0,pid,sid,text,sentiment,inno_sentiment
0,1,1,A series of escapades demonstrating the adage ...,1,1
1,64,2,"This quiet , introspective and entertaining in...",4,0
2,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1,1
3,117,4,A positively thrilling combination of ethnogra...,3,0
4,157,5,Aggressive self-glorification and a manipulati...,1,1


In [32]:
# Independent copy holding only the sentence id, the text and the translated
# label, so later column edits do not touch `df`.
df_inno = df.loc[:, ['sid', 'text', 'inno_sentiment']].copy()

In [33]:
# Preview the reduced frame.
df_inno.head(n=5)

Unnamed: 0,sid,text,inno_sentiment
0,1,A series of escapades demonstrating the adage ...,1
1,2,"This quiet , introspective and entertaining in...",0
2,3,"Even fans of Ismail Merchant 's work , I suspe...",1
3,4,A positively thrilling combination of ethnogra...,0
4,5,Aggressive self-glorification and a manipulati...,1


In [34]:
# Save the reduced frame (sid, text, inno_sentiment) tab-separated, without
# the row index.
df_inno.to_csv(
    'extra_data/sentiment-analysis-on-movie-reviews/train_full_phrases_only_inno_format.csv',
    sep='\t',
    index=False,
)

In [35]:
# Class balance of the translated labels. Uses the Series method because the
# top-level `pd.value_counts` function is deprecated in recent pandas releases.
df_inno['inno_sentiment'].value_counts()

0    3602
1    3272
2    1655
Name: inno_sentiment, dtype: int64

In [36]:
# Load the comma-separated training file from the working directory; the
# movie-review rows are appended to it further down.
inno_train = pd.read_csv('train.csv', sep=',')

In [37]:
# Inspect the columns of the loaded training file.
inno_train.head(n=5)

Unnamed: 0,unique_hash,text,drug,sentiment
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1


In [38]:
# `inno_train` carries a `drug` column; give every movie-review row the same
# constant placeholder value so both frames end up with the same columns.
df_inno.loc[:, 'drug'] = 'mango'

In [39]:
# Confirm the placeholder column was added.
df_inno.head(n=5)

Unnamed: 0,sid,text,inno_sentiment,drug
0,1,A series of escapades demonstrating the adage ...,1,mango
1,2,"This quiet , introspective and entertaining in...",0,mango
2,3,"Even fans of Ismail Merchant 's work , I suspe...",1,mango
3,4,A positively thrilling combination of ethnogra...,0,mango
4,5,Aggressive self-glorification and a manipulati...,1,mango


In [40]:
# Put the columns in the same positional order as `inno_train`
# (id, text, drug, label); the relabelling further down is positional.
column_order = ['sid', 'text', 'drug', 'inno_sentiment']
df_inno = df_inno[column_order]

In [41]:
# Confirm the new column order.
df_inno.head(n=5)

Unnamed: 0,sid,text,drug,inno_sentiment
0,1,A series of escapades demonstrating the adage ...,mango,1
1,2,"This quiet , introspective and entertaining in...",mango,0
2,3,"Even fans of Ismail Merchant 's work , I suspe...",mango,1
3,4,A positively thrilling combination of ethnogra...,mango,0
4,5,Aggressive self-glorification and a manipulati...,mango,1


In [48]:
# Relabel the columns position by position with the names used by
# `inno_train`, so that `sid` becomes `unique_hash` and `inno_sentiment`
# becomes `sentiment`. This relies on both frames listing their columns in
# the same order.
df_inno.columns = inno_train.columns.tolist()

In [49]:
# Confirm the frame now uses the `inno_train` column names.
df_inno.head(n=5)

Unnamed: 0,unique_hash,text,drug,sentiment
0,1,A series of escapades demonstrating the adage ...,mango,1
1,2,"This quiet , introspective and entertaining in...",mango,0
2,3,"Even fans of Ismail Merchant 's work , I suspe...",mango,1
3,4,A positively thrilling combination of ethnogra...,mango,0
4,5,Aggressive self-glorification and a manipulati...,mango,1


In [50]:
# Stack the movie-review rows underneath the original training rows.
# `ignore_index=True` gives the result a fresh 0..n-1 row index; without it
# both inputs keep their own labels starting at 0, so the combined frame has
# duplicate index labels and label-based lookups such as `.loc[0]` return
# several rows.
combined = pd.concat([inno_train, df_inno], axis=0, ignore_index=True)

In [51]:
# Preview the stacked frame; the first rows come from `inno_train`.
combined.head(n=5)

Unnamed: 0,unique_hash,text,drug,sentiment
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1


In [53]:
# Share of each label in the combined data. Uses the Series method because
# the top-level `pd.value_counts` function is deprecated in recent pandas
# releases. The explicit division keeps the denominator equal to the total
# number of rows.
combined['sentiment'].value_counts() / combined.shape[0]

2    0.396871
0    0.305548
1    0.297581
Name: sentiment, dtype: float64

In [54]:
# Write the combined training data tab-separated, without the row index.
combined.to_csv(
    'combined_train.tsv',
    index=False,
    sep='\t',
)

In [55]:
# Lowercase every text and replace the row's own drug name in it with one
# common placeholder word.
# The text is lowercased before the replacement, so the drug name has to be
# lowercased as well; otherwise a drug value containing capital letters
# could never match the lowercased text and would be left unmasked.
word = 'mango'
combined['text'] = combined.apply(
    lambda x: x['text'].lower().replace(x['drug'].lower(), word),
    axis=1,
)

In [56]:
# Write the variant with lowercased, drug-masked text tab-separated, without
# the row index.
combined.to_csv(
    'combined_train_mango.tsv',
    index=False,
    sep='\t',
)