In [55]:
import pandas as pd
import numpy as np
import nltk
import string
from tqdm import tqdm
import json
import os

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,confusion_matrix,accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn import metrics

In [57]:
def read_data(path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the JSON Lines file, passed straight to ``pd.read_json``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file.
    """
    return pd.read_json(path, lines=True)

def write_file(path, file, text):
    """Write an iterable of JSON-serialisable objects as a JSON Lines file.

    Parameters
    ----------
    path : str
        Directory that will contain the file. It is created (including
        parents) when it does not exist yet.
    file : str
        Name of the file inside ``path``. An existing file is overwritten.
    text : iterable
        Objects to serialise; each one becomes a single line of JSON.

    Raises
    ------
    TypeError
        Propagated from ``json.dumps`` when an object is not serialisable.
    """
    # os.makedirs('') raises, so only create the directory for a non-empty path.
    if path:
        os.makedirs(path, exist_ok=True)
    pathname = os.path.join(path, file)
    # The context manager closes the handle even if serialisation fails midway.
    with open(pathname, 'w') as outfile:
        for instance in text:
            outfile.write(json.dumps(instance) + '\n')
    
def listToString(df, column):
    """Flatten each token list in ``df[column]`` into one space-separated string.

    For every entry the tokens are joined with ', ', the result is split on
    whitespace, every chunk containing a digit is dropped, the remaining
    chunks are joined with single spaces and finally all commas are removed.

    Parameters
    ----------
    df : mapping-like (e.g. pandas.DataFrame)
        Container indexed by ``column``.
    column : str
        Name of the column holding iterables of string tokens.

    Returns
    -------
    list of str
        One cleaned string per entry of ``df[column]``, in the same order.
    """
    def _flatten(tokens):
        joined = ', '.join(tokens)
        digit_free = [
            chunk for chunk in joined.split()
            if not any(ch.isdigit() for ch in chunk)
        ]
        return ' '.join(digit_free).replace(',', '')

    return [_flatten(tokens) for tokens in df[column]]

In [58]:
def training_vectorising(source, target):
    """Train a bag-of-words logistic regression on ``source`` and score it on ``target``.

    Parameters
    ----------
    source : pandas.DataFrame
        Training data. Must provide a 'train' column with the text of each
        sample and a 'sentiment' column that can be cast to int.
    target : pandas.DataFrame
        Evaluation data with the same 'train' and 'sentiment' columns.

    Returns
    -------
    float
        Accuracy of the trained model on ``target``.

    Notes
    -----
    Precision, recall, F1 score and the positional indices of the
    misclassified target samples are printed as a side effect.
    Precision, recall and F1 are all computed with ``average='binary'``
    (positive class 1), so the three numbers describe the same class.
    """
    # The vocabulary (unigrams and bigrams, English stop words removed) is
    # fitted on the source texts only; the target is merely transformed.
    cv = CountVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
    X_train_cv = cv.fit_transform(source['train'])
    X_test_cv = cv.transform(target['train'])
    Y_train = source['sentiment'].astype('int')
    Y_test = target['sentiment'].astype('int')

    # Train the classifier and predict the target labels.
    model = LogisticRegression(solver='lbfgs', max_iter=1000)
    model.fit(X_train_cv, Y_train)
    test_yhat = model.predict(X_test_cv)

    # Evaluate on the target set.
    accuracy = accuracy_score(Y_test, test_yhat)
    precision = precision_score(Y_test, test_yhat, average='binary')
    recall = recall_score(Y_test, test_yhat, average='binary')
    score = f1_score(Y_test, test_yhat, average='binary')
    # Compare position by position so the result holds row positions in
    # ``target`` regardless of its index labels.
    misclassified_samples = np.flatnonzero(np.asarray(Y_test) != test_yhat)

    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", score)
    print(misclassified_samples)
    return accuracy


### Single Source Proposed Method

In [59]:
# Proposed method, single source "B" -> music target.
# The proposed files are used as loaded (no listToString step in this cell).
target_music_B = read_data('data/proposed/target_music_B.json')
source_B = read_data('data/proposed/source_B.json')
# Drop rows whose sentiment is '-': they cannot be cast to int for training.
source_B = source_B.loc[source_B['sentiment'] != '-']
acc_B = training_vectorising(source=source_B, target=target_music_B)

Precision:  0.8899082568807339
Recall:  0.8818181818181818
F1 Score:  0.8858447488584476
[  7  16  20  44  52  53  57  63  65  74  87  94 110 118 121 136 137 148
 150 162 198 206 207 211 215]


In [60]:
# Proposed method, single source "E" -> music target.
target_music_E = read_data('data/proposed/target_music_E.json')
source_E = read_data('data/proposed/source_E.json')
# Drop rows whose sentiment is '-': they cannot be cast to int for training.
source_E = source_E.loc[source_E['sentiment'] != '-']
acc_E = training_vectorising(source=source_E, target=target_music_E)

Precision:  0.8260869565217391
Recall:  0.8636363636363636
F1 Score:  0.8444444444444444
[  2  12  18  20  24  28  32  39  40  42  56  63  64  73  79  87  93  98
  99 105 115 121 134 136 137 146 149 156 171 176 183 206 207 211 214]


In [61]:
# Proposed method, single source "P" -> music target.
target_music_P = read_data('data/proposed/target_music_P.json')
source_P = read_data('data/proposed/source_P.json')
# Drop rows whose sentiment is '-': they cannot be cast to int for training.
source_P = source_P.loc[source_P['sentiment'] != '-']
acc_P = training_vectorising(source=source_P, target=target_music_P)

Precision:  0.8673469387755102
Recall:  0.7727272727272727
F1 Score:  0.8173076923076923
[  0   1  32  34  38  39  40  59  63  79  93  94 108 115 118 119 121 134
 145 146 149 156 162 163 165 166 171 176 178 191 198 199 206 207 210 212
 214 218]


### Two Sources Proposed Method

In [62]:
# Proposed method, two sources "E" + "P" -> music target.
target_music_EP = read_data('data/proposed/target_music_EP.json')
source_EP = read_data('data/proposed/source_EP.json')
# Drop rows whose sentiment is '-': they cannot be cast to int for training.
source_EP = source_EP.loc[source_EP['sentiment'] != '-']
acc_EP = training_vectorising(source=source_EP, target=target_music_EP)

Precision:  0.8990825688073395
Recall:  0.8909090909090909
F1 Score:  0.8949771689497716
[  0  17  18  32  40  57  78  79  94  99 101 115 121 134 146 149 156 162
 171 176 206 207 215]


In [63]:
# Proposed method, two sources "B" + "P" -> music target.
target_music_BP = read_data('data/proposed/target_music_BP.json')
source_BP = read_data('data/proposed/source_BP.json')
# Drop rows whose sentiment is '-': they cannot be cast to int for training.
source_BP = source_BP.loc[source_BP['sentiment'] != '-']
acc_BP = training_vectorising(source=source_BP, target=target_music_BP)

Precision:  0.8348623853211009
Recall:  0.8272727272727273
F1 Score:  0.8310502283105023
[  0   2   3   7  10  14  30  32  40  41  45  63  72  78  86  93  98 105
 110 115 118 121 134 137 146 149 156 162 171 175 176 198 206 207 210 215
 218]


In [64]:
# Proposed method, two sources "B" + "E" -> music target.
target_music_BE = read_data('data/proposed/target_music_BE.json')
source_BE = read_data('data/proposed/source_BE.json')
# Drop rows whose sentiment is '-': they cannot be cast to int for training.
source_BE = source_BE.loc[source_BE['sentiment'] != '-']
acc_BE = training_vectorising(source=source_BE, target=target_music_BE)

Precision:  0.8666666666666667
Recall:  0.8272727272727273
F1 Score:  0.8465116279069769
[  0   7  24  34  44  55  62  79  94  96  97  98  99 105 110 115 119 121
 123 134 136 137 146 149 156 160 162 176 195 206 211 214 215]


### Multiple Sources Proposed Method

In [65]:
# Proposed method, all three sources "B" + "E" + "P" -> music target.
target_music_BEP = read_data('data/proposed/target_music_BEP.json')
source_BEP = read_data('data/proposed/source_BEP.json')
# Drop rows whose sentiment is '-': they cannot be cast to int for training.
source_BEP = source_BEP.loc[source_BEP['sentiment'] != '-']
acc_BEP = training_vectorising(source=source_BEP, target=target_music_BEP)

Precision:  0.8962264150943396
Recall:  0.8636363636363636
F1 Score:  0.8796296296296295
[  0  32  39  40  48  49  57  79  93  94  99 110 114 115 121 134 146 149
 156 162 163 176 183 206 210 214]


### Single Source Baseline

In [66]:
# Baseline target set: the 'train' text column is built from the 'unigrams'
# token lists. The baseline experiment cells below evaluate against this frame.
noadaption_target = read_data('data/processed/target_music.json')
noadaption_target = noadaption_target.assign(
    train=listToString(noadaption_target, 'unigrams')
)

In [67]:
# Baseline (no adaptation), single source "B" (books) -> music target.
noadaption_books = read_data('data/processed/source_B.json')
noadaption_books = (
    noadaption_books
    # Build the 'train' text column from the 'unigrams' token lists.
    .assign(train=listToString(noadaption_books, 'unigrams'))
    # Drop rows whose sentiment is '-': they cannot be cast to int.
    .loc[lambda frame: frame['sentiment'] != '-']
)
acc_B_noadaption = training_vectorising(source=noadaption_books, target=noadaption_target)

Precision:  0.6533333333333333
Recall:  0.8909090909090909
F1 Score:  0.7538461538461538
[  0   4   5   7  11  12  14  18  19  20  21  23  24  26  28  32  34  39
  40  44  45  47  48  49  52  55  57  61  64  67  69  72  73  74  76  79
  80  84  85  87  88  89  91  92  93  94  96  98  99 105 107 108 110 136
 137 148 162 171 195 198 206 207 210 211]


In [68]:
# Baseline (no adaptation), single source "P" (pet) -> music target.
noadaption_pet = read_data('data/processed/source_P.json')
noadaption_pet = (
    noadaption_pet
    # Build the 'train' text column from the 'unigrams' token lists.
    .assign(train=listToString(noadaption_pet, 'unigrams'))
    # Drop rows whose sentiment is '-': they cannot be cast to int.
    .loc[lambda frame: frame['sentiment'] != '-']
)
acc_P_noadaption = training_vectorising(source=noadaption_pet, target=noadaption_target)

Precision:  0.7054263565891473
Recall:  0.8272727272727273
F1 Score:  0.7615062761506277
[  0   1   2   4   8  10  12  13  16  21  23  24  26  32  34  39  40  46
  55  57  63  64  67  69  72  76  79  85  88  89  92  93  94  96  98  99
 105 108 115 121 134 136 146 152 156 162 171 176 192 195 196 198 206 207
 210 211 218]


In [69]:
# Baseline (no adaptation), single source "E" (electronics) -> music target.
noadaption_electronics = read_data('data/processed/source_E.json')
noadaption_electronics = (
    noadaption_electronics
    # Build the 'train' text column from the 'unigrams' token lists.
    .assign(train=listToString(noadaption_electronics, 'unigrams'))
    # Drop rows whose sentiment is '-': they cannot be cast to int.
    .loc[lambda frame: frame['sentiment'] != '-']
)
acc_E_noadaption = training_vectorising(source=noadaption_electronics, target=noadaption_target)

Precision:  0.6838235294117647
Recall:  0.8454545454545455
F1 Score:  0.7560975609756098
[  0   2   7   8  10  12  13  20  21  23  24  25  26  28  32  34  37  39
  40  48  56  57  63  64  67  72  73  74  75  76  79  85  88  89  92  93
  94  98  99 101 104 105 107 117 118 121 134 136 146 148 149 156 162 171
 183 184 185 192 198 207]


### Two Sources Baseline

In [70]:
# Baseline (no adaptation), two sources "E" + "P" -> music target.
noadaption_EP = read_data('data/processed/source_EP.json')
noadaption_EP = (
    noadaption_EP
    # Build the 'train' text column from the 'unigrams' token lists.
    .assign(train=listToString(noadaption_EP, 'unigrams'))
    # Drop rows whose sentiment is '-': they cannot be cast to int.
    .loc[lambda frame: frame['sentiment'] != '-']
)
acc_EP_noadaption = training_vectorising(source=noadaption_EP, target=noadaption_target)

Precision:  0.7165354330708661
Recall:  0.8272727272727273
F1 Score:  0.7679324894514767
[  0   2   5   8  12  13  18  21  23  26  32  34  39  40  42  55  57  63
  64  67  69  72  75  76  79  85  88  89  93  94  96  98  99 102 105 107
 119 121 134 136 146 149 156 157 162 171 176 183 192 195 198 206 207 210
 218]


In [71]:
# Baseline (no adaptation), two sources "B" + "E" -> music target.
noadaption_BE = read_data('data/processed/source_BE.json')
noadaption_BE = (
    noadaption_BE
    # Build the 'train' text column from the 'unigrams' token lists.
    .assign(train=listToString(noadaption_BE, 'unigrams'))
    # Drop rows whose sentiment is '-': they cannot be cast to int.
    .loc[lambda frame: frame['sentiment'] != '-']
)
acc_BE_noadaption = training_vectorising(source=noadaption_BE, target=noadaption_target)

Precision:  0.676923076923077
Recall:  0.8
F1 Score:  0.7333333333333334
[  0   4   5   7  11  12  21  23  24  26  28  30  34  37  39  40  43  44
  46  48  49  52  55  57  63  64  67  72  73  74  76  85  88  89  92  93
  94  96  99 104 105 107 110 121 123 134 136 137 148 150 154 156 160 162
 171 176 192 195 196 198 206 210 214 218]


In [72]:
# Baseline (no adaptation), two sources "B" + "P" -> music target.
noadaption_BP = read_data('data/processed/source_BP.json')
noadaption_BP = (
    noadaption_BP
    # Build the 'train' text column from the 'unigrams' token lists.
    .assign(train=listToString(noadaption_BP, 'unigrams'))
    # Drop rows whose sentiment is '-': they cannot be cast to int.
    .loc[lambda frame: frame['sentiment'] != '-']
)
acc_BP_noadaption = training_vectorising(source=noadaption_BP, target=noadaption_target)

Precision:  0.7101449275362319
Recall:  0.8909090909090909
F1 Score:  0.7903225806451614
[  0   2   5   7  12  16  21  23  24  26  28  30  32  34  39  40  48  49
  52  55  57  63  64  65  67  72  73  74  76  84  85  88  89  92  93  94
  96  99 105 107 123 134 136 162 171 192 196 198 207 210 211 215]


### Multiple Sources Baseline

In [73]:
# Baseline (no adaptation), all three sources "B" + "E" + "P" -> music target.
noadaption_BEP = read_data('data/processed/source_BEP.json')
noadaption_BEP = (
    noadaption_BEP
    # Build the 'train' text column from the 'unigrams' token lists.
    .assign(train=listToString(noadaption_BEP, 'unigrams'))
    # Drop rows whose sentiment is '-': they cannot be cast to int.
    .loc[lambda frame: frame['sentiment'] != '-']
)
acc_BEP_noadaption = training_vectorising(source=noadaption_BEP, target=noadaption_target)

Precision:  0.7076923076923077
Recall:  0.8363636363636363
F1 Score:  0.7666666666666666
[  0   2   5  11  12  21  23  24  26  28  32  34  39  40  44  55  57  62
  63  64  67  69  72  73  76  79  85  88  89  91  92  93  94  96  99 104
 105 107 110 134 135 136 150 157 162 171 176 183 184 195 198 200 206 207
 210 211]


### In-domain Baseline 

In [74]:
# In-domain baseline: music source -> music target, both built from 'unigrams'.
# The target is reloaded here so this cell does not depend on the earlier
# baseline cells having been run.
noadaption_target = read_data('data/processed/target_music.json')
noadaption_target = noadaption_target.assign(
    train=listToString(noadaption_target, 'unigrams')
)

noadaption_music = read_data('data/processed/source_music.json')
noadaption_music = (
    noadaption_music
    .assign(train=listToString(noadaption_music, 'unigrams'))
    # Drop rows whose sentiment is '-': they cannot be cast to int.
    .loc[lambda frame: frame['sentiment'] != '-']
)
acc_music_noadaption = training_vectorising(source=noadaption_music, target=noadaption_target)

  res_values = method(rvalues)


Precision:  0.7540983606557377
Recall:  0.8363636363636363
F1 Score:  0.793103448275862
[  0   7  12  16  21  23  26  32  34  39  40  52  55  57  63  64  67  72
  76  79  85  89  91  92  94  96  98  99 105 107 123 135 136 137 146 150
 157 160 162 171 175 183 195 198 202 206 207 210]


### In-domain Proposed

In [75]:
# In-domain proposed: music source -> music target.
# NOTE(review): unlike the other experiment cells, no "sentiment != '-'" filter
# is applied to the source here - confirm the file contains no such rows.
target_music = read_data('data/proposed/target_music.json')
source_music = read_data('data/proposed/source_music.json')
acc_music = training_vectorising(source=source_music, target=target_music)

Precision:  0.8738738738738738
Recall:  0.8818181818181818
F1 Score:  0.8778280542986425
[  7  17  24  28  34  37  40  52  53  57  72  94  98 105 121 136 148 156
 157 162 171 195 202 206 207 210 213]


### Saving Results 

In [76]:
# Model identifiers, in the same order as the accuracy lists below.
model_ids_proposed = ['B', 'E', 'P', 'EP', 'BE', 'BP', 'BEP', 'In-domain']
# Same identifiers for the baseline column (an independent list, not an alias).
model_ids_baseline = list(model_ids_proposed)

# Target-set accuracies returned by training_vectorising for each experiment.
accuracy_proposed = [
    acc_B, acc_E, acc_P,
    acc_EP, acc_BE, acc_BP,
    acc_BEP,
    acc_music,
]
accuracy_baseline = [
    acc_B_noadaption, acc_E_noadaption, acc_P_noadaption,
    acc_EP_noadaption, acc_BE_noadaption, acc_BP_noadaption,
    acc_BEP_noadaption,
    acc_music_noadaption,
]

In [77]:
# Collect the per-model accuracies into one table: one row per model ID, with
# the proposed and baseline accuracy side by side.
Data = {'Proposed Model ID': model_ids_proposed, 'Baseline Model ID': model_ids_baseline,
        'Accuracy Proposed': accuracy_proposed, 'Accuracy Baseline': accuracy_baseline}
df = pd.DataFrame(Data, columns=['Proposed Model ID', 'Accuracy Proposed',
                                 'Baseline Model ID', 'Accuracy Baseline'])
# Convert to a list of row dicts so write_file emits one JSON object per line.
# The orient must be spelled 'records'; the abbreviation 'record' was
# deprecated and is rejected by pandas >= 2.0.
df = df.to_dict(orient='records')
write_file('results/', 'classification_experiment.json', df)

---------------

### Test 0

In [44]:
# Print the first two 'train' texts of the proposed in-domain source.
for text in source_music['train'].iloc[:2]:
    print(text)

shmae lot artist rioight coming song feel like act shell made good year ago sadly case nickelback welldriven hit right reason like far away saving latest single doesnt feel like cut mustard got ta somebody latest single latest record dark horse doesnt feel like anything spectacular different dissapointment honestly hoping something different chad kroger company didnt strike chord way coldplay shown recently better song nickelback future sunnier dark horse song dnot somebody sounded like one song good music much better new song strong wrong note youre also
another cheap thin cd wrapped cardboard cheap send jewel caseanother cheap thin cd wrapped cardboard yet cheap okay wanted disc


In [45]:
# Print the first two 'train' texts of the baseline in-domain source.
for text in noadaption_music['train'].iloc[:2]:
    print(text)

shmae lot artist rioight coming song feel like act shell made good year ago sadly case nickelback welldriven hit right reason like far away saving latest single doesnt feel like cut mustard got ta somebody latest single latest record dark horse doesnt feel like anything spectacular different dissapointment honestly hoping something different chad kroger company didnt strike chord way coldplay shown recently better song nickelback future sunnier dark horse song dnot somebody
another cheap thin cd wrapped cardboard cheap send jewel caseanother cheap thin cd wrapped cardboard


In [46]:
# Print the first two 'train' texts of the proposed music target.
for text in target_music['train'].iloc[:2]:
    print(text)

greatgreat song tried need love least liked
rotten think dont sell song download disappointed frustratedlove song hate wont download computer system cloud amazon play


In [47]:
# Print the first two 'train' texts of the baseline music target.
for text in noadaption_target['train'].iloc[:2]:
    print(text)

greatgreat song
rotten think dont sell song download disappointed frustratedlove song hate wont download


---------

### Test 1

NB: The code takes a very long time to run, so it is commented out. 
    The results of the experiment are saved under `results/experiments`. 

In [283]:
#for i in range(1,8): 
#    source_BEP_test = read_data('test/source_BEP_test{}.json'.format(i))
#    target_music_BEP_test = read_data('test/target_music_BEP_test{}.json'.format(i))
#    source_BEP_test = source_BEP_test[source_BEP_test['sentiment'] != '-']
#    acc_music_multi_test = training_vectorising(source_BEP_test, target_music_BEP_test)
#    print(acc_music_multi_test)

In [282]:
#lst_accuracy = [acc_BEP_noadaption, acc_music_multi_test2, acc_music_multi_test3, acc_music_multi_test7, 
#                acc_music_multi_test1, acc_music_multi_test4, acc_music_multi_test5, acc_music_multi_test6]
#lst_size = [0, 59, 108, 587, 1197, 2231, 5171, 9665]

In [290]:
#Data = {'Glossary Size': lst_size, 'Model Accuracy': lst_accuracy}
#df = pd.DataFrame(Data,columns=['Glossary Size','Model Accuracy'])
#df = df.to_dict(orient='record')
#write_file('results/experiments', 'glossary_experiment_one.json', df)

### Test 2

NB: The code takes a very long time to run, so it is commented out. 
    The results of the experiment are saved under `results/experiments`. 

In [53]:
#for i in range(1,9): 
#    source_BEP_anothertest = read_data('test/source_BEP_anothertest{}.json'.format(i))
#    target_music_BEP_anothertest = read_data('test/target_music_BEP_anothertest{}.json'.format(i))
#    source_BEP_anothertest = source_BEP_anothertest[source_BEP_anothertest['sentiment'] != '-']
#    acc_music_multi_anothertest = training_vectorising(source_BEP_anothertest, target_music_BEP_anothertest)
#    print(acc_music_multi_anothertest)

In [259]:
#lst_accuracy_2 = [acc_music_multi_anothertest1, acc_music_multi_anothertest2, acc_music_multi_anothertest3,
#               acc_music_multi_anothertest4, acc_music_multi_anothertest5, acc_music_multi_anothertest6, 
#               acc_music_multi_anothertest7, acc_music_multi_anothertest8]
#lst_neighbors_2 = [1, 5, 10, 20, 50, 100, 200, 500]

In [288]:
#Data = {'Neighbors Size': lst_neighbors_2, 'Model Accuracy': lst_accuracy_2}
#df = pd.DataFrame(Data,columns=['Neighbors Size','Model Accuracy'])
#df = df.to_dict(orient='record')
#write_file('results/experiments', 'glossary_experiment_two.json', df)