### Dependencies 

In [32]:
import gzip
import json
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from zipfile import ZipFile
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

### Functions

In [83]:
def checking_NaN(file):
    """
    Print how many values are missing in each column of interest.

    `file` is handed straight to `pd.read_json(..., lines=True)`, i.e. it is
    expected to hold one JSON record per line (for example a .json.gz file).
    Every column other than 'reviewText', 'summary' and 'sentiment' is
    discarded, then one count per remaining column is printed, in that order.
    Nothing is returned.
    """
    final_columns = ['reviewText', 'summary', 'sentiment']
    frame = pd.read_json(file, lines=True)

    # Keep only the columns that are inspected below.
    unwanted = [name for name in frame if name not in final_columns]
    frame = frame.drop(columns=unwanted)

    for column in final_columns:
        missing_count = frame[column].isnull().sum()
        print(missing_count)

def print_json(file):
    """
    Print every field of the first record of a gzipped JSON-lines file.

    Each field is printed on its own line as `"key": value`. Only the first
    line of the file is read; an empty file prints nothing.

    Parameters
    ----------
    file : path or file object accepted by `gzip.open`.

    Raises
    ------
    json.JSONDecodeError
        If the first line is not valid JSON.
    """
    # The context manager closes the gzip handle even when decoding fails;
    # previously the handle was left open after the early `break`.
    with gzip.open(file) as handle:
        first_line = handle.readline()

    if not first_line:
        return

    review_data = json.loads(first_line)
    for key, value in review_data.items():
        print('"' + key + '": ' + str(value))


def processing(df):
    """
    Prepare a labelled review frame for vectorising.

    Steps:
      * drop rows whose 'reviewText' or 'sentiment' is missing;
      * recode 'sentiment' from 'positive'/'negative' to 1/0 (any other
        value is left untouched);
      * add a 'train' column: lower-cased review text, a space, then the
        lower-cased summary.

    A missing summary is treated as an empty string. Without that, the string
    concatenation would yield NaN for the whole row and the review text would
    be lost (it later becomes the literal 'nan' under `.astype('U')`).

    Parameters
    ----------
    df : pandas.DataFrame with 'reviewText', 'summary' and 'sentiment' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    keep = df['reviewText'].notnull() & df['sentiment'].notnull()
    # Explicit copy: the assignments below must not write into a view of `df`
    # (this is what triggers pandas' chained-assignment warning).
    out = df[keep].copy()

    # Work on an object column so integer codes can sit next to any
    # not-yet-recoded string labels.
    out['sentiment'] = out['sentiment'].astype(object)
    out.loc[out['sentiment'] == 'positive', 'sentiment'] = 1
    out.loc[out['sentiment'] == 'negative', 'sentiment'] = 0

    review_text = out['reviewText'].str.lower()
    summary_text = out['summary'].fillna('').str.lower()
    out['train'] = review_text + ' ' + summary_text
    return out


In [101]:
def training_vectorising(source, target):
    """
    Train a bag-of-words logistic-regression classifier and evaluate it.

    The vocabulary (unigrams and bigrams, English stop words removed) is
    learned from `source['train']` only and then reused to encode
    `target['train']`. The model is fitted on `source` and scored on `target`.

    Parameters
    ----------
    source : DataFrame with a 'train' text column and a 'sentiment' column
        that can be cast to int; used for fitting.
    target : DataFrame with the same two columns; used for evaluation.

    Returns
    -------
    (test_acc, test_yhat)
        Accuracy on `target` and the array of predicted labels for `target`.

    Precision, recall and accuracy on `target` are also printed.
    """
    # Convert the text to bag-of-words counts.
    cv = CountVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
    X_train_cv = cv.fit_transform(source['train'].values.astype('U'))
    X_test_cv = cv.transform(target['train'].values.astype('U'))
    Y_train = source['sentiment'].astype('int')
    Y_test = target['sentiment'].astype('int')

    # Train the model and predict the evaluation split.
    model = LogisticRegression(solver='lbfgs', max_iter=1000)
    model.fit(X_train_cv, Y_train)
    test_yhat = model.predict(X_test_cv)
    test_acc = accuracy_score(Y_test, test_yhat)

    # Precision/recall of the positive class (label 1). The previous call used
    # labels=[1, 2] with micro averaging; label 2 does not exist in this 0/1
    # task, so it contributed nothing and the values are the same as the
    # explicit binary scores computed here.
    precision = precision_score(Y_test, test_yhat, pos_label=1, average='binary')
    recall = recall_score(Y_test, test_yhat, pos_label=1, average='binary')

    print("Precision: ", precision)
    print("Recall: ", recall)
    print("Accuracy test: ", test_acc)
    return test_acc, test_yhat

In [84]:
# Load every split into its own DataFrame; each file holds one JSON record per line.
df_train, df_dev, df_test, df_hardcases = [
    pd.read_json(path, lines=True)
    for path in (
        'data/classification/music_reviews_train.json.gz',
        'data/classification/music_reviews_dev.json.gz',
        'data/classification/music_reviews_test_masked.json.gz',
        'data/classification/music_reviews_hardcases.json',
    )
]

In [92]:
# Clean the labelled splits and add the combined 'train' text column to each.
df_train, df_dev, df_hardcases = [
    processing(frame) for frame in (df_train, df_dev, df_hardcases)
]

In [102]:
training_vectorising(source=df_train, target=df_dev)

Precision:  0.906874365052489
Recall:  0.9427917620137299
Accuracy test:  0.9124649859943977


(0.9124649859943977, array([0, 0, 0, ..., 1, 0, 0]))

In [105]:
# Bag-of-words features: the vocabulary is learned on the training split only
# and then reused to encode the hard cases.
cv = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    stop_words='english',
)
X_train_cv = cv.fit_transform(df_train['train'].values.astype('U'))
X_test_cv = cv.transform(df_hardcases['train'].values.astype('U'))
Y_train = df_train['sentiment'].astype('int')

# Fit the classifier (fit() returns the estimator itself) and label the hard cases.
model = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train_cv, Y_train)
test_yhat = model.predict(X_test_cv)

In [107]:
# Replace the hard-case labels with the model's predictions.
df_hardcases = df_hardcases.assign(sentiment=test_yhat)

In [108]:
# df_test is never passed through processing() in the cells above, so it has no
# 'train' column and `df_test['train']` raises KeyError on a fresh kernel.
# Build the same "review text + summary" string here instead. Missing pieces
# become empty strings so that no test row is dropped or turned into 'nan'.
# NOTE(review): processing() is not reused because it drops rows with a null
# 'sentiment'; the file name suggests the test labels are masked - confirm.
test_text = (
    df_test['reviewText'].fillna('').str.lower()
    + ' '
    + df_test['summary'].fillna('').str.lower()
)

# Vocabulary is learned on the training split and reused for the test split.
cv = CountVectorizer(analyzer = 'word',ngram_range=(1,2), stop_words='english')
X_train_cv = cv.fit_transform(df_train['train'].values.astype('U'))
X_test_cv = cv.transform(test_text.values.astype('U'))
Y_train = df_train['sentiment'].astype('int')

# train model and generate predictions
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train_cv, Y_train)
test_yhat_test = model.predict(X_test_cv)

In [109]:
# Attach the predicted labels to the test frame.
df_test = df_test.assign(sentiment=test_yhat_test)

In [111]:
# Write the test predictions as JSON lines (one record per line).
# orient must be spelled 'records': the abbreviation 'record' is rejected by
# pandas >= 2.0.
output = df_test.to_dict(orient='records')
# `with` closes the file even if serialising one of the records fails.
with open('music_reviews_test.json', 'w') as outFile:
    for instance in output:
        outFile.write(json.dumps(instance) + '\n')

In [110]:
# Write the hard-case predictions as JSON lines (one record per line).
# orient must be spelled 'records': the abbreviation 'record' is rejected by
# pandas >= 2.0.
output = df_hardcases.to_dict(orient='records')
# `with` closes the file even if serialising one of the records fails.
with open('phase2_testData.json', 'w') as outFile:
    for instance in output:
        outFile.write(json.dumps(instance) + '\n')

In [112]:
# Save the hard-case predictions in a zip archive.
# The context manager finalises and closes the archive even if write() raises
# (for example when the JSON file is missing).
with ZipFile('phase2_testData-masked.json.zip', 'w') as zipObj:
    zipObj.write('phase2_testData.json')

In [113]:
# Save 'music_reviews_prediction_hardcases.json' in a zip archive.
# NOTE(review): none of the visible cells writes this JSON file (they write
# 'music_reviews_test.json' and 'phase2_testData.json'), so on a fresh run this
# raises FileNotFoundError unless the file already exists - confirm the
# intended file name.
# The context manager finalises and closes the archive even if write() raises.
with ZipFile('music_reviews_prediction_hardcases.json.zip', 'w') as zipObj:
    zipObj.write('music_reviews_prediction_hardcases.json')