In [None]:
import numpy as np
import pandas as pd
import json
import gzip
import math
import nltk
import string 
import scipy
import ast
from nltk.corpus import cmudict
from nltk.corpus import stopwords
import sklearn.metrics as skmetrics
from collections import defaultdict
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from nltk.corpus import sentiwordnet as swn
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn import svm
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import ElasticNet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import ensemble
from nltk.corpus import cmudict 

%matplotlib inline 

# Utilities 

In [None]:
# Read from gzip file
def read_gzip(filename):
    """Yield one parsed record per line of a gzip-compressed file.

    Args:
        filename: Path of the gzip file; every line must be a Python
            expression describing one record.

    Yields:
        The object obtained by evaluating each line.
    """
    # The context manager releases the file handle as soon as the generator is
    # exhausted or closed; a bare gzip.open() inside the for-loop never did.
    with gzip.open(filename) as gz_file:
        for line in gz_file:
            # SECURITY NOTE(review): eval() executes whatever code the file
            # contains. Only feed trusted files; if every record is a plain
            # literal, ast.literal_eval() would be the safer choice.
            yield eval(line)

In [None]:
# Get (positive/negative) opinion words from corpus
def get_opinion_words(filename):
    """Lazily yield every raw line of an opinion-word file.

    The file is decoded as ISO-8859-1 and each yielded line still carries its
    trailing newline; callers are responsible for stripping it.
    """
    with open(filename, 'r', encoding="ISO-8859-1") as lexicon:
        yield from lexicon
            
# Vocabularies of positive / negative opinion words (one word per line in the
# source files).
positive_words = set()
negative_words = set()

# The files are read in text mode, so every line ends with a single '\n'
# (a '\r\n' pair is translated on read). Slicing off two characters therefore
# also removed the last letter of each word; strip whitespace instead.
for pword in get_opinion_words('positive-words.txt'):
    pword = pword.strip()
    # Skip blank lines; ';' lines are presumably lexicon header comments —
    # TODO confirm against the actual file.
    if pword and not pword.startswith(';'):
        positive_words.add(pword)

for nword in get_opinion_words('negative-words.txt'):
    nword = nword.strip()
    if nword and not nword.startswith(';'):
        negative_words.add(nword)

In [None]:
# Compute number of syllables for a given word
d = cmudict.dict()
def nsyl(word):
    """Return the syllable count of `word` according to the CMU dictionary.

    A phoneme whose last character is a digit (a stress marker) counts as one
    syllable. When the dictionary lists several pronunciations the largest
    count is used; words missing from the dictionary count as 0.
    """
    key = word.lower()
    if key not in d:
        return 0
    syllables_per_pronunciation = [
        sum(1 for phoneme in pronunciation if str(phoneme[-1]).isdigit())
        for pronunciation in d[key]
    ]
    # The extra 0 keeps the result at 0 when no pronunciation is listed.
    return max(syllables_per_pronunciation + [0])

# Data Pruning

- Removed data points with more than 150 votes, as the test data has very few data points in that range.
- Prepared two separate datasets: one for highly voted reviews (more than 10 votes) and one for reviews that received few votes (1 to 10 votes).

In [None]:
# Read data
data = pd.read_csv('Automotive.csv')
# fillna() returns a new frame instead of modifying `data`; assign it back so
# that missing fields really become '' before the string-based feature
# extractors call .lower()/.split() on them.
data = data.fillna('')
# Show a short preview rather than the whole frame.
data.head()

In [None]:
# Column position of every field inside a row of `data.values`.
index = {'reviewerID':0, 'itemID':1, 'reviewerName':2, 'helpful':3, 'reviewText':4, 'rating':5, 'summary':6, 'unixReviewTime':7, 'reviewTime':8}
low_dataset = []    # reviews with 1..10 votes
high_dataset = []   # reviews with 11..150 votes

for line in data.values:
    helpful = line[index['helpful']]
    # The field arrives as the text "[nHelpful, outOf]". Parse it only while
    # it is still a string, so re-running the cell (or feeding rows that were
    # already parsed) does not make ast.literal_eval raise.
    if isinstance(helpful, str):
        helpful = ast.literal_eval(helpful)
        line[index['helpful']] = helpful
    out_of = int(helpful[1])
    if 10 < out_of <= 150:
        high_dataset.append(line)
    elif 1 <= out_of <= 10:
        low_dataset.append(line)
    # Rows with 0 votes or more than 150 votes are dropped.


### Compute USER specific data

In [None]:
# Per-user aggregates, built only from reviews that received at least one vote.
train_user_ratings_dict = defaultdict(list)          # reviewerID -> ratings given
train_all_helpful_review = []                        # every [nHelpful, outOf] pair
train_user_helpful_review_dict = defaultdict(list)   # reviewerID -> [nHelpful, outOf] pairs
train_user_review_content_dict = defaultdict(list)   # reviewerID -> review texts

for data_point in data.values:
    helpful = data_point[index['helpful']]
    # Parse before testing the vote count: indexing the raw string compared a
    # single character with the integer 0, so zero-vote reviews were never
    # skipped.
    if isinstance(helpful, str):
        helpful = ast.literal_eval(helpful)
    if int(helpful[1]) == 0:
        continue

    data_point[index['helpful']] = helpful
    reviewer = data_point[index['reviewerID']]
    train_user_ratings_dict[reviewer].append(int(data_point[index['rating']]))
    train_all_helpful_review.append(helpful)
    train_user_helpful_review_dict[reviewer].append(helpful)
    train_user_review_content_dict[reviewer].append(data_point[index['reviewText']])

### Compute ITEM specific data

In [None]:
# ITEM specific lookups: ratings received per item and items reviewed per user.
train_items_ratings_dict = defaultdict(list)
train_user_purchased_items_dict = defaultdict(list)
reviewer_col, item_col, rating_col = index['reviewerID'], index['itemID'], index['rating']
for row in data.values:
    train_user_purchased_items_dict[row[reviewer_col]].append(row[item_col])
    train_items_ratings_dict[row[item_col]].append(int(row[rating_col]))

In [None]:
# Mean rating and number of reviews for every item seen in the training data.
train_average_items_ratings_dict = {}
train_item_review_count = {}
for item_id, item_ratings in train_items_ratings_dict.items():
    train_item_review_count[item_id] = len(item_ratings)
    train_average_items_ratings_dict[item_id] = np.mean(item_ratings)
print ("Average ratings computed for " + str(len(train_average_items_ratings_dict)) + " items")
print ("Number of reviews computed for " + str(len(train_item_review_count)) + " items")

### Compute OVERALL average helpfulness

In [None]:
# Overall helpfulness = all helpful votes divided by all votes cast.
total_helpful_votes = sum([votes[0] for votes in train_all_helpful_review])
total_votes_cast = sum([votes[1] for votes in train_all_helpful_review])
global_average_helpfulness = total_helpful_votes * 1.0 / total_votes_cast
print ("Average Helpfulness : %s" % global_average_helpfulness)

### USER SPECIFIC FEATURES

In [None]:
def get_user_review_experience_count_feature(user_review_text_dict):
    """Map every user to the number of review texts recorded for them.

    Args:
        user_review_text_dict: Mapping of user id -> collection of review texts.

    Returns:
        dict of user id -> length of that user's collection.
    """
    return {user: len(reviews) for user, reviews in user_review_text_dict.items()}

In [None]:
def get_user_average_ratings_feature(user_ratings_dict, train_global_average_ratings):
    """Compute the mean rating given by every user.

    Args:
        user_ratings_dict: Mapping of user id -> list of ratings by that user.
        train_global_average_ratings: Fallback value for users whose rating
            list is empty.

    Returns:
        dict of user id -> average rating (or the fallback).
    """
    user_average_ratings = {}
    for user, ratings in user_ratings_dict.items():
        # Read from the argument: the earlier version looked the user up in the
        # module-level training dict and ignored the mapping it was given.
        if len(ratings) > 0:
            user_average_ratings[user] = sum(ratings) * 1.0 / len(ratings)
        else:
            user_average_ratings[user] = train_global_average_ratings
    return user_average_ratings

In [None]:
def get_user_average_helpfulness_feature(train_user_helpful_review_dict, train_global_average_helpfulness):
    """Compute each user's helpful-vote ratio.

    Args:
        train_user_helpful_review_dict: Mapping of user id -> list of vote
            records. A record is either a ``[nHelpful, outOf]`` sequence (the
            form this notebook stores) or a dict with the keys ``'nHelpful'``
            and ``'outOf'``.
        train_global_average_helpfulness: Fallback for users whose records sum
            to zero votes.

    Returns:
        dict of user id -> helpful votes / total votes (or the fallback).
    """
    def _votes(record):
        # Accept both record layouts; only the dict layout was handled before,
        # which raised TypeError on the list records built in this notebook.
        if isinstance(record, dict):
            return record['nHelpful'], record['outOf']
        return record[0], record[1]

    user_average_helpfulness = {}
    for user, records in train_user_helpful_review_dict.items():
        pairs = [_votes(record) for record in records]
        total_votes = sum(out_of for _, out_of in pairs)
        if total_votes > 0:
            helpful_votes = sum(n_helpful for n_helpful, _ in pairs)
            user_average_helpfulness[user] = helpful_votes * 1.0 / total_votes
        else:
            user_average_helpfulness[user] = train_global_average_helpfulness

    return user_average_helpfulness

In [None]:
def get_user_rating_deviation_feature(user_ratings_dict, average_items_ratings_dict, user_purchased_items_dict):
    """Mean squared gap between a user's ratings and their items' average ratings.

    Args:
        user_ratings_dict: Mapping of user id -> rating(s) given by the user
            (a single number or a sequence of numbers).
        average_items_ratings_dict: Mapping of item id -> average item rating.
        user_purchased_items_dict: Mapping of user id -> item ids of the user.

    Returns:
        dict of user id -> mean of (rating - item average)**2 taken over every
        user rating and every purchased item that has a known average. The
        value is NaN when there is nothing to average.
    """
    user_rating_deviation = {}
    for user in user_ratings_dict:
        # Converting to an array lets list-valued entries broadcast against the
        # item average; subtracting a float from a plain list raised TypeError.
        ratings = np.asarray(user_ratings_dict[user], dtype=float)
        squared_gaps = [(ratings - average_items_ratings_dict[item]) ** 2
                        for item in user_purchased_items_dict[user]
                        if item in average_items_ratings_dict]
        user_rating_deviation[user] = np.mean(squared_gaps)
    return user_rating_deviation

In [None]:
# FEATURE : USER REVIEW EXPERIENCE (number of voted reviews written per user)
train_user_review_experience = get_user_review_experience_count_feature(
    train_user_review_content_dict)
print ("Extracted user review experience for " + str(len(train_user_review_experience)) + " users")

### DATASET DEPENDENT FEATURES

In [None]:
def get_average_helpfulness(dataset):
    """Return the global average helpfulness repeated once per row of `dataset`."""
    return [global_average_helpfulness for _ in dataset]

In [None]:
def get_rating(dataset):
    """Return the rating of every row, converted to int."""
    rating_col = index['rating']
    return [int(row[rating_col]) for row in dataset]

In [None]:
def get_square_rating(dataset):
    """Return the squared integer rating of every row."""
    rating_col = index['rating']
    return [int(row[rating_col]) ** 2 for row in dataset]

In [None]:
def get_log_ratings(dataset):
    """Return log(max_rating + 1 - rating) for every row as a numpy array.

    The highest rating found in `dataset` maps to log(1) = 0.
    """
    ratings = np.array([int(row[index['rating']]) for row in dataset])
    distance_from_top = ratings.max() + 1 - ratings
    return np.log(distance_from_top)

In [None]:
def get_helpfulness_votes(dataset):
    """Return log(outOf + 1) for every row.

    The 'helpful' field may already be a parsed ``[nHelpful, outOf]`` sequence
    or still be its JSON/text form; only the latter is parsed.
    """
    data_helpfulness_votes = []
    for data_point in dataset:
        votes = data_point[index['helpful']]
        # `value is not list` compared the value with the *type* object and was
        # therefore always True; every row took the str() -> JSON round-trip,
        # which only works while str() of the parsed value is valid JSON.
        if not isinstance(votes, (list, tuple)):
            votes = json.loads(str(votes))
        data_helpfulness_votes.append(np.log(votes[1] + 1))
    return data_helpfulness_votes

In [None]:
def get_review_word_count(dataset):
    """Return log(word count + 1) of every review text (whitespace tokens)."""
    text_col = index['reviewText']
    return [np.log(len(row[text_col].lower().split()) + 1) for row in dataset]

In [None]:
def get_sentence_count(dataset):
    """Return log(number of '.'-separated pieces + 1) of every review text."""
    text_col = index['reviewText']
    return [np.log(len(row[text_col].lower().split('.')) + 1) for row in dataset]

In [None]:
def get_review_allcaps_count(dataset):
    """Return log(count of upper-case tokens + 1) of every review text."""
    allcaps_counts = []
    for row in dataset:
        shouted = sum(1 for token in row[index['reviewText']].split() if token.isupper())
        allcaps_counts.append(np.log(shouted + 1))
    return allcaps_counts

In [None]:
def get_review_char_count(dataset):
    """Return log(non-whitespace character count + 1) of every review text."""
    char_counts = []
    for row in dataset:
        tokens = row[index['reviewText']].lower().split()
        char_counts.append(np.log(sum(map(len, tokens)) + 1))
    return char_counts

In [None]:
def get_review_specialchar_count(dataset):
    """Count, per review text, the tokens containing '!' or '?'."""
    special_counts = []
    for row in dataset:
        tokens = row[index['reviewText']].lower().split()
        special_counts.append(sum(1 for token in tokens if "!" in token or "?" in token))
    return special_counts

In [None]:
def get_item_rating_deviation(dataset):
    """Map every rating to a fixed extremity weight.

    Ratings 1 and 5 become 10, ratings 2 and 4 become 8, rating 3 becomes 5;
    any other integer rating is passed through unchanged. Despite the name,
    the item's average rating is not consulted.
    """
    extremity_weight = {1: 10, 5: 10, 2: 8, 4: 8, 3: 5}
    weights = []
    for row in dataset:
        stars = int(row[index['rating']])
        weights.append(extremity_weight.get(stars, stars))
    return weights

In [None]:
def get_flesch_reading_ease_score(dataset):
    """Return a Flesch-reading-ease style score for every review text.

    Words are whitespace tokens, sentences are '.'-separated pieces and
    syllables come from `nsyl`. Both denominators are incremented by one,
    which keeps the ratios defined for empty texts.
    """
    scores = []
    for row in dataset:
        text = row[index['reviewText']].lower()
        tokens = text.split()
        total_words = len(tokens)
        total_sent = len(text.split('.'))
        total_syllable = sum([nsyl(token) for token in tokens])
        words_per_sentence = total_words * 1.0 / (1 + total_sent)
        syllables_per_word = total_syllable * 1.0 / (1 + total_words)
        scores.append(206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word))
    return scores

In [None]:
def get_readability_index(dataset):
    """Return an automated-readability style index for every review text.

    Uses characters per word and words per sentence, with both denominators
    incremented by one.
    """
    indices = []
    for row in dataset:
        text = row[index['reviewText']].lower()
        tokens = text.split()
        total_char = sum([len(token) for token in tokens])
        total_words = len(tokens)
        total_sent = len(text.split('.'))
        chars_per_word = total_char * 1.0 / (1 + total_words)
        words_per_sentence = total_words * 1.0 / (1 + total_sent)
        indices.append((4.71 * chars_per_word) + (0.5 * words_per_sentence) - 21.43)
    return indices

In [None]:
def get_summary_word_count(dataset):
    """Return log(word count + 1) of every review summary."""
    # Open idea from an earlier note: normalise this by the review size.
    summary_col = index['summary']
    return [np.log(len(row[summary_col].lower().split()) + 1) for row in dataset]

In [None]:
def get_summary_char_count(dataset):
    """Return log(non-whitespace character count + 1) of every summary."""
    char_counts = []
    for row in dataset:
        tokens = row[index['summary']].lower().split()
        char_counts.append(np.log(sum(map(len, tokens)) + 1))
    return char_counts

In [None]:
def get_summary_allcaps_count(dataset):
    """Count the upper-case tokens in every summary (raw count, no log)."""
    summary_col = index['summary']
    return [len([token for token in row[summary_col].split() if token.isupper()])
            for row in dataset]

In [None]:
def get_summary_specialchar_count(dataset):
    """Count, per summary, the tokens containing '!' or '?'."""
    special_counts = []
    for row in dataset:
        tokens = row[index['summary']].lower().split()
        special_counts.append(sum(1 for token in tokens if "!" in token or "?" in token))
    return special_counts

In [None]:
def get_review_sentiment_score(dataset):
    """Sum SentiWordNet scores over every word of every review text.

    Returns:
        Three lists (positive, negative, objective) with one total per row.
        A word's contribution is the sum of the score over all of its
        SentiWordNet synsets.
    """
    data_review_pos_sentiment = []
    data_review_neg_sentiment = []
    data_review_obj_sentiment = []

    for data_point in dataset:
        word_pos, word_neg, word_obj = [], [], []
        for word in data_point[index['reviewText']].lower().split():
            # Look the word up once and reuse the synsets for all three scores;
            # the lookup used to be repeated for each score.
            synsets = list(swn.senti_synsets(word))
            word_pos.append(sum([s.pos_score() for s in synsets]))
            word_neg.append(sum([s.neg_score() for s in synsets]))
            word_obj.append(sum([s.obj_score() for s in synsets]))
        data_review_pos_sentiment.append(sum(word_pos))
        data_review_neg_sentiment.append(sum(word_neg))
        data_review_obj_sentiment.append(sum(word_obj))
    return data_review_pos_sentiment, data_review_neg_sentiment, data_review_obj_sentiment

In [None]:
def get_summary_sentiment_score(dataset):
    """Sum SentiWordNet scores over every word of every review summary.

    Returns:
        Three lists (positive, negative, objective) with one total per row.
        A word's contribution is the sum of the score over all of its
        SentiWordNet synsets.
    """
    data_summary_pos_sentiment = []
    data_summary_neg_sentiment = []
    data_summary_obj_sentiment = []
    for data_point in dataset:
        word_pos, word_neg, word_obj = [], [], []
        for word in data_point[index['summary']].lower().split():
            # One lookup per word instead of one per score.
            synsets = list(swn.senti_synsets(word))
            word_pos.append(sum([s.pos_score() for s in synsets]))
            word_neg.append(sum([s.neg_score() for s in synsets]))
            word_obj.append(sum([s.obj_score() for s in synsets]))
        data_summary_pos_sentiment.append(sum(word_pos))
        data_summary_neg_sentiment.append(sum(word_neg))
        data_summary_obj_sentiment.append(sum(word_obj))

    return data_summary_pos_sentiment, data_summary_neg_sentiment, data_summary_obj_sentiment

In [None]:
def get_user_review_experience(dataset):
    """Return log(review count + 1) of each row's reviewer, or 0.

    Reviewers with fewer than five recorded reviews, and reviewers unknown to
    `train_user_review_experience`, get 0.
    """
    experience = []
    for row in dataset:
        review_total = train_user_review_experience.get(row[index['reviewerID']], 0)
        experience.append(np.log(review_total + 1) if review_total >= 5 else 0)
    return experience

In [None]:
def get_category_id(dataset):
    """One-hot encode the 'categoryID' of every row into five columns.

    Rows are accessed by the key 'categoryID'. A row whose id matches none of
    0..4 contributes no entry to any column.

    Returns:
        Tuple of five lists, one per category id 0..4.
    """
    columns = [[] for _ in range(5)]
    for data_point in dataset:
        cat_id = data_point['categoryID']
        for hot in range(5):
            if cat_id == hot:
                for position, column in enumerate(columns):
                    column.append(1 if position == hot else 0)
    return tuple(columns)

In [None]:
def get_review_year(dataset):
    """Return the review year of every row as an offset from 2005.

    The year is the text after the last comma of the 'reviewTime' field.
    """
    time_col = index['reviewTime']
    return [int(row[time_col].split(',')[-1].strip()) - 2005 for row in dataset]

In [None]:
def get_user_average_ratings(dataset):
    """Return each row's reviewer average rating, or the global average.

    NOTE(review): relies on the module-level names `train_user_ratings` and
    `train_global_average_ratings`, neither of which is defined in the part of
    the notebook visible here — confirm they exist before calling.
    """
    averages = []
    for row in dataset:
        reviewer = row[index['reviewerID']]
        averages.append(
            train_user_ratings[reviewer] if reviewer in train_user_ratings
            else train_global_average_ratings)
    return averages

In [None]:
def get_review_stopwords(dataset):
    """Return log(English stop-word count + 1) of every review text."""
    stop_words = set(stopwords.words('english'))
    counts = []
    for row in dataset:
        tokens = row[index['reviewText']].lower().split()
        counts.append(np.log(sum(1 for token in tokens if token in stop_words) + 1))
    return counts

In [None]:
def get_review_non_stopwords(dataset):
    """Return log(count of tokens that are not English stop words + 1) per review."""
    stop_words = set(stopwords.words('english'))
    counts = []
    for row in dataset:
        tokens = row[index['reviewText']].lower().split()
        counts.append(np.log(sum(1 for token in tokens if token not in stop_words) + 1))
    return counts

In [None]:
def get_review_positive_words(dataset):
    """Return log(positive opinion-word count + 1) of every review text."""
    counts = []
    for row in dataset:
        tokens = row[index['reviewText']].lower().split()
        counts.append(np.log(sum(1 for token in tokens if token in positive_words) + 1))
    return counts

In [None]:
def get_review_negative_words(dataset):
    """Return log(negative opinion-word count + 1) of every review text."""
    counts = []
    for row in dataset:
        tokens = row[index['reviewText']].lower().split()
        counts.append(np.log(sum(1 for token in tokens if token in negative_words) + 1))
    return counts

In [None]:
def get_review_posneg_diff_words(dataset):
    """Return |negative count - positive count| + 1 for every review text (no log)."""
    gaps = []
    for row in dataset:
        tokens = row[index['reviewText']].lower().split()
        negatives = sum(1 for token in tokens if token in negative_words)
        positives = sum(1 for token in tokens if token in positive_words)
        gaps.append(abs(negatives - positives) + 1)
    return gaps

In [None]:
def get_summary_positive_words(dataset):
    """Return positive opinion-word count + 1 for every summary (no log)."""
    counts = []
    for row in dataset:
        tokens = row[index['summary']].lower().split()
        counts.append(sum(1 for token in tokens if token in positive_words) + 1)
    return counts

In [None]:
def get_summary_negative_words(dataset):
    """Return negative opinion-word count + 1 for every summary (no log)."""
    counts = []
    for row in dataset:
        tokens = row[index['summary']].lower().split()
        counts.append(sum(1 for token in tokens if token in negative_words) + 1)
    return counts

In [None]:
def get_summary_posneg_diff_words(dataset):
    """Return |negative count - positive count| + 1 for every summary (no log)."""
    gaps = []
    for row in dataset:
        tokens = row[index['summary']].lower().split()
        negatives = sum(1 for token in tokens if token in negative_words)
        positives = sum(1 for token in tokens if token in positive_words)
        gaps.append(abs(negatives - positives) + 1)
    return gaps

In [None]:
def get_item_review_count(dataset):
    """Return, per row, the number of training reviews of its item (0 if unseen)."""
    item_col = index['itemID']
    return [train_item_review_count.get(row[item_col], 0) for row in dataset]

In [None]:
def get_rating_category(dataset):
    """One-hot encode each rating as bad (<2), ok (>=2 and <4) or good (>=4).

    A rating that satisfies none of the comparisons (e.g. NaN) contributes no
    entry to any of the three lists.

    Returns:
        Tuple ``(bad, ok, good)`` of 0/1 lists.
    """
    bad, ok, good = [], [], []
    for row in dataset:
        r = row[index['rating']]
        flags = (r < 2.0, r >= 2.0 and r < 4.0, r >= 4.0)
        if any(flags):
            bad.append(1 if flags[0] else 0)
            ok.append(1 if flags[1] else 0)
            good.append(1 if flags[2] else 0)
    return bad, ok, good

In [None]:
def get_outof_group(dataset):
    """Bucket the total vote count of every row into four indicator columns.

    Buckets: <10, 10..39, 40..79 and >=80. The active column receives the
    bucket's ordinal (1, 2, 3 or 4) rather than a plain 1; all other columns
    receive 0.

    Returns:
        Tuple ``(low, mid1, mid2, high)`` of lists.
    """
    outof_low, outof_mid1, outof_mid2, outof_high = [], [], [], []
    columns = (outof_low, outof_mid1, outof_mid2, outof_high)
    for row in dataset:
        out_of = int(row[index['helpful']][1])
        if out_of < 10:
            encoded = (1, 0, 0, 0)
        elif out_of < 40:
            encoded = (0, 2, 0, 0)
        elif out_of < 80:
            encoded = (0, 0, 3, 0)
        else:
            encoded = (0, 0, 0, 4)
        for column, value in zip(columns, encoded):
            column.append(value)
    return outof_low, outof_mid1, outof_mid2, outof_high

In [None]:
def get_unixtime(dataset):
    """Return the raw 'unixReviewTime' value of every row.

    NOTE(review): the earlier version also built log(max_time - time + 1) for
    every row but discarded it and returned the raw timestamps. The returned
    values are kept as they were; only the unused second pass (which also made
    max()/min() fail on an empty dataset) was removed. Confirm whether the log
    recency was the intended feature.
    """
    return [data_point[index['unixReviewTime']] for data_point in dataset]

In [None]:
def get_ideal_vector(high_dataset, low_dataset):
    """Fit a TF-IDF vectoriser on all review texts and average their vectors.

    Args:
        high_dataset: Rows of the high-vote reviews.
        low_dataset: Rows of the low-vote reviews.

    Returns:
        Tuple ``(vectoriser, ideal_vector)`` where ``ideal_vector`` is the
        element-wise mean of the TF-IDF vectors of every review.
    """
    corpus = np.concatenate((high_dataset, low_dataset))[:, index['reviewText']]
    vec = TfidfVectorizer(min_df=0.02, ngram_range=(1,3), analyzer='word', stop_words=stopwords.words('english'))
    tfidf_rows = vec.fit(corpus).transform(corpus).toarray()

    # Accumulate row by row, then divide by the number of reviews.
    running_total = [0.]*tfidf_rows.shape[1]
    for row_vector in tfidf_rows:
        running_total = np.add(running_total, row_vector)
    return vec, np.divide(running_total, tfidf_rows.shape[0])

In [None]:
def get_cosine_similarity_review(dataset, tf_idf_vectoriser, ideal_vector):
    """Absolute cosine similarity between each review and the ideal vector.

    Args:
        dataset: Rows whose 'reviewText' field holds the review text.
        tf_idf_vectoriser: Fitted vectoriser exposing ``transform``.
        ideal_vector: Reference TF-IDF vector.

    Returns:
        List with one float per row; 0.0 when the similarity is undefined
        (either vector has zero norm).
    """
    texts = [data_point[index['reviewText']] for data_point in dataset]
    if not texts:
        return []

    # Vectorise all reviews in one call instead of one transform() per row.
    review_vectors = tf_idf_vectoriser.transform(texts).toarray()
    ideal = np.asarray(ideal_vector, dtype=float)
    ideal_norm = np.linalg.norm(ideal)

    cosine_similarity_score = []
    for review_vector in review_vectors:
        denominator = ideal_norm * np.linalg.norm(review_vector)
        # Test the denominator up front so 0/0 never triggers a runtime
        # warning; such rows score 0.0, as they did after the NaN check.
        if denominator == 0 or np.isnan(denominator):
            cosine_similarity_score.append(0.0)
            continue
        similarity_score = float(abs(np.dot(ideal, review_vector)) / denominator)
        cosine_similarity_score.append(0.0 if np.isnan(similarity_score) else similarity_score)
    # The leftover debug print of the complete score list was removed; it
    # flooded the cell output with one number per review.
    return cosine_similarity_score

In [None]:
# Calculate Ideal Vector
# Fits the TF-IDF vectoriser on the review texts of both vote groups and keeps
# the mean TF-IDF vector; get_features() reads both names as globals.
tf_idf_vectoriser, ideal_vector = get_ideal_vector(high_dataset, low_dataset)

# Helpfulness Prediction Features

### Prepare Training Feature Set for Prediction

In [None]:
def get_features(dataset):
    """Build the feature matrix used by the helpfulness predictors.

    Every extractor is run over `dataset`, a progress line is printed after
    each one, and the selected feature lists are stacked column-wise.

    Args:
        dataset: Sequence of review rows, addressed through the module-level
            `index` map.

    Returns:
        Array with one row per review and one column per entry of
        `feature_set` (35 columns, the first being a constant 1).

    Reads the module-level `tf_idf_vectoriser` and `ideal_vector`, which must
    exist before this function is called.
    """

    ratings = get_rating(dataset)
    print ("Ratings extracted.." + str(np.matrix(ratings).shape))

    square_ratings = get_square_rating(dataset)
    print ("Squared Ratings extracted.. " + str(np.matrix(square_ratings).shape))

    total_helpfulness_votes = get_helpfulness_votes(dataset)
    print ("Helpfulness extracted.."+ str(np.matrix(total_helpfulness_votes).shape))

    review_word_count = get_review_word_count(dataset)
    print ("Review word count extracted.."+ str(np.matrix(review_word_count).shape))
    
    review_sentence_count = get_sentence_count(dataset)
    print ("Review Sentence count extracted.."+ str(np.matrix(review_sentence_count).shape))
    
    review_word_allcaps_count = get_review_allcaps_count(dataset)
    print ("Review word all caps extracted.."+ str(np.matrix(review_word_allcaps_count).shape))
    
    review_char_count = get_review_char_count(dataset)
    print ("Review character count extracted.."+ str(np.matrix(review_char_count).shape))

    item_rating_deviation = get_item_rating_deviation(dataset)
    print ("Item rating deviation extracted.."+ str(np.matrix(item_rating_deviation).shape))
    
    summary_word_count = get_summary_word_count(dataset)
    print ("Summary word count extracted.."+ str(np.matrix(summary_word_count).shape))

    summary_word_allcaps_count = get_summary_allcaps_count(dataset)
    print ("Summary word all caps extracted.."+ str(np.matrix(summary_word_allcaps_count).shape))

    user_review_experience = get_user_review_experience(dataset)
    print ("User review experience extracted.." + str(np.matrix(user_review_experience).shape))

    review_readability_index = get_readability_index(dataset)
    print ("Review readability score extracted.."+ str(np.matrix(review_readability_index).shape))
    
    review_posneg_diff = get_review_posneg_diff_words(dataset)
    print ("Review positive-negative difference extracted.." + str(np.matrix(review_posneg_diff).shape))
    
    review_stopwords = get_review_stopwords(dataset)
    print ("Review stop words extracted.. " + str(np.matrix(review_stopwords).shape))
    
    summary_neg_words = get_summary_negative_words(dataset)
    print ("Summary negative words extracted.." + str(np.matrix(summary_neg_words).shape))

    summary_specialchar_count = get_summary_specialchar_count(dataset)
    print ("Summary special character count extracted.."+ str(np.matrix(summary_specialchar_count).shape))
        
    # NOTE: computed and reported, but not part of `feature_set` below.
    summary_pos_words = get_summary_positive_words(dataset)
    print ("Summary positive words extracted.." + str(np.matrix(summary_pos_words).shape))
      
    # NOTE: computed and reported, but not part of `feature_set` below.
    summary_posneg_words = get_summary_posneg_diff_words(dataset)
    print ("Summary posneg difference extracted.. " + str(np.matrix(summary_posneg_words).shape))

    rating_bad, rating_ok, rating_good = get_rating_category(dataset)
    print ("Extracted rating category.." + str(np.matrix(rating_bad).shape))

    # NOTE: computed and reported, but not part of `feature_set` below.
    review_nonstopwords = get_review_non_stopwords(dataset)
    print ("Review non-stop words extracted.." + str(np.matrix(review_nonstopwords).shape))
    
    # NOTE: computed and reported, but not part of `feature_set` below.
    review_year = get_review_year(dataset)
    print ("Review years extracted.." + str(np.matrix(review_year).shape))
    
    outOf_low, outOf_mid1, outOf_mid2, outOf_high = get_outof_group(dataset)
    print ("Extracted one-hot encoded outOf categories.." + str(np.matrix(outOf_low).shape))
    
    unixTime = get_unixtime(dataset)
    print ("Extracted unix time of review.." + str(np.matrix(unixTime).shape))
     
    review_pos_words = get_review_positive_words(dataset)
    print("Review positive words extracted.." + str(np.matrix(review_pos_words).shape))
    
    review_neg_words = get_review_negative_words(dataset)
    print("Review negative words extracted.." + str(np.matrix(review_neg_words).shape))
    
    data_review_flesch_reading_score = get_flesch_reading_ease_score(dataset)
    print("Review flesch reading score extracted.."+ str(np.matrix(data_review_flesch_reading_score).shape))

       
    data_pos_sentiment_score, data_neg_sentiment_score, data_obj_sentiment_score = get_review_sentiment_score(dataset)
    print("Review Sentiment scores extracted..")
    
    data_summ_pos_sentiment_score, data_summ_neg_sentiment_score, data_summ_obj_sentiment_score = get_summary_sentiment_score(dataset)
    print("Summary sentiment scores extracted..")
    
    cosine_similarity_review = get_cosine_similarity_review(dataset, tf_idf_vectoriser, ideal_vector)
    print("Cosine Similarity calculated..")

    # Column order matters: the fitted PCA objects and models depend on it.
    feature_set = [
        np.ones(len(dataset)),  # constant (bias) column
        ratings,
        square_ratings,
        total_helpfulness_votes,
        review_word_count,
        review_sentence_count,
        review_word_allcaps_count,
        review_char_count,
        item_rating_deviation,
        summary_word_count,
        summary_word_allcaps_count,
        user_review_experience,
        review_readability_index,
        review_posneg_diff,
        review_stopwords,
        summary_neg_words,
        summary_specialchar_count,
        outOf_low,
        outOf_mid1,
        outOf_mid2,
        outOf_high,
        rating_bad, 
        rating_ok, 
        rating_good,
        unixTime,
        review_pos_words,
        review_neg_words,
        data_review_flesch_reading_score,
        data_pos_sentiment_score,
        data_neg_sentiment_score,
        data_obj_sentiment_score,
        data_summ_pos_sentiment_score,
        data_summ_neg_sentiment_score,
        data_summ_obj_sentiment_score,
        cosine_similarity_review
    ]
    # axis=1 turns every feature list into one column of the result.
    dataset = np.stack(feature_set, axis=1)
    return dataset

In [None]:
# Feature matrix of the high-vote reviews. Shuffling is disabled, so rows keep
# the order in which they were read.
#np.random.shuffle(high_dataset)
train_high_dataset = get_features(high_dataset)
#print(train_high_dataset.shape)

In [None]:
# Feature matrix of the low-vote reviews. Shuffling is disabled, so rows keep
# the order in which they were read.
#np.random.shuffle(low_dataset)
train_low_dataset = get_features(low_dataset)
#print(train_low_dataset.shape)

In [None]:
# Number of principal components kept for both vote groups.
reduced_feature_count = 29

# High-vote group: fit the PCA on the raw feature matrix and project it.
# NOTE(review): the PCA is fitted on all rows, i.e. before the train/validation
# split made further below.
P = PCA(reduced_feature_count)
P.fit(train_high_dataset)
train_high_dataset = P.transform(train_high_dataset)
print(train_high_dataset.shape)

# Low-vote group: the column means M are subtracted by hand before fitting, so
# any new data projected with P1 must have M subtracted first as well.
P1 = PCA(reduced_feature_count)
M = np.mean(train_low_dataset.T, axis=1)
train_low_dataset = train_low_dataset-M
P1.fit(train_low_dataset)
train_low_dataset = P1.transform(train_low_dataset)
print(train_low_dataset.shape)

In [None]:
# Target of the high-vote model: helpful votes / total votes, as a column matrix.
train_high_helpfulness = np.matrix([
    row[index['helpful']][0] * 1.0/row[index['helpful']][1]
    for row in high_dataset
]).T

In [None]:
# Target of the low-vote model: (helpful + 1) / (total + 1), as a column matrix.
train_low_helpfulness = np.matrix([
    (row[index['helpful']][0] * 1.0+1)/(row[index['helpful']][1]+1)
    for row in low_dataset
]).T
print ("Extracted helpfulness score for " + str(len(train_low_helpfulness)) + " data points")

### Split Train and Validation

In [None]:
# First half of the high-vote rows trains the model, second half validates it.
high_x_cut = int(0.5*len(train_high_dataset))
high_y_cut = int(0.5*len(train_high_helpfulness))
train_high_x, valid_high_x = train_high_dataset[:high_x_cut], train_high_dataset[high_x_cut:]
train_high_y, valid_high_y = train_high_helpfulness[:high_y_cut], train_high_helpfulness[high_y_cut:]

for part in (train_high_x, valid_high_x, train_high_y, valid_high_y):
    print (part.shape)

In [None]:
# First 40% of the low-vote rows trains the model, the remainder validates it.
low_x_cut = int(0.4*len(train_low_dataset))
low_y_cut = int(0.4*len(train_low_helpfulness))
train_low_x, valid_low_x = train_low_dataset[:low_x_cut], train_low_dataset[low_x_cut:]
train_low_y, valid_low_y = train_low_helpfulness[:low_y_cut], train_low_helpfulness[low_y_cut:]
for part in (train_low_x, valid_low_x, train_low_y, valid_low_y):
    print (part.shape)

### Prediction Models

In [None]:
# ElasticNet Regressor
# Disabled experiment: the code below sits inside a string literal, so none of
# it is executed; the cell only evaluates to that string.
"""
predictor_high = ElasticNet(alpha=0.09, l1_ratio=0.005)
predictor_high.fit((train_high_x), (train_high_y))
predict_high_y = predictor_high.predict((valid_high_x))
#predict_high_y = predictor_high.predict((train_high_x))"""

In [None]:
# Random Forest for high data set

rf = RandomForestRegressor(n_estimators =300, max_features = 0.9, random_state = 42)
# The targets are stored as an (n, 1) np.matrix; scikit-learn expects a 1-D
# target (it warns about column vectors, and newer releases reject np.matrix),
# so flatten it for the fit.
rf.fit(train_high_x, np.asarray(train_high_y).ravel())
predict_high_y = rf.predict(valid_high_x)   # validation predictions
phigh = rf.predict(train_high_x)            # training predictions

In [None]:
# Gradient Boosting for low data set

# 'loss' is left at its default, which is least-squares regression: that is
# what the former explicit 'ls' selected, and newer scikit-learn releases no
# longer accept the 'ls' alias.
params = {'n_estimators': 300, 'max_depth': 5, 'min_samples_split': 3, 'learning_rate': 0.1}
predictor_low = ensemble.GradientBoostingRegressor(**params)
# Flatten the (n, 1) np.matrix target into the 1-D array scikit-learn expects.
predictor_low.fit(train_low_x, np.asarray(train_low_y).ravel())
predict_low_y = predictor_low.predict(valid_low_x)   # validation predictions
plow = predictor_low.predict(train_low_x)            # training predictions

In [None]:
# Disabled experiment: the code sits inside a string literal and is not executed.
"""# Linear Regression Model
predictor_low = linear_model.LinearRegression()
predictor_low.fit((train_low_x), (train_low_y))
predict_low_y = predictor_low.predict((valid_low_x))"""

In [None]:
# Disabled experiment: the code sits inside a string literal and is not executed.
"""# Polynomial Regression Model
predictor_low = svm.SVR(degree=3, C=.1, epsilon=.01)
predictor_low.fit((train_low_x), (train_low_y))
predict_low_y = predictor_low.predict((valid_low_x))"""

In [None]:
# RBF Regression Model
#cvalue = 0.1
#predictor_low = svm.SVR(C=float(cvalue))
#predictor_low.fit((train_low_x), (train_low_y))
#predict_low_y = predictor_low.predict((valid_low_x))

In [None]:
#%%timeit
# Disabled experiment (string literal, not executed): random forest with 100
# trees for the low-vote data, including an out-of-bag score check.
"""
rf = RandomForestRegressor(n_estimators=100, max_features=1, random_state=42, oob_score=True, min_samples_leaf=5)
rf.fit((train_low_x), (train_low_y))
predict_low_y = rf.predict((valid_low_x))
plow = rf.predict(train_low_x)
rf.oob_score_
y_oob = rf.oob_prediction_
print(y_oob)
print(cross_val_score(rf, train_low_y, y_oob))"""

### Model Evaluation

In [None]:
# Mean Absolute Error on the validation half of the high-vote data
# (valid_high_y vs. the random-forest predictions for valid_high_x).
mae_high = skmetrics.mean_absolute_error(valid_high_y, predict_high_y)
print ("Mean Absolute Error of Predictor : " + str(mae_high))

In [None]:
# Mean Absolute Error on the validation part of the low-vote data
# (valid_low_y vs. the gradient-boosting predictions for valid_low_x).
mae_low = skmetrics.mean_absolute_error(valid_low_y, predict_low_y)
print ("Mean Absolute Error of Predictor : " + str(mae_low))

In [None]:
# Mean Absolute Error on the training half of the high-vote data.
# NOTE(review): this overwrites `mae_high`, which held the validation error.
mae_high = skmetrics.mean_absolute_error(train_high_y, phigh)
print ("Mean Absolute Error of Predictor : " + str(mae_high))

In [None]:
# Mean Absolute Error on the training part of the low-vote data.
# NOTE(review): this overwrites `mae_low`, which held the validation error.
mae_low = skmetrics.mean_absolute_error(train_low_y, plow)
print ("Mean Absolute Error of Predictor : " + str(mae_low))

# Prepare Complete Dataset for Test prediction

In [None]:
# Final high-vote model: random forest trained on ALL high-vote rows.
# (An ElasticNet variant was tried earlier and is kept for reference.)
#predictor = ElasticNet(alpha=0.09, l1_ratio=0.005)
#predictor.fit((train_high_dataset), (train_high_helpfulness))

# NOTE(review): unlike the forest evaluated on the validation split, this one
# also sets oob_score=True and min_samples_leaf=5.
predictor = RandomForestRegressor(n_estimators=300, max_features=0.9, random_state=42, oob_score=True, min_samples_leaf=5)
# Flatten the (n, 1) np.matrix target into the 1-D array scikit-learn expects.
predictor.fit(train_high_dataset, np.asarray(train_high_helpfulness).ravel())

In [None]:
# Final low-vote model: gradient boosting trained on ALL low-vote rows.
# 'loss' is left at its default, which is least-squares regression: that is
# what the former explicit 'ls' selected, and newer scikit-learn releases no
# longer accept the 'ls' alias.
params = {'n_estimators': 300, 'max_depth': 5, 'min_samples_split': 3}
predictor_low = ensemble.GradientBoostingRegressor(**params)
# Flatten the (n, 1) np.matrix target into the 1-D array scikit-learn expects.
predictor_low.fit(train_low_dataset, np.asarray(train_low_helpfulness).ravel())

#predictor_low = RandomForestRegressor(n_estimators=100, max_features=1, random_state=42, oob_score=True, min_samples_leaf=5)
#predictor_low.fit((train_low_dataset), (train_low_helpfulness))

In [None]:
def test_predict(m_predictor, x_test):
    return m_predictor.predict(np.matrix(x_test))

In [None]:
# Load the test reviews.
# NOTE: this rebinds `data`, which held the training frame until now.
test_dataset = []
data = pd.read_csv('test_data_us.csv')
for line in data.values:
    helpful = line[index['helpful']]
    # Parse the "[nHelpful, outOf]" text just like the training rows.
    # get_outof_group() indexes this field directly, and on the raw string
    # position 1 is a single character rather than the vote total.
    if isinstance(helpful, str) and helpful.strip():
        line[index['helpful']] = ast.literal_eval(helpful)
    test_dataset.append(line)


In [None]:
test_feature_set = get_features(test_dataset[:])
test_feature_set = P.transform(test_feature_set)
#print(test_feature_set.shape)

=====================================================================================================================

In [None]:
predictions = open("predictions_Helpful.txt", 'w')
idx = 0 
for l in open("pairs_Helpful.txt"): 
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,i,outOf = l.strip().split('-')
    
    score = (np.abs(test_predict(predictor, test_feature_set[idx])[0]*100) + np.abs(test_predict(predictor_low, test_feature_set[idx])[0]*100))/2.0
    pred = str(score)[:4]+"%"
    
    predictions.write(u + '-' + i + '-' + str(pred) + '\n')
    idx += 1
predictions.close()