In [7]:
import pandas as pd
import datetime
import numpy as np

from sklearn.preprocessing import OneHotEncoder, label_binarize, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline

target_categories = ['Digital_Video_Games']
split_date = '2015-07-31'

In [8]:
abs_path = '/home/ssc/Entwicklung/projects/argos/arguseyes/example_pipelines/'

reviews = pd.read_csv(f'{abs_path}datasets/amazon-reviews/reviews.csv.gz', compression='gzip', index_col=0)
products = pd.read_csv(f'{abs_path}datasets/amazon-reviews/products.csv', index_col=0)
categories = pd.read_csv(f'{abs_path}datasets/amazon-reviews/categories.csv', index_col=0)
ratings = pd.read_csv(f'{abs_path}datasets/amazon-reviews/ratings.csv', index_col=0)

In [9]:
reviews = reviews[reviews.verified_purchase == 'Y']
reviews = reviews[reviews.marketplace == 'US']
reviews = reviews[reviews.review_date >= '2015-01-01']

In [10]:
reviews_with_ratings = reviews.merge(ratings, on='review_id')

In [11]:
categories_of_interest = categories[categories.category.isin(target_categories)]
products_of_interest = products.merge(left_on='category_id', right_on='id', right=categories_of_interest)

In [12]:
reviews_with_products_and_ratings = reviews_with_ratings.merge(products_of_interest, on='product_id')

reviews_with_products_and_ratings['product_title'] = \
    reviews_with_products_and_ratings['product_title'].fillna(value='')

reviews_with_products_and_ratings['review_headline'] = \
    reviews_with_products_and_ratings['review_headline'].fillna(value='')

reviews_with_products_and_ratings['review_body'] = \
    reviews_with_products_and_ratings['review_body'].fillna(value='')



reviews_with_products_and_ratings['title_and_review_text'] = \
    reviews_with_products_and_ratings.product_title + ' ' + \
    reviews_with_products_and_ratings.review_headline + ' ' + \
    reviews_with_products_and_ratings.review_body

In [13]:
train_data = reviews_with_products_and_ratings[reviews_with_products_and_ratings.review_date <= split_date]
test_data = reviews_with_products_and_ratings[reviews_with_products_and_ratings.review_date > split_date]

train_data['is_helpful'] = train_data['helpful_votes'] > 0
test_data['is_helpful'] = test_data['helpful_votes'] > 0

train_labels = label_binarize(train_data['is_helpful'], classes=[True, False])
test_labels = label_binarize(test_data['is_helpful'], classes=[True, False])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['is_helpful'] = train_data['helpful_votes'] > 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['is_helpful'] = test_data['helpful_votes'] > 0


In [17]:
numerical_attributes = ['star_rating']
categorical_attributes = ['vine', 'verified_purchase', 'category_id']

feature_transformation = ColumnTransformer(transformers=[
    ('numerical_features', StandardScaler(), numerical_attributes),
    ('categorical_features', OneHotEncoder(handle_unknown='ignore'), categorical_attributes),
    ('textual_features', HashingVectorizer(ngram_range=(1, 3), n_features=100), 'title_and_review_text')
])

#param_grid = {
#    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
#}

pipeline = Pipeline([
    ('features', feature_transformation),
    ('learner', SGDClassifier(loss='log', penalty='l1', max_iter=1000))])

In [18]:
#search = GridSearchCV(pipeline, param_grid, scoring='roc_auc', cv=5, n_jobs=-1, verbose=2)
#model = search.fit(train_data, train_labels)
model = pipeline.fit(train_data, train_labels)

  return f(**kwargs)


In [19]:
model.score(test_data, test_labels)

0.8643830828381684