In [1]:
from jenga.tasks.reviews import VideogameReviewsTask
from jenga.corruptions.text import MissingValues, BrokenCharacters

## Predicting the helpfulness of reviews on video games

We have a dataset of weekly reviews for video games and need to predict whether a given review will be deemed helpful by users.

In [7]:
# Create the task object; the cells below use it to fetch data, fit the baseline
# model and score predictions.
task = VideogameReviewsTask()

Let's jump to the first week of data.

In [8]:
# Step the task forward by one week. The call returns a boolean (True here); the
# evaluation loop at the end of the notebook uses that return value as its
# `while` condition.
task.advance_current_week()

True

In [13]:
# Training examples and labels for the current week.
# NOTE(review): presumably "new" means only the examples added this week, in
# contrast to the `current_accumulated_*` accessors used in the final loop — confirm.
train_data = task.current_new_train_data()
train_labels = task.current_new_train_labels()

# Preview just the star rating and the combined title/review text columns.
train_data[['star_rating', 'title_and_review_text']]

Unnamed: 0,star_rating,title_and_review_text
21596,5,Shin Megami Tensei: Persona 3 FES A classic A ...
21597,4,Classic USB NES Controller for PC / Mac - (Not...
21598,3,Falcon 4.0: Allied Force - PC Compatibility No...
21599,5,Star Wars: The Force Unleashed Five Stars good
21600,5,The Evil Within Five Stars Was a gift.<br />Ve...
...,...,...
29389,1,Sims 4 I'm really sad that ea ruined my favori...
29390,5,The Legend of Zelda: Majora's Mask Five Stars ...
29391,5,The Elder Scrolls V: Skyrim Legendary Edition ...
29392,4,Star Trek Voyager: Elite Force Four Stars good...


### Fit a baseline model (logistic regression) on the current train data

In [14]:
# Fit the task-provided baseline on the training data and labels fetched above.
model = task.fit_baseline_model(train_data, train_labels)

### Measure its prediction quality on unseen data

In [15]:
# Test set the task provides for the current week; kept in `test_data` because
# the corruption cells below transform it.
test_data = task.current_test_data()
# Preview the same two columns shown for the training data.
test_data[['star_rating', 'title_and_review_text']]

Unnamed: 0,star_rating,title_and_review_text
13864,5,Dark Souls Five Stars Grandson likes it
13865,4,GoldenEye 007 Four Stars It doesn't always sta...
13866,1,Wolfenstein: The New Order I thought it sucked...
13867,5,Far Cry 4 but the storyline is great and the c...
13868,1,Mortal Kombat X Fight Pad for Xbox One and Xbo...
...,...,...
21591,5,Need for Speed Rivals Buy it! This game is awe...
21592,5,BLADESTORM: Nightmare Bladestorm: Nightmare I ...
21593,5,Xbox 360 Black Play and Charge Kit Awesome bat...
21594,5,Turtle Beach Ear Force : Premium Xbox One Surr...


In [16]:
# Score the baseline on the clean test data. The task does the scoring itself;
# the evaluation loop at the end of the notebook labels this metric as AUC.
predicted_helpfulness = model.predict_proba(test_data)
task.score_on_current_test_data(predicted_helpfulness)

0.7903856534502519

### Investigate the effect of data corruptions on the prediction quality of our baseline model

In [19]:
# Corruption that garbles characters in the review text column.
# NOTE(review): `fraction=0.8` presumably is the share of rows affected (some rows
# in the preview below are left untouched) — confirm against the jenga source.
broken_characters_corruption = BrokenCharacters(column='title_and_review_text', fraction=0.8)

# Apply the corruption to the test data and keep the result under a separate name
# so that `test_data` can still be passed to other corruptions.
corrupted_test_data = broken_characters_corruption.transform(test_data)
corrupted_test_data[['star_rating', 'title_and_review_text']]

Unnamed: 0,star_rating,title_and_review_text
13864,5,Dárk Sớúls Fivé Stárs Grándsớn likés it
13865,4,GoldenEye 007 Four Stars It doesn't always sta...
13866,1,Wớlfénstéin: Thé Néw Ớrdér I thớúght it súckéd...
13867,5,Far Cry 4 but the storyline is great and the c...
13868,1,Mớrtál Kớmbát X Fight Pád fớr Xbớx Ớné ánd Xbớ...
...,...,...
21591,5,Nééd fớr Spééd Riváls Búy it! This gámé is áwé...
21592,5,BLÁDÉSTỚRM: Nightmáré Bládéstớrm: Nightmáré I ...
21593,5,Xbớx 360 Bláck Pláy ánd Chárgé Kit Áwésớmé bát...
21594,5,Túrtlé Béách Éár Fớrcé : Prémiúm Xbớx Ớné Súrr...


In [20]:
# Re-score the same, already fitted model on the corrupted test data; comparing
# this value with the clean-data score shows the impact of the corruption.
predicted_helpfulness = model.predict_proba(corrupted_test_data)
task.score_on_current_test_data(predicted_helpfulness)

0.7193418965980516

### Run a full evaluation on all the weeks covered in the task

In [6]:
# Replay the whole task from the start: for every week, retrain the baseline on the
# training data accumulated so far and report AUC on the clean test data and on two
# corrupted variants of it.

# `missing_values_corruption` was used below but never defined anywhere in the
# notebook, so this cell raised a NameError on a fresh kernel.
# NOTE(review): `fraction` and `na_value` are assumed (they mirror the
# BrokenCharacters setup above) — confirm against the run that produced the
# recorded numbers.
missing_values_corruption = MissingValues(column='title_and_review_text', fraction=0.8, na_value='')

# Fresh task object so the loop starts at the first week regardless of how far the
# cells above have advanced the previous one.
task = VideogameReviewsTask()
while task.advance_current_week():
    print("----- Week", task.current_week(), "-----")

    train_data = task.current_accumulated_train_data()
    train_labels = task.current_accumulated_train_labels()

    model = task.fit_baseline_model(train_data, train_labels)

    # Fetch the week's test data once and reuse it for the clean and corrupted runs.
    test_data = task.current_test_data()
    predictions = model.predict_proba(test_data)
    print("\tAUC on test data", task.score_on_current_test_data(predictions))

    corrupted_test_data = missing_values_corruption.transform(test_data)
    predictions = model.predict_proba(corrupted_test_data)
    print("\tAUC on corrupted test data (missing values)", task.score_on_current_test_data(predictions))

    # `broken_characters_corruption` is the instance created in the earlier
    # corruption cell of this notebook.
    corrupted_test_data = broken_characters_corruption.transform(test_data)
    predictions = model.predict_proba(corrupted_test_data)
    print("\tAUC on corrupted test data (characters)", task.score_on_current_test_data(predictions))

----- Week 0 -----
	AUC on test data 0.7925661439396734
	AUC on corrupted test data (missing values) 0.6828580159428461
	AUC on corrupted test data (characters) 0.7259952065131646
----- Week 1 -----
	AUC on test data 0.7943801183397029
	AUC on corrupted test data (missing values) 0.6488744448215313
	AUC on corrupted test data (characters) 0.7110671231057533
----- Week 2 -----
	AUC on test data 0.8103957418631155
	AUC on corrupted test data (missing values) 0.6716894824455545
	AUC on corrupted test data (characters) 0.7271906814004152
