In [3]:
from collections import defaultdict
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import myfm
import pandas as pd
import json
import torch
from sklearn.model_selection import train_test_split

In [4]:
data = pd.read_pickle('usersAndSubs.pkl')
s2c = pd.read_pickle('subs2comments.pkl')
tdf = pd.read_pickle('tdf.pkl')

In [5]:
with open('s2Id.json', 'r') as fp:
    s2I = json.load(fp)

In [6]:
data.head()

Unnamed: 0,user_id,reddit_id,rating
0,1,1,1
1,1,66,1
2,1,134,1
3,1,2,1
4,1,10,1


In [7]:
s2c.head()

Unnamed: 0,reddit_id,comment
0,1,[I think it should be fixed on either UTC stan...
1,2,[Art is about the hardest thing to categorize ...
2,3,[Ask me what I think about the Wall Street Jou...
3,4,"[In Mechwarrior Online, I have begun to use a ..."
4,5,"[You are talking about the Charsi imbue, right..."


In [8]:
tdf.head()

Unnamed: 0,index,subreddit,rating,reddit_id
0,0,math,101,1
1,1,Frontend,101,3898
2,2,ethereum,101,3276
3,3,MachineLearning,104,2944
4,4,artificial,101,2279


In [9]:
s2I

{'math': 1,
 'funny': 2,
 'Borderlands': 3,
 'gamingpc': 4,
 'Diablo': 5,
 'RedditLaqueristas': 6,
 'apple': 7,
 'RedditFilmsProduction': 8,
 'AbandonedPorn': 9,
 'atheism': 10,
 'quotes': 11,
 'AskReddit': 12,
 'personalfinance': 13,
 'Animals': 14,
 'leagueoflegends': 15,
 'videos': 16,
 'trees': 17,
 'Games': 18,
 'rawdenim': 19,
 'BMW': 20,
 'Guitar': 21,
 'tf2': 22,
 'truegaming': 23,
 'swtor': 24,
 'anime': 25,
 'relationship_advice': 26,
 'TwoXChromosomes': 27,
 'AdviceAnimals': 28,
 'Minecraft': 29,
 'motorcycles': 30,
 'firefall': 31,
 'audiophile': 32,
 'SRSDiscussion': 33,
 'explainlikeimfive': 34,
 'WTF': 35,
 'linux': 36,
 'LambdaConspiracies': 37,
 'YouShouldKnow': 38,
 'fffffffuuuuuuuuuuuu': 39,
 'cars': 40,
 'TrueAskReddit': 41,
 'boardgames': 42,
 'childfree': 43,
 'business': 44,
 'starcraft': 45,
 'antisrs': 46,
 'todayilearned': 47,
 'festivals': 48,
 'askseddit': 49,
 'Guildwars2': 50,
 'melbourne': 51,
 'Military': 52,
 'technology': 53,
 'science': 54,
 'running'

In [10]:
idx = max(data['user_id']) + 1

In [None]:
tdf['user_id'] = idx

In [12]:
tdf

Unnamed: 0,index,subreddit,rating,reddit_id
0,0,math,101,1
1,1,Frontend,101,3898
2,2,ethereum,101,3276
3,3,MachineLearning,104,2944
4,4,artificial,101,2279
5,8,2007scape,104,967
6,10,umass,103,1533
7,12,conspiracy,101,255
8,13,ADHD,104,313
9,14,reactjs,102,12883


In [49]:
df_train = data.append(tdf, ignore_index=True)

In [54]:
df_train

Unnamed: 0,user_id,reddit_id,rating,index,subreddit
0,1,1,1,,
1,1,66,1,,
2,1,134,1,,
3,1,2,1,,
4,1,10,1,,
...,...,...,...,...,...
2536268,1462080,616,111,20.0,cscareerquestions
2536269,1462080,11832,101,22.0,ApplyingToCollege
2536270,1462080,4636,101,24.0,iOSProgramming
2536271,1462080,24319,101,26.0,ProgrammingBuddies


In [50]:
df_test = pd.DataFrame(np.array( [i for i in s2I.values()]), columns=["reddit_id"])
df_test['user_id'] = idx

In [51]:
df_test

Unnamed: 0,reddit_id,user_id
0,1,1462080
1,2,1462080
2,3,1462080
3,4,1462080
4,5,1462080
...,...,...
29646,29647,1462080
29647,29648,1462080
29648,29649,1462080
29649,29650,1462080


In [55]:
explanation_columns = ['user_id', 'reddit_id']

In [56]:
ohe = OneHotEncoder(handle_unknown='ignore')
X_train = ohe.fit_transform(df_train[explanation_columns])
X_test = ohe.transform(df_test[explanation_columns])
y_train = df_train.rating.values

In [57]:
fm = myfm.MyFMRegressor(rank=8, random_seed=334)

In [58]:
X_train *= 1
X_test *= 1

In [None]:
fm.fit(X_train, y_train, grouping=None, n_iter=200, n_kept_samples=195)

alpha = 0.00 w0 = 1.53 :  72%|▋

In [None]:
prediction = fm.predict(X_test)
print(prediction)

In [51]:
prediction_to_subreddit = {k+1:v for k,v in enumerate(prediction)}

In [54]:
prediction_keys = sorted(prediction_to_subreddit, key=prediction_to_subreddit.get)

In [63]:
best = prediction_keys[-30:]
worst = prediction_keys[:30]

In [60]:
I2s = {v:k for k, v in s2I.items()}

In [65]:
print([I2s[b] for b in best])

['San_Angelo', 'sweetfx', 'pwettypwinkpwincesses', 'ConspiracyMemes', 'theDS', 'theholdsteady', 'MyastheniaGravis', 'CSGOCastIt', 'MCNSA', 'wargame', 'gambling', 'euguild', 'fairphone', 'Scootering', 'AskEthics', 'dbzu', 'MyLittleMotorhead', 'trpfanfiction', 'BronyH8', 'RUGC_India', 'YouTubeGamers', 'PracticeModerating', 'FixedGearBikes', 'MeatDepartment', 'lasercutting', 'digg', 'GameTrade', 'leagueoflegends', 'relationships', 'AskReddit']


In [66]:
print([I2s[w] for w in worst])

['otherkin', 'payitforward', 'drunkenpeasants', 'ausents', 'dcpu16', 'Nbome', 'Shinecraft', 'shardsonline', 'CODAliens', 'sportsmedicine', 'thegleeproject', 'genderfuck', 'MayhemSanctum', '1984', 'clevelandcavs', 'chicagojobs', 'scorpion', 'crossdressing', 'truegaming', 'UnionHouse', 'aesthetics', 'needadvice', 'Anger', 'religiondebate', 'wowgaymers', 'visas', 'HecarimMains', 'fuckmyboss', 'FutureFight_Unions', 'Dreadfort']


In [62]:
prediction = fm.predict(tdf)

KeyboardInterrupt: 

In [74]:
tdf

Unnamed: 0,subreddit,ranking,reddit_id
0,2007scape,4,967
1,umass,3,1533
2,conspiracy,1,255
3,ADHD,4,313
4,reactjs,2,12883
5,MachineLearning,3,2944
6,wallstreetbets,3,2238
7,golang,1,7766
8,politics,1,74
9,datascience,1,2085
