Commit

Cross validation
reo7sp committed Apr 19, 2017
1 parent e0e3eb2 commit 8e92f26
Showing 6 changed files with 78 additions and 29 deletions.
20 changes: 5 additions & 15 deletions check.py
@@ -1,18 +1,8 @@
-import numpy as np
 import pandas as pd
-from scipy.stats.stats import pearsonr

-if __name__ == '__main__':
-    predictions = pd.read_csv('predictions.csv', index_col=0)
-    true = pd.read_csv('true.csv', index_col=0)
+from vk_text_likeness.check import check

-    for column in ['direct_likes_count', 'direct_reposts_count', 'non_direct_likes_count', 'non_direct_reposts_count']:
-        index = [ind for ind in predictions.index if ind in true.index]
-        x = predictions[column].loc[index]
-        y_raw = true[column]
-        y = [y_raw.loc[ind] for ind in index]
-        x = x.values
-        y = np.array(y)
-        print(column, '\t', 'rmse', '\t', np.sqrt(np.mean((x - y) ** 2)))
-        corr, pval = pearsonr(x, y)
-        print(column, '\t', 'corr', '\t', corr)
+if __name__ == '__main__':
+    predictions_df = pd.read_csv('predictions.csv', index_col=0)
+    true_df = pd.read_csv('true.csv', index_col=0)
+    check(predictions_df, true_df)
2 changes: 2 additions & 0 deletions main.py
@@ -8,6 +8,8 @@
     access_token = os.sys.argv[2]

     group_predict = GroupPredict(group_id, access_token)
+    group_predict.prepare()
+    group_predict.fit()

     predictions = group_predict.predict()
     with open('predictions.csv', 'w') as f:
24 changes: 24 additions & 0 deletions main_cv.py
@@ -0,0 +1,24 @@
+import os
+
+from sklearn.model_selection import KFold
+
+from vk_text_likeness.check import check
+from vk_text_likeness.predict_main import GroupPredict
+
+if __name__ == '__main__':
+    group_id = int(os.sys.argv[1])
+    assert group_id > 0
+    access_token = os.sys.argv[2]
+
+    group_predict = GroupPredict(group_id, access_token)
+    group_predict.prepare()
+
+    cv = KFold(5, shuffle=True)
+    i = 0
+    for train_index, test_index in cv.split(group_predict.action_data.get_all()):
+        print('\nCV: iter #{}'.format(i))
+        group_predict.fit(train_index)
+        predictions_df = group_predict.predict(test_index)
+        true_df = group_predict.get_true(predictions_df.index)
+        check(predictions_df, true_df)
+        i += 1
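Note (not part of the commit): KFold(5, shuffle=True) above splits the rows of group_predict.action_data.get_all() into five folds and yields positional index arrays on each iteration, which is why fit() and predict() now take an optional indexes argument. A minimal standalone sketch of what cv.split produces, on toy data:

    import numpy as np
    from sklearn.model_selection import KFold

    rows = np.arange(10)              # stand-in for the rows of action_data.get_all()
    cv = KFold(5, shuffle=True)       # five folds over shuffled row positions
    for train_index, test_index in cv.split(rows):
        # each iteration: ~8 positional indices for training, ~2 held out for testing
        print(train_index, test_index)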
19 changes: 19 additions & 0 deletions vk_text_likeness/check.py
@@ -0,0 +1,19 @@
+import numpy as np
+from scipy.stats.stats import pearsonr
+
+
+def check(predictions_df, true_df):
+    index = [ind for ind in predictions_df.index if ind in true_df.index]
+    for column in ['direct_likes_count', 'direct_reposts_count', 'non_direct_likes_count', 'non_direct_reposts_count']:
+        x = predictions_df[column].loc[index]
+        y_raw = true_df[column]
+        y = [y_raw.loc[ind] for ind in index]
+        x = x.values
+        y = np.array(y)
+
+        rmse = np.sqrt(np.mean((x - y) ** 2))
+        corr, pval = pearsonr(x, y)
+
+        print(column)
+        print('\t', 'rmse:', rmse)
+        print('\t', 'corr:', corr)
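For reference, a toy call of the new check() helper (illustrative only; the frames and numbers below are made up): only the index labels present in both frames are compared, and an RMSE plus Pearson correlation is printed for each of the four count columns.

    import pandas as pd

    from vk_text_likeness.check import check

    cols = ['direct_likes_count', 'direct_reposts_count',
            'non_direct_likes_count', 'non_direct_reposts_count']
    predictions_df = pd.DataFrame(
        [[3, 1, 0, 2], [5, 2, 1, 0], [8, 4, 3, 1], [9, 9, 9, 9]],
        index=[101, 102, 103, 999], columns=cols)
    true_df = pd.DataFrame(
        [[4, 1, 0, 2], [5, 3, 2, 1], [7, 4, 2, 0]],
        index=[101, 102, 103], columns=cols)
    check(predictions_df, true_df)  # 999 has no ground-truth row, so it is skipped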
28 changes: 18 additions & 10 deletions vk_text_likeness/predict_main.py
@@ -21,13 +21,18 @@ def __init__(self, group_id, vk_access_token):
         self.group_id = group_id
         self.vk_session = vk_api.VkApi(token=vk_access_token)

+    def prepare(self):
+        print('GroupPredict.prepare for group {}'.format(self.group_id))
         self._init_raw_users_data()
         self._init_raw_wall_data()
         self._init_raw_users_data_more()
         self._init_table_users_data()
         self._init_table_wall_data()
         self._init_action_data()
-        self._init_predict_action_model()
+
+    def fit(self, indexes=None):
+        print('GroupPredict.fit for group {}'.format(self.group_id))
+        self._init_predict_action_model(indexes)
         self._init_predict_stats_model()

     def _init_raw_users_data(self):
@@ -93,18 +98,21 @@ def _init_action_data(self):

         self._save_pickle('action_data.table', self.action_data.table)

-    def _init_predict_action_model(self):
+    def _init_predict_action_model(self, indexes):
         self.predict_action_model = PredictActionModel(self.action_data)

-        self.predict_action_model.like_model = self._try_load_pickle('predict_action_model.like_model')
-        self.predict_action_model.repost_model = self._try_load_pickle('predict_action_model.repost_model')
+        if indexes is None:
+            self.predict_action_model.like_model = self._try_load_pickle('predict_action_model.like_model')
+            self.predict_action_model.repost_model = self._try_load_pickle('predict_action_model.repost_model')
+            self.predict_action_model.is_fitted = True

-        if self.predict_action_model.like_model is None or self.predict_action_model.repost_model:
+        if not self.predict_action_model.is_fitted:
             self.predict_action_model = PredictActionModel(self.action_data)
-            self.predict_action_model.fit()
+            self.predict_action_model.fit(indexes)

-        self._save_pickle('predict_action_model.like_model', self.predict_action_model.like_model)
-        self._save_pickle('predict_action_model.repost_model', self.predict_action_model.repost_model)
+        if indexes is None:
+            self._save_pickle('predict_action_model.like_model', self.predict_action_model.like_model)
+            self._save_pickle('predict_action_model.repost_model', self.predict_action_model.repost_model)

     def _init_predict_stats_model(self):
         self.predict_stats_model = PredictStatsModel(self.predict_action_model, self.raw_users_data, self.action_data)
@@ -126,9 +134,9 @@ def _save_pickle(self, name, obj):
         except IOError as e:
             print('Can\'t save pickle {}:'.format(name), e)

-    def predict(self):
+    def predict(self, indexes=None):
         print('GroupPredict.predict for group {}'.format(self.group_id))
-        return self.predict_stats_model.predict()
+        return self.predict_stats_model.predict(indexes)

     def get_true(self, subset=None):
         print('GroupPredict.get_true for group {}'.format(self.group_id))
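A note on the hunk above: the pickle cache for the action models is only read or written when indexes is None, i.e. on a full fit; a fold-specific fit always retrains and never touches the cache, so cross-validation cannot overwrite models trained on the full data. A minimal sketch of that gating pattern in isolation (illustrative only; load_cached and save_cached stand in for the class's _try_load_pickle and _save_pickle helpers):

    def fit_action_model(model, indexes, load_cached, save_cached):
        if indexes is None:                  # full fit: try the cached estimator first
            cached = load_cached('predict_action_model.like_model')
            if cached is not None:
                model.like_model = cached
                model.is_fitted = True
        if not model.is_fitted:              # cache miss or CV fold: train from scratch
            model.fit(indexes)
        if indexes is None:                  # only a full fit refreshes the cache
            save_cached('predict_action_model.like_model', model.like_model)
        return model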
14 changes: 10 additions & 4 deletions vk_text_likeness/predict_model.py
@@ -12,17 +12,23 @@ def __init__(self, action_data):
         self.action_data = action_data
         self.like_model = RandomForestClassifier(n_jobs=-1)
         self.repost_model = RandomForestClassifier(n_jobs=-1)
+        self.is_fitted = False

-    def fit(self):
+    def fit(self, indexes=None):
         df = self.action_data.get_all()
+        if indexes is not None:
+            df = df.iloc[indexes]
         log_method_begin()
         x_df = df.drop(['user_id', 'post_id', 'is_member', 'is_liked', 'is_reposted'], axis=1)
         self.like_model.fit(x_df, df['is_liked'])
         self.repost_model.fit(x_df, df['is_reposted'])
+        self.is_fitted = True
         log_method_end()

-    def predict(self):
+    def predict(self, indexes=None):
         df = self.action_data.get_all()
+        if indexes is not None:
+            df = df.iloc[indexes]
         log_method_begin()
         x_df = df.drop(['user_id', 'post_id', 'is_member', 'is_liked', 'is_reposted'], axis=1)
         pred = [df['user_id'], df['post_id'], df['is_member'], self.like_model.predict(x_df), self.repost_model.predict(x_df)]
@@ -37,15 +43,15 @@ def __init__(self, predict_action_model, raw_users_data, action_data):
         self.raw_users_data = raw_users_data
         self.action_data = action_data

-    def predict(self):
+    def predict(self, indexes=None):
         log_method_begin()

         direct_likes_count = Counter()
         direct_reposts_count = Counter()
         non_direct_likes_count = Counter()
         non_direct_reposts_count = Counter()

-        pred_df = self.predict_action_model.predict()
+        pred_df = self.predict_action_model.predict(indexes)
         for i, row in pred_df.iterrows():
             if row['is_liked']:
                 if row['is_member']:
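One last detail worth spelling out: PredictActionModel.fit and .predict slice the action table with df.iloc[indexes], i.e. by position, which matches the positional arrays KFold emits, while main_cv.py hands GroupPredict.get_true the label-based predictions_df.index. A tiny illustration of that difference on a toy frame (not the project's data):

    import pandas as pd

    df = pd.DataFrame({'is_liked': [0, 1, 1]}, index=[101, 102, 103])
    print(df.iloc[[0, 2]])     # positional: rows 0 and 2, i.e. labels 101 and 103
    print(df.loc[[101, 103]])  # label-based: the same two rows selected by index label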
