Commit

Cross validation
reo7sp committed Apr 19, 2017
1 parent e0e3eb2 commit 8e92f26
Showing 6 changed files with 78 additions and 29 deletions.
20 changes: 5 additions & 15 deletions check.py
@@ -1,18 +1,8 @@
-import numpy as np
 import pandas as pd
-from scipy.stats.stats import pearsonr

-if __name__ == '__main__':
-    predictions = pd.read_csv('predictions.csv', index_col=0)
-    true = pd.read_csv('true.csv', index_col=0)
+from vk_text_likeness.check import check

-    for column in ['direct_likes_count', 'direct_reposts_count', 'non_direct_likes_count', 'non_direct_reposts_count']:
-        index = [ind for ind in predictions.index if ind in true.index]
-        x = predictions[column].loc[index]
-        y_raw = true[column]
-        y = [y_raw.loc[ind] for ind in index]
-        x = x.values
-        y = np.array(y)
-        print(column, '\t', 'rmse', '\t', np.sqrt(np.mean((x - y) ** 2)))
-        corr, pval = pearsonr(x, y)
-        print(column, '\t', 'corr', '\t', corr)
+if __name__ == '__main__':
+    predictions_df = pd.read_csv('predictions.csv', index_col=0)
+    true_df = pd.read_csv('true.csv', index_col=0)
+    check(predictions_df, true_df)
2 changes: 2 additions & 0 deletions main.py
@@ -8,6 +8,8 @@
     access_token = os.sys.argv[2]

     group_predict = GroupPredict(group_id, access_token)
+    group_predict.prepare()
+    group_predict.fit()

     predictions = group_predict.predict()
     with open('predictions.csv', 'w') as f:
24 changes: 24 additions & 0 deletions main_cv.py
@@ -0,0 +1,24 @@
+import os
+
+from sklearn.model_selection import KFold
+
+from vk_text_likeness.check import check
+from vk_text_likeness.predict_main import GroupPredict
+
+if __name__ == '__main__':
+    group_id = int(os.sys.argv[1])
+    assert group_id > 0
+    access_token = os.sys.argv[2]
+
+    group_predict = GroupPredict(group_id, access_token)
+    group_predict.prepare()
+
+    cv = KFold(5, shuffle=True)
+    i = 0
+    for train_index, test_index in cv.split(group_predict.action_data.get_all()):
+        print('\nCV: iter #{}'.format(i))
+        group_predict.fit(train_index)
+        predictions_df = group_predict.predict(test_index)
+        true_df = group_predict.get_true(predictions_df.index)
+        check(predictions_df, true_df)
+        i += 1
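Note (not part of the commit): KFold(5, shuffle=True) above splits the rows of group_predict.action_data.get_all() into five folds and yields positional index arrays on each iteration, which is why fit() and predict() now take an optional indexes argument. A minimal standalone sketch of what cv.split produces, on toy data:

    import numpy as np
    from sklearn.model_selection import KFold

    rows = np.arange(10)              # stand-in for the rows of action_data.get_all()
    cv = KFold(5, shuffle=True)       # five folds over shuffled row positions
    for train_index, test_index in cv.split(rows):
        # each iteration: ~8 positional indices for training, ~2 held out for testing
        print(train_index, test_index)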
19 changes: 19 additions & 0 deletions vk_text_likeness/check.py
@@ -0,0 +1,19 @@
+import numpy as np
+from scipy.stats.stats import pearsonr
+
+
+def check(predictions_df, true_df):
+    index = [ind for ind in predictions_df.index if ind in true_df.index]
+    for column in ['direct_likes_count', 'direct_reposts_count', 'non_direct_likes_count', 'non_direct_reposts_count']:
+        x = predictions_df[column].loc[index]
+        y_raw = true_df[column]
+        y = [y_raw.loc[ind] for ind in index]
+        x = x.values
+        y = np.array(y)
+
+        rmse = np.sqrt(np.mean((x - y) ** 2))
+        corr, pval = pearsonr(x, y)
+
+        print(column)
+        print('\t', 'rmse:', rmse)
+        print('\t', 'corr:', corr)
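For reference, a toy call of the new check() helper (illustrative only; the frames and numbers below are made up): only the index labels present in both frames are compared, and an RMSE plus Pearson correlation is printed for each of the four count columns.

    import pandas as pd

    from vk_text_likeness.check import check

    cols = ['direct_likes_count', 'direct_reposts_count',
            'non_direct_likes_count', 'non_direct_reposts_count']
    predictions_df = pd.DataFrame(
        [[3, 1, 0, 2], [5, 2, 1, 0], [8, 4, 3, 1], [9, 9, 9, 9]],
        index=[101, 102, 103, 999], columns=cols)
    true_df = pd.DataFrame(
        [[4, 1, 0, 2], [5, 3, 2, 1], [7, 4, 2, 0]],
        index=[101, 102, 103], columns=cols)
    check(predictions_df, true_df)  # 999 has no ground-truth row, so it is skipped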
28 changes: 18 additions & 10 deletions vk_text_likeness/predict_main.py
@@ -21,13 +21,18 @@ def __init__(self, group_id, vk_access_token):
         self.group_id = group_id
         self.vk_session = vk_api.VkApi(token=vk_access_token)

+    def prepare(self):
+        print('GroupPredict.prepare for group {}'.format(self.group_id))
         self._init_raw_users_data()
         self._init_raw_wall_data()
         self._init_raw_users_data_more()
         self._init_table_users_data()
         self._init_table_wall_data()
         self._init_action_data()
-        self._init_predict_action_model()
+
+    def fit(self, indexes=None):
+        print('GroupPredict.fit for group {}'.format(self.group_id))
+        self._init_predict_action_model(indexes)
         self._init_predict_stats_model()

     def _init_raw_users_data(self):
@@ -93,18 +98,21 @@ def _init_action_data(self):

         self._save_pickle('action_data.table', self.action_data.table)

-    def _init_predict_action_model(self):
+    def _init_predict_action_model(self, indexes):
         self.predict_action_model = PredictActionModel(self.action_data)

-        self.predict_action_model.like_model = self._try_load_pickle('predict_action_model.like_model')
-        self.predict_action_model.repost_model = self._try_load_pickle('predict_action_model.repost_model')
+        if indexes is None:
+            self.predict_action_model.like_model = self._try_load_pickle('predict_action_model.like_model')
+            self.predict_action_model.repost_model = self._try_load_pickle('predict_action_model.repost_model')
+            self.predict_action_model.is_fitted = True

-        if self.predict_action_model.like_model is None or self.predict_action_model.repost_model:
+        if not self.predict_action_model.is_fitted:
             self.predict_action_model = PredictActionModel(self.action_data)
-            self.predict_action_model.fit()
+            self.predict_action_model.fit(indexes)

-        self._save_pickle('predict_action_model.like_model', self.predict_action_model.like_model)
-        self._save_pickle('predict_action_model.repost_model', self.predict_action_model.repost_model)
+        if indexes is None:
+            self._save_pickle('predict_action_model.like_model', self.predict_action_model.like_model)
+            self._save_pickle('predict_action_model.repost_model', self.predict_action_model.repost_model)

     def _init_predict_stats_model(self):
         self.predict_stats_model = PredictStatsModel(self.predict_action_model, self.raw_users_data, self.action_data)
@@ -126,9 +134,9 @@ def _save_pickle(self, name, obj):
         except IOError as e:
             print('Can\'t save pickle {}:'.format(name), e)

-    def predict(self):
+    def predict(self, indexes=None):
         print('GroupPredict.predict for group {}'.format(self.group_id))
-        return self.predict_stats_model.predict()
+        return self.predict_stats_model.predict(indexes)

     def get_true(self, subset=None):
         print('GroupPredict.get_true for group {}'.format(self.group_id))
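A note on the hunk above: the pickle cache for the action models is only read or written when indexes is None, i.e. on a full fit; a fold-specific fit always retrains and never touches the cache, so cross-validation cannot overwrite models trained on the full data. A minimal sketch of that gating pattern in isolation (illustrative only; load_cached and save_cached stand in for the class's _try_load_pickle and _save_pickle helpers):

    def fit_action_model(model, indexes, load_cached, save_cached):
        if indexes is None:                  # full fit: try the cached estimator first
            cached = load_cached('predict_action_model.like_model')
            if cached is not None:
                model.like_model = cached
                model.is_fitted = True
        if not model.is_fitted:              # cache miss or CV fold: train from scratch
            model.fit(indexes)
        if indexes is None:                  # only a full fit refreshes the cache
            save_cached('predict_action_model.like_model', model.like_model)
        return model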
14 changes: 10 additions & 4 deletions vk_text_likeness/predict_model.py
@@ -12,17 +12,23 @@ def __init__(self, action_data):
         self.action_data = action_data
         self.like_model = RandomForestClassifier(n_jobs=-1)
         self.repost_model = RandomForestClassifier(n_jobs=-1)
+        self.is_fitted = False

-    def fit(self):
+    def fit(self, indexes=None):
         df = self.action_data.get_all()
+        if indexes is not None:
+            df = df.iloc[indexes]
         log_method_begin()
         x_df = df.drop(['user_id', 'post_id', 'is_member', 'is_liked', 'is_reposted'], axis=1)
         self.like_model.fit(x_df, df['is_liked'])
         self.repost_model.fit(x_df, df['is_reposted'])
+        self.is_fitted = True
         log_method_end()

-    def predict(self):
+    def predict(self, indexes=None):
         df = self.action_data.get_all()
+        if indexes is not None:
+            df = df.iloc[indexes]
         log_method_begin()
         x_df = df.drop(['user_id', 'post_id', 'is_member', 'is_liked', 'is_reposted'], axis=1)
         pred = [df['user_id'], df['post_id'], df['is_member'], self.like_model.predict(x_df), self.repost_model.predict(x_df)]
@@ -37,15 +43,15 @@ def __init__(self, predict_action_model, raw_users_data, action_data):
         self.raw_users_data = raw_users_data
         self.action_data = action_data

-    def predict(self):
+    def predict(self, indexes=None):
         log_method_begin()

         direct_likes_count = Counter()
         direct_reposts_count = Counter()
         non_direct_likes_count = Counter()
         non_direct_reposts_count = Counter()

-        pred_df = self.predict_action_model.predict()
+        pred_df = self.predict_action_model.predict(indexes)
         for i, row in pred_df.iterrows():
             if row['is_liked']:
                 if row['is_member']:
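One last detail worth spelling out: PredictActionModel.fit and .predict slice the action table with df.iloc[indexes], i.e. by position, which matches the positional arrays KFold emits, while main_cv.py hands GroupPredict.get_true the label-based predictions_df.index. A tiny illustration of that difference on a toy frame (not the project's data):

    import pandas as pd

    df = pd.DataFrame({'is_liked': [0, 1, 1]}, index=[101, 102, 103])
    print(df.iloc[[0, 2]])     # positional: rows 0 and 2, i.e. labels 101 and 103
    print(df.loc[[101, 103]])  # label-based: the same two rows selected by index label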
