In [5]:
import datetime
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

from tools import f1_score

# Input/output locations, relative to the notebook's working directory.
path_to_data = "../../data/"
path_to_submissions = "../../submissions/"

# Hyper-parameters of the model; they are also printed in the run header.
parameters = {"n_estimators": 10}

# Read the feature tables for the training and the testing split.
training = pd.read_csv(path_to_data + "training_features.txt")
testing = pd.read_csv(path_to_data + "testing_features.txt")

# Apply the same clean-up to both frames.
for frame in (training, testing):
    # "my_index" is removed before modelling
    # (presumably a leftover row index from the feature export -- TODO confirm).
    del frame["my_index"]
    # An infinite shortest_path is replaced by the sentinel -1
    # (presumably inf marks "no path between the two nodes" -- TODO confirm).
    frame["shortest_path"] = frame["shortest_path"].replace(float("inf"), -1)

# Features requested for this run. Only the ones that actually exist as
# columns of the training frame end up in the design matrices.
my_features_string = [
    "overlap_title",
    "date_diff",
    "common_author",
    "journal_similarity",
    "overlapping_words_abstract",
    "cosine_distance",
    "shortest_path",
    "jaccard",
    "adar",
    "preferential_attachment",
    "resource_allocation_index",
    "out_neighbors",
    "in_neighbors",
    "popularity",
    "common_neighbors"
]

# my_features_index: positions (within training.columns) of the selected features.
# my_features_dic:   column position inside X_train / X_test -> feature name.
my_features_index = []
my_features_dic = {}

# Position of the label column; stays 0 when no column is named "target".
target = 0
for position, column in enumerate(training.columns):
    if column == "target":
        target = position
    elif column in my_features_string:
        my_features_dic[len(my_features_index)] = column
        my_features_index.append(position)

# Selected feature names, in the same order as the columns of X_train.
selected_features = [my_features_dic[j] for j in range(len(my_features_index))]

# Requested features that the training data does not provide are not used by
# the model; say so instead of dropping them silently.
ignored_features = [name for name in my_features_string if name not in selected_features]
if ignored_features:
    print("warning: requested features missing from training data (ignored): "
          + str(ignored_features))

# Fail early with a readable message if the test file cannot supply a feature.
absent_from_testing = [name for name in selected_features if name not in testing.columns]
if absent_from_testing:
    raise KeyError("testing data lacks feature columns: " + str(absent_from_testing))

# separating features and labels
training_val = training.values
testing_val = testing.values
X_train = training_val[:, my_features_index].astype(float)
Y_train = training_val[:, target].astype(int)
# The positions in my_features_index were computed on the TRAINING columns.
# Reusing them on the testing array is only correct when both files share the
# exact same column layout (e.g. a test file without a "target" column shifts
# every later feature by one). Selecting by column name is layout-independent,
# and the float cast keeps X_test consistent with X_train.
X_test = testing[selected_features].values.astype(float)

# Run header: timestamp, requested feature list, model name and hyper-parameters,
# followed by the title of the cross-validation scores printed afterwards.
now = datetime.datetime.now()
header_lines = (
    "date: " + str(now),
    "features: " + str(my_features_string),
    "model: Random Forest",
    "parameters:",
    str(parameters),
    "cross validation:",
)
for header_line in header_lines:
    print(header_line)

# Build the classifier from the whole `parameters` dict so that the logged
# parameters are exactly the ones the model uses; adding e.g. "random_state"
# to the dict is then enough to make a run reproducible.
RF = RandomForestClassifier(**parameters)

# k-fold cross validation. Each fold's model also predicts the full test set;
# the k prediction vectors are stored column-wise and combined by a vote below.
k = 5
kf = KFold(k)
predictions = np.zeros((X_test.shape[0], k))

for i, (train_index, test_index) in enumerate(kf.split(X_train, Y_train)):
    # The same estimator object is re-fitted on every fold, so after the loop
    # RF holds the model trained on the last fold only.
    RF.fit(X_train[train_index], Y_train[train_index])
    Y_pred = RF.predict(X_train[test_index])
    Y_pred_train = RF.predict(X_train[train_index])
    predictions[:, i] = RF.predict(X_test)
    print("train: "+str(f1_score(Y_train[train_index], Y_pred_train)))
    print("test: "+str(f1_score(Y_train[test_index], Y_pred)))

# Majority vote across the k fold models: predict 1 when strictly more than
# half of them did (assumes labels are 0/1 -- TODO confirm). The threshold is
# derived from k instead of the former hard-coded 2.5, which was only correct
# for k == 5. With an even k, a tie resolves to 0.
Y_test = (np.sum(predictions, axis=1) > k / 2.0).astype(int)

# Write the predictions as a submission file named after the requested features,
# with the row number as "id" and the predicted label as "category".
submission_path = path_to_submissions + "-".join(my_features_string) + ".csv"
submission = pd.DataFrame(Y_test)
submission.to_csv(
    submission_path,
    index=True,
    index_label="id",
    header=["category"],
)
# Placeholder line: the score itself is not computed here.
print("kaggle score: ")

# Feature importances of RF as currently fitted, one "name: value" line per
# feature; my_features_dic maps the importance position to the feature name.
for position, importance in enumerate(RF.feature_importances_):
    print(str(my_features_dic[position]) + ": " + str(importance))

date: 2018-02-16 16:16:34.322166
features: ['overlap_title', 'date_diff', 'common_author', 'journal_similarity', 'overlapping_words_abstract', 'cosine_distance', 'shortest_path', 'jaccard', 'adar', 'preferential_attachment', 'resource_allocation_index', 'out_neighbors', 'in_neighbors', 'popularity', 'common_neighbors']
model: Random Forest
parameters:
{'n_estimators': 10}
cross validation:
train: 0.9966042778250185
test: 0.9720086406139066
train: 0.9967559756127237
test: 0.9717386898461955
train: 0.9965911639381028
test: 0.9717295528568946
train: 0.9965775073031881
test: 0.9722326963394218
train: 0.9965775816654026
test: 0.9718838998969885
kaggle score: 
overlap_title: 0.01665081496613972
date_diff: 0.02190514883983991
common_author: 0.005109300600450039
journal_similarity: 0.002403034365747304
shortest_path: 0.019781629572646377
overlapping_words_abstract: 0.01535330054775155
jaccard: 0.19108201273444772
adar: 0.006316136251304461
preferential_attachment: 0.052909861150268744
resource

In [3]:
# Display the first rows of the training frame as a quick sanity check of its columns.
training.head()

Unnamed: 0,id1,id2,target,overlap_title,date_diff,common_author,journal_similarity,shortest_path,overlapping_words_abstract,jaccard,adar,preferential_attachment,resource_allocation_index,out_neighbors,in_neighbors,popularity
0,9510123,9502114,1,2,0,0,2,-1.0,4,0.066667,0.513898,55.0,0.142857,2.0,7.0,76.0
1,9707075,9604178,1,1,1,0,0,2.0,7,0.098039,4.320366,11388.0,0.226401,67.0,123.0,4019.0
2,9312155,9506142,0,0,-2,0,0,-1.0,6,0.0,0.0,5.0,0.0,0.0,2.0,8.0
3,9911255,302165,0,0,-4,0,0,-1.0,8,0.0,0.0,280.0,0.0,16.0,2.0,3.0
4,9701033,209076,0,0,-5,0,0,-1.0,8,0.0,0.0,168.0,0.0,0.0,2.0,1.0
