In [1]:
from collections import namedtuple

# One record per text line of the dataset. When populated by load_stats the
# values come straight from csv.DictReader, so every field is a string
# (including line_num, total_lines_in_text and is_header).
LineStat = namedtuple(
    'LineStat',
    ['source', 'cleaned', 'line_num', 'total_lines_in_text', 'is_header'],
)

In [19]:
import csv

def load_stats(filename):
    """Read a CSV file of per-line statistics into a list of LineStat records.

    The first CSV row is used as the header, and each following row is
    unpacked by column name into ``LineStat``; the header columns therefore
    have to match the LineStat field names. Values are kept as strings.

    Args:
        filename: Path of the CSV file; it is opened as UTF-8.

    Returns:
        A list with one LineStat per data row, in file order.
    """
    # newline='' leaves newline handling to the csv module.
    with open(filename, newline='', encoding='utf-8') as handle:
        return [LineStat(**record) for record in csv.DictReader(handle)]

In [21]:
# Set the csv parser's maximum field size to 1,000,000 so that rows with very
# long fields can be read without a "field larger than field limit" error.
csv.field_size_limit(1000000)

# Load both splits as lists of LineStat records: 'learn' is later used to fit
# the classifier and 'test' to evaluate it. Print the record counts as a
# quick check that both files were read.
lines_learn = load_stats('learn.csv')
print(f"Records in the learn set: {len(lines_learn)}")
lines_test = load_stats('test.csv')
print(f"Records in the test set: {len(lines_test)}")


Records in the learn set: 146192
Records in the test set: 62654


In [68]:
def features_and_result_from_stats(data):
    """Turn line-statistics records into feature vectors and header labels.

    Args:
        data: Iterable of records exposing ``cleaned``, ``line_num``,
            ``total_lines_in_text`` and ``is_header`` attributes (for
            example ``LineStat`` rows). The two numeric fields may be
            strings; they are converted with ``int()``.

    Returns:
        A pair ``(x, y)`` of parallel lists. Each element of ``x`` is
        ``[row_length, line_num, total_lines_in_text, relative_pos,
        started_from_alphanum, started_from_lowercase, is_title,
        has_bad_chars]``. Each element of ``y`` is True when the record's
        ``is_header`` is the string ``'True'`` (or the bool ``True``).

    Raises:
        ValueError: If ``line_num`` or ``total_lines_in_text`` is a string
            that cannot be parsed as an integer.
    """
    bad_chars = set('#=./<>|(){}:[];')
    x, y = [], []
    for row in data:
        text = row.cleaned
        line_num = int(row.line_num)
        total_lines_in_text = int(row.total_lines_in_text)

        # A zero total would otherwise raise ZeroDivisionError; report the
        # position of such a degenerate record as 0.0 instead.
        if total_lines_in_text:
            relative_pos = line_num / total_lines_in_text
        else:
            relative_pos = 0.0

        # text[:1] is '' for an empty line (text[0] would raise IndexError),
        # and both ''.isalnum() and ''.islower() are False.
        first_char = text[:1]

        x.append([
            len(text),                          # row_length
            line_num,
            total_lines_in_text,
            relative_pos,
            first_char.isalnum(),               # started_from_alphanum
            first_char.islower(),               # started_from_lowercase
            text.istitle(),                     # is_title
            not bad_chars.isdisjoint(text),     # has_bad_chars
        ])
        # CSV rows carry the label as the text 'True'/'False'; str() lets
        # records that already hold a real bool work as well.
        y.append(str(row.is_header) == 'True')
    return x, y

In [69]:
# Build the feature vectors (X) and boolean header labels (y) for the learn
# split and for the test split.
X_train, y_train = features_and_result_from_stats(lines_learn)
X_test, y_test = features_and_result_from_stats(lines_test)

In [70]:
# Sanity check: show the first ten feature vectors and labels of each split.
# Feature order: row_length, line_num, total_lines_in_text, relative_pos,
# started_from_alphanum, started_from_lowercase, is_title, has_bad_chars.
print(X_train[:10])
print(y_train[:10])
print(X_test[:10])
print(y_test[:10])

[[3, 275, 397, 0.6926952141057935, False, False, False, False], [62, 971, 1006, 0.9652087475149106, False, False, False, True], [3, 1340, 2945, 0.45500848896434637, False, False, False, False], [10, 130, 235, 0.5531914893617021, False, False, True, True], [85, 2002, 2855, 0.7012259194395797, False, False, False, True], [85, 295, 304, 0.9703947368421053, False, False, False, True], [3, 91, 179, 0.5083798882681564, False, False, False, False], [26, 48, 467, 0.10278372591006424, True, False, False, False], [66, 179, 199, 0.8994974874371859, True, False, False, False], [156, 212, 350, 0.6057142857142858, False, False, False, True]]
[False, False, False, False, False, False, False, True, False, False]
[[63, 12, 29, 0.41379310344827586, False, False, False, True], [15, 465, 754, 0.616710875331565, False, False, False, True], [114, 544, 609, 0.8932676518883416, False, False, False, True], [65, 9, 24, 0.375, True, False, False, True], [63, 41, 60, 0.6833333333333333, False, False, False, True]

In [71]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees. random_state is pinned so that re-running the
# notebook from a fresh kernel trains the same model and yields the same
# metrics; without it every run gives slightly different numbers.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [72]:
# Fit the classifier on the learn split.
clf.fit(X_train,y_train)

# Predict the header label for every record of the test split.
y_pred=clf.predict(X_test)

In [73]:
from sklearn import metrics
# Compare the predictions with the true test labels.
# Accuracy is the fraction of lines classified correctly. Precision, recall
# and F1 use scikit-learn's default binary settings, i.e. they are reported
# for the positive class (True = header line).
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1:",metrics.f1_score(y_test, y_pred))


Accuracy: 0.9751013502729275
Precision: 0.8495630461922596
Recall: 0.7163157894736842
F1: 0.7772701313535122


True