In [3]:
import numpy as np
import pandas as pd

In [6]:
# Read the training set, the test set and the sample submission,
# using the first column of each CSV file as the row index.
csv_paths = ('resources/train.csv', 'resources/test.csv', 'resources/sampleSubmission.csv')
df, test_df, sub_df = (pd.read_csv(csv_path, index_col=0) for csv_path in csv_paths)
# Display the three shapes as the cell output.
tuple(frame.shape for frame in (df, test_df, sub_df))

((50000, 801), (50000, 401), (50000, 400))

In [9]:
# Split the training frame by column position:
#   column 0          -> delta (kept as a pandas Series)
#   columns 1..400    -> Y, a (rows, 400) array
#   columns 401..end  -> X, reshaped to one 20x20 grid per row
delta = df.iloc[:, 0]
y_columns = df.iloc[:, 1:401]
x_columns = df.iloc[:, 401:]
Y = y_columns.values
X = x_columns.values.reshape(-1, 20, 20)
# Display the three shapes as the cell output.
tuple(part.shape for part in (delta, Y, X))

((50000,), (50000, 400), (50000, 20, 20))

In [23]:
from preproc import *

In [12]:
from sklearn.linear_model import LogisticRegression

# Fit one logistic-regression model per delta value (1..5) and write its test
# predictions into the matching rows of sub_df. The window size passed to the
# preproc helpers grows with delta: 3, 5, 7, 9, 11.
models = []
for i in range(1, 6):
    kernel_size = 2 * i + 1
    print(f"[{i}/5]")

    # Training samples whose delta equals i.
    train_mask = delta == i
    X_i, Y_i = X[train_mask], Y[train_mask]
    X_train, Y_train = prepare_data(X_i, Y_i, kernel_size=kernel_size)

    lr = LogisticRegression(random_state=42, solver='lbfgs', max_iter=300)
    lr.fit(X_train, Y_train)
    models.append(lr)

    # Test rows with the same delta; the first column is dropped before
    # windowing (presumably the delta column itself -- confirm).
    test_mask = test_df.delta == i
    X_test = test_df[test_mask].iloc[:, 1:]
    X_test_windows = df_to_windows(X_test.values, kernel_size=kernel_size)

    # Reshape the flat predictions to 400 values per row and store them in the
    # rows of sub_df selected by the same mask.
    test_predictions = lr.predict(X_test_windows).reshape(-1, 400)
    sub_df[test_mask] = test_predictions

[1/5]
[2/5]
[3/5]
[4/5]
[5/5]


In [13]:
# Write the logistic-regression predictions to disk.
# 0.14386 is the figure the author recorded for this file (presumably its leaderboard score -- confirm).
sub_df.to_csv(path_or_buf="lr_submission.csv")

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Fit one 15-tree random forest per delta value (1..5) and write its test
# predictions into the matching rows of sub_df. The window size passed to the
# preproc helpers grows with delta: 3, 5, 7, 9, 11.
models = []
for i in range(1, 6):
    kernel_size = 2 * i + 1
    print(f"[{i}/5]")

    # Training samples whose delta equals i.
    train_mask = delta == i
    X_i, Y_i = X[train_mask], Y[train_mask]
    X_train, Y_train = prepare_data(X_i, Y_i, kernel_size=kernel_size)

    rf = RandomForestClassifier(n_estimators=15, random_state=42, verbose=True, n_jobs=-1)
    rf.fit(X_train, Y_train)
    models.append(rf)

    # Test rows with the same delta; the first column is dropped before
    # windowing (presumably the delta column itself -- confirm).
    test_mask = test_df.delta == i
    X_test = test_df[test_mask].iloc[:, 1:]
    X_test_windows = df_to_windows(X_test.values, kernel_size=kernel_size)

    # Reshape the flat predictions to 400 values per row and store them in the
    # rows of sub_df selected by the same mask.
    test_predictions = rf.predict(X_test_windows).reshape(-1, 400)
    sub_df[test_mask] = test_predictions

[1/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   20.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    1.5s finished


[2/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    7.2s finished


[3/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  2.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:   10.7s finished


[4/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  3.7min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:   12.4s finished


[5/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  5.1min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:   14.2s finished


In [15]:
# Write the random-forest predictions (15 trees, delta-dependent window) to disk.
# 0.13300 is the figure the author recorded for this file (presumably its leaderboard score -- confirm).
sub_df.to_csv(path_or_buf="rf_submission.csv")

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Fit one 15-tree random forest per delta value (1..5) with a fixed window
# size of 5, and write its test predictions into the matching rows of sub_df.
kernel_size = 5  # same for every delta in this run
models = []
for i in range(1, 6):
    print(f"[{i}/5]")

    # Training samples whose delta equals i.
    train_mask = delta == i
    X_i, Y_i = X[train_mask], Y[train_mask]
    X_train, Y_train = prepare_data(X_i, Y_i, kernel_size=kernel_size)

    rf = RandomForestClassifier(n_estimators=15, random_state=42, verbose=True, n_jobs=-1)
    rf.fit(X_train, Y_train)
    models.append(rf)

    # Test rows with the same delta; the first column is dropped before
    # windowing (presumably the delta column itself -- confirm).
    test_mask = test_df.delta == i
    X_test = test_df[test_mask].iloc[:, 1:]
    X_test_windows = df_to_windows(X_test.values, kernel_size=kernel_size)

    # Reshape the flat predictions to 400 values per row and store them in the
    # rows of sub_df selected by the same mask.
    test_predictions = rf.predict(X_test_windows).reshape(-1, 400)
    sub_df[test_mask] = test_predictions

[1/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    7.0s finished


[2/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    6.4s finished


[3/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    7.7s finished


[4/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    6.2s finished


[5/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    6.3s finished


In [19]:
# Write the random-forest predictions (15 trees, window size 5) to disk.
# 0.13221 is the figure the author recorded for this file (presumably its leaderboard score -- confirm).
sub_df.to_csv(path_or_buf="rf_submission_kernel_5.csv")

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Fit one 100-tree random forest per delta value (1..5) with a fixed window
# size of 5, and write its test predictions into the matching rows of sub_df.
kernel_size = 5  # same for every delta in this run
models = []
for i in range(1, 6):
    print(f"[{i}/5]")

    # Training samples whose delta equals i.
    train_mask = delta == i
    X_i, Y_i = X[train_mask], Y[train_mask]
    X_train, Y_train = prepare_data(X_i, Y_i, kernel_size=kernel_size)

    rf = RandomForestClassifier(n_estimators=100, random_state=42, verbose=True, n_jobs=-1)
    rf.fit(X_train, Y_train)
    models.append(rf)

    # Test rows with the same delta; the first column is dropped before
    # windowing (presumably the delta column itself -- confirm).
    test_mask = test_df.delta == i
    X_test = test_df[test_mask].iloc[:, 1:]
    X_test_windows = df_to_windows(X_test.values, kernel_size=kernel_size)

    # Reshape the flat predictions to 400 values per row and store them in the
    # rows of sub_df selected by the same mask.
    test_predictions = rf.predict(X_test_windows).reshape(-1, 400)
    sub_df[test_mask] = test_predictions

[1/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  7.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   45.3s finished


[2/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  7.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   22.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   47.2s finished


[3/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  7.7min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   39.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.2min finished


[4/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  7.7min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   50.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.4min finished


[5/5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  7.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   34.4s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.0min finished


In [21]:
# Write the random-forest predictions (100 trees, window size 5) to disk.
# 0.12946 is the figure the author recorded for this file (presumably its leaderboard score -- confirm).
sub_df.to_csv(path_or_buf="rf_submission_kernel_5_100.csv")