# Submission to Kaggle

This notebook contains the pipeline for generating predictions for Kaggle submissions.

In [1]:
import pickle
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import swifter
from nltk.corpus import stopwords
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

# Silence FutureWarning messages emitted by the libraries used in this notebook.
warnings.simplefilter('ignore', FutureWarning)

# English stop-word list from the NLTK stopwords corpus reader.
stop = stopwords.words('english')

In [2]:
# Load the prepared training and test tables from the local data folder.
df_clean, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/df_clean.csv', './data/df_test.csv')
)

In [3]:
# Training features: index by 'id', keep every column from 'sim' through the
# last one, then one-hot encode 'query_len' (first level dropped).
# NOTE(review): confirm the target column sits before 'sim' in df_clean,
# otherwise this slice would carry it into the feature matrix.
X = pd.get_dummies(
    df_clean.set_index('id').loc[:, 'sim':],
    columns=['query_len'],
    drop_first=True,
)

In [4]:
# Test features: same column slice as the training frame. All 'query_len'
# dummy levels are kept here; the later left-align against the training
# columns decides which of them survive.
X_test = pd.get_dummies(
    df_test.set_index('id').loc[:, 'sim':],
    columns=['query_len'],
)

In [5]:
# Target vector, indexed by 'id' so it lines up row-for-row with X.
y = df_clean.set_index('id').loc[:, 'median_relevance']

In [6]:
# Give the test frame exactly the training columns (same order). Columns that
# exist only in the test frame are dropped. A training column that is absent
# from the test frame (e.g. a 'query_len' dummy level never seen in the test
# data) is created and filled with 0 instead of the default NaN, because the
# estimators fitted below cannot predict on NaN and "level not present" is 0
# for a one-hot column.
final_train, final_test = X.align(X_test, join='left', axis=1, fill_value=0)

In [7]:
# Sanity check: both frames must report the same number of columns.
print(*(frame.shape for frame in (final_train, final_test)))

(10158, 837) (22513, 837)


### Dummy Classifier

In [8]:
# Baseline submission: always predict the most frequent class seen in y.
dummy = DummyClassifier(strategy='most_frequent').fit(X, y)
dum_pred = dummy.predict(X_test)

In [9]:
# Wrap the baseline predictions in a frame keyed by the test ids.
dum_df = pd.DataFrame(
    data=dum_pred,
    index=X_test.index,
    columns=['prediction'],
)

In [10]:
# Write the baseline submission file (index is written as the first column).
dum_df.to_csv(path_or_buf='./data/dummy_sumbit.csv')

### Transformation

Standardize the 'sim' and 'fuzzy' columns to allow better fitting of the models below.

In [11]:
# Columns that get standardized; every other column is passed through as-is.
col = ['sim', 'fuzzy']

ct = ColumnTransformer(
    transformers=[('std_trans', StandardScaler(), col)],
    remainder='passthrough',
)

# Scaling statistics are learned on the training frame only and then
# re-applied unchanged to the test frame.
train_x = ct.fit_transform(final_train)
test_x = ct.transform(final_test)

In [12]:
# L2-penalized logistic regression with class weights balanced against the
# label frequencies, fitted on the transformed training matrix.
lr = LogisticRegression(
    penalty='l2',
    solver='newton-cg',
    class_weight='balanced',
    max_iter=2100,
).fit(train_x, y)
pred_lr = lr.predict(test_x)

In [13]:
# Logistic-regression predictions keyed by the test ids.
lr_df = pd.DataFrame(
    data=pred_lr,
    index=X_test.index,
    columns=['prediction'],
)

In [14]:
# Write the logistic-regression submission file.
lr_df.to_csv(path_or_buf='./data/lr_submit.csv')

In [15]:
# RBF-kernel support vector classifier with balanced class weights.
# Note: per the scikit-learn docs, `degree` only applies to the 'poly' kernel
# and is ignored for 'rbf'; it is kept here so the configuration is unchanged.
svc = SVC(
    kernel='rbf',
    C=10,
    gamma=0.1,
    degree=2,
    class_weight='balanced',
    decision_function_shape='ovo',
)
svc.fit(train_x, y)
pred_rbf = svc.predict(test_x)

In [16]:
# SVC predictions keyed by the test ids.
svc_df = pd.DataFrame(
    data=pred_rbf,
    index=X_test.index,
    columns=['prediction'],
)

In [17]:
# Write the SVC submission file.
svc_df.to_csv(path_or_buf='./data/svc_submit.csv')

In [18]:
# Extra-trees ensemble fitted on the transformed training matrix.
# `random_state` is pinned so that re-running the notebook reproduces the
# same forest and therefore the same submission file; without it every run
# draws different bootstrap samples and split candidates.
et = ExtraTreesClassifier(n_estimators=2600,
                          min_samples_split=5,
                          bootstrap=True,
                          min_samples_leaf=1,
                          random_state=27).fit(train_x, y)
pred_et = et.predict(test_x)

# Predictions keyed by the test ids, ready to be written as a submission.
et_df = pd.DataFrame(pred_et, index=X_test.index, columns=['prediction'])

In [19]:
# Write the extra-trees submission file.
et_df.to_csv(path_or_buf='./data/et_submit.csv')

In [20]:
# Random forest fitted on the transformed training matrix.
# `random_state` is pinned so that re-running the notebook reproduces the
# same forest and therefore the same submission file; without it every run
# draws different bootstrap samples and feature subsets.
rf = RandomForestClassifier(
    n_estimators=2400, min_samples_split=10, random_state=27)
rf.fit(train_x, y)
pred_rf = rf.predict(test_x)

# Predictions keyed by the test ids, ready to be written as a submission.
rf_df = pd.DataFrame(pred_rf, index=X_test.index, columns=['prediction'])

In [21]:
# Write the random-forest submission file.
rf_df.to_csv(path_or_buf='./data/rf_submit.csv')

In [31]:
# Recent XGBoost releases require class labels to be the consecutive integers
# 0..K-1 and raise a ValueError otherwise. Encode the labels explicitly and
# keep the sorted original values so predictions can be mapped back. This is
# a no-op when the labels already are 0..K-1.
xgb_classes, y_xgb = np.unique(y, return_inverse=True)

# Gradient-boosted trees with a multi-class softmax objective.
# `num_class` is derived from the labels actually present, and the seed is
# passed through the scikit-learn style `random_state` parameter.
xgb = XGBClassifier(learning_rate=0.05,
                    n_estimators=1300,
                    max_depth=10,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='multi:softmax',
                    num_class=len(xgb_classes),
                    random_state=27)
xgb.fit(train_x, y_xgb)

# The model predicts encoded class indices; translate them back to the
# original label values before building the submission frame.
pred_xgb = xgb_classes[np.asarray(xgb.predict(test_x)).astype(int)]
xgb_df = pd.DataFrame(pred_xgb, index=X_test.index, columns=['prediction'])

In [32]:
# Write the XGBoost submission file.
xgb_df.to_csv(path_or_buf='./data/xgb_submit.csv')