# Submission to Kaggle

This notebook contains the pipeline for generating predictions for Kaggle submissions.

In [2]:
import pickle
import warnings
import swifter
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz, process
from lightgbm import LGBMClassifier
from nltk.corpus import stopwords
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from sentence_transformers import SentenceTransformer

from scipy.spatial.distance import cosine
# Sentence-BERT encoder; appears unused by the prediction cells in this section.
bert_model = SentenceTransformer('bert-base-nli-mean-tokens')
# NLTK English stop-word list; also appears unused in this section.
stop = stopwords.words('english')
# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Load the prepared training and test tables from ./data.
df_clean, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/df_clean.csv', './data/df_test.csv')
)

In [72]:
# Training feature matrix: every column from 'sim' onwards, indexed by 'id',
# with 'query_len' one-hot encoded (first level dropped).
# NOTE(review): assumes 'median_relevance' sits before 'sim' in the CSV so the
# target is not included in the features - confirm.
X = pd.get_dummies(
    df_clean.set_index('id').loc[:, 'sim':],
    columns=['query_len'],
    drop_first=True,
)

In [73]:
# Test feature matrix: same column slice as the training matrix, indexed by 'id'.
# All 'query_len' levels are kept here (no drop_first); the later column
# alignment against the training matrix discards any column training lacks.
X_test = pd.get_dummies(
    df_test.set_index('id').loc[:, 'sim':],
    columns=['query_len'],
)

In [75]:
# Target vector, indexed by 'id' so it lines up row-for-row with X.
y = df_clean.set_index('id').loc[:, 'median_relevance']

In [76]:
# Reindex the test matrix to exactly the training columns (left join on axis=1).
# Columns that exist only in the test set are dropped. Columns that exist only
# in training (e.g. a 'query_len' dummy level never seen in test) are created
# in the test matrix and filled with 0 instead of NaN, so downstream scalers
# and estimators do not fail on missing values.
final_train, final_test = X.align(X_test, join='left', axis=1, fill_value=0)

In [77]:
# Sanity check: both matrices must have the same number of columns.
print(f'{final_train.shape} {final_test.shape}')

(10158, 728) (22513, 728)


### Dummy Classifier

In [78]:
# Baseline: always predict the most frequent training label.
dummy = DummyClassifier(strategy='most_frequent').fit(X, y)
dum_pred = dummy.predict(X_test)

In [79]:
# One 'prediction' column keyed by the test-set ids.
dum_df = pd.DataFrame({'prediction': dum_pred}, index=X_test.index)

In [80]:
# Write the baseline submission (id index + 'prediction' column).
# File name corrected from 'dummy_sumbit.csv' so it follows the same
# '<model>_submit.csv' pattern as the other submission files.
dum_df.to_csv('./data/dummy_submit.csv')

### Transformation

Standardize the 'sim' and 'fuzzy' columns (zero mean, unit variance) to allow better model fitting.

In [83]:
# Columns to standardize; all other columns pass through unchanged.
col = ['sim', 'fuzzy']

ct = ColumnTransformer(
    transformers=[('std_trans', StandardScaler(), col)],
    remainder='passthrough',
)
# Fit the scaler on the training data only, then reuse it on the test data.
train_x = ct.fit_transform(final_train)
test_x = ct.transform(final_test)

In [84]:
# L2-regularised logistic regression with class-balanced weights.
lr = LogisticRegression(
    penalty='l2',
    solver='newton-cg',
    class_weight='balanced',
    max_iter=2100,
)
lr.fit(train_x, y)
pred_lr = lr.predict(test_x)

In [85]:
# One 'prediction' column keyed by the test-set ids.
lr_df = pd.DataFrame({'prediction': pred_lr}, index=X_test.index)

In [86]:
# Write the logistic-regression submission; the id index is written as the first column.
lr_df.to_csv(path_or_buf='./data/lr_submit.csv', index=True)

In [87]:
# RBF-kernel SVM with class-balanced weights.
# Per the scikit-learn docs, `degree` only applies to the 'poly' kernel, so it
# has no effect here; it is kept so the estimator parameters stay unchanged.
svc = SVC(
    C=10,
    kernel='rbf',
    gamma=0.1,
    degree=2,
    class_weight='balanced',
    decision_function_shape='ovo',
)
svc.fit(train_x, y)
pred_rbf = svc.predict(test_x)

In [88]:
# One 'prediction' column keyed by the test-set ids.
svc_df = pd.DataFrame({'prediction': pred_rbf}, index=X_test.index)

In [89]:
# Write the SVM submission; the id index is written as the first column.
svc_df.to_csv(path_or_buf='./data/svc_submit.csv', index=True)

In [92]:
# Extra-trees ensemble on bootstrap samples.
# random_state is fixed (same seed value as the XGBoost cell) so that
# re-running the notebook regenerates the same submission file; without it
# every run draws different bootstrap samples and split thresholds.
et = ExtraTreesClassifier(
    n_estimators=2600,
    min_samples_split=5,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=27,
).fit(train_x, y)
pred_et = et.predict(test_x)
# One 'prediction' column keyed by the test-set ids.
et_df = pd.DataFrame(pred_et, index=X_test.index, columns=['prediction'])

In [93]:
# Write the extra-trees submission; the id index is written as the first column.
et_df.to_csv(path_or_buf='./data/et_submit.csv', index=True)

In [90]:
# Random forest.
# random_state is fixed (same seed value as the XGBoost cell) so that
# re-running the notebook regenerates the same submission file; without it
# every run draws different bootstrap samples and feature subsets.
rf = RandomForestClassifier(
    n_estimators=1000,
    min_samples_split=15,
    random_state=27,
)
rf.fit(train_x, y)
pred_rf = rf.predict(test_x)
# One 'prediction' column keyed by the test-set ids.
rf_df = pd.DataFrame(pred_rf, index=X_test.index, columns=['prediction'])

In [91]:
# Write the random-forest submission; the id index is written as the first column.
rf_df.to_csv(path_or_buf='./data/rf_submit.csv', index=True)

In [94]:
# Gradient-boosted trees (XGBoost), multi-class softmax objective.
# NOTE(review): num_class=9 is hard-coded - confirm it matches the number of
# distinct labels in y.
# NOTE(review): `nthread` and `seed` look like older spellings of `n_jobs` and
# `random_state` - confirm against the installed xgboost version.
xgb = XGBClassifier(
    objective='multi:softmax',
    num_class=9,
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    seed=27,
)
xgb.fit(train_x, y)
pred_xgb = xgb.predict(test_x)
# One 'prediction' column keyed by the test-set ids.
xgb_df = pd.DataFrame({'prediction': pred_xgb}, index=X_test.index)

In [95]:
# Write the XGBoost submission; the id index is written as the first column.
xgb_df.to_csv(path_or_buf='./data/xgb_submit.csv', index=True)

In [96]:
# Gradient-boosted trees (LightGBM), multi-class objective, class-balanced weights.
lgb = LGBMClassifier(
    objective='multiclass',
    boosting_type='gbdt',
    n_estimators=1000,
    learning_rate=.03,
    colsample_bytree=0.8,
    class_weight='balanced',
)
lgb.fit(train_x, y)
pred_lgb = lgb.predict(test_x)
# One 'prediction' column keyed by the test-set ids.
lgb_df = pd.DataFrame({'prediction': pred_lgb}, index=X_test.index)

In [97]:
# Write the LightGBM submission; the id index is written as the first column.
lgb_df.to_csv(path_or_buf='./data/lgb_submit.csv', index=True)