## Baseline With Only Discourse Data
### Build Raw Pipeline with:
- Simple TF-IDF vectorizer.
- Logistic regression (LogReg) model.
- Joint TF-IDF and model hyperparameter optimization.
- Out-of-fold (O.O.F.) inference.
- Discourse type added as a feature.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

import optuna
from functools import partial
from collections import OrderedDict
import scipy 

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the training table, the test table and the submission template.
tr_df, ts_df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/feedback-prize-effectiveness/train.csv",
        "../input/feedback-prize-effectiveness/test.csv",
        "../input/feedback-prize-effectiveness/sample_submission.csv",
    )
)

In [None]:
# Character count of every discourse text (values are stringified first so
# non-string entries still yield a length), kept as an extra column.
tr_df["dis_len"] = tr_df["discourse_text"].map(lambda text: len(str(text)))
ts_df["dis_len"] = ts_df["discourse_text"].map(lambda text: len(str(text)))

In [None]:
# Preview the first five training rows.
tr_df.head(n=5)

In [None]:
# Column names shared by the later cells.
text_col, target_col = "discourse_text", "discourse_effectiveness"

# Learn the label -> integer mapping from the training targets.
target_enc = LabelEncoder()
target_enc.fit(tr_df[target_col])

In [None]:
# Overwrite the target column with the integer codes produced by `target_enc`.
# NOTE(review): the column is replaced in place, so re-running this cell would
# presumably pass already-encoded values to the encoder — confirm before re-use.
tr_df[target_col] = target_enc.transform(tr_df[target_col])

In [None]:
# Fit the discourse-type encoder on the training values only, then apply the
# same mapping to both the training and the test frame.
lenc = LabelEncoder().fit(tr_df["discourse_type"])

tr_df["discourse_type"] = lenc.transform(tr_df["discourse_type"])
ts_df["discourse_type"] = lenc.transform(ts_df["discourse_type"])

In [None]:
# Sanity check: every training row should carry a distinct discourse_id.
tr_df["discourse_id"].is_unique

In [None]:
def _append_meta_features(text_vecs, frame: pd.DataFrame):
    """Append the ``discourse_type`` and ``dis_len`` columns of *frame* to *text_vecs*."""
    return scipy.sparse.hstack([
        text_vecs,
        scipy.sparse.csr_matrix(frame.discourse_type.values).T,
        scipy.sparse.csr_matrix(frame.dis_len.values).T,
    ])


def objective(trial, dset: pd.DataFrame, text_col: str = "discourse_text", target_col: str = "discourse_effectiveness", alpha: float = 0.1):
    """Optuna objective for joint TF-IDF / logistic-regression tuning.

    Samples vectorizer and model hyperparameters from ``trial``, scores them
    with 3-fold stratified cross-validation on ``dset`` and returns a
    spread-penalised log loss (lower is better).

    Parameters
    ----------
    trial
        Optuna trial used to sample hyperparameters. Parameter names follow
        the ``<group>_<component>_<hyperparameter>`` pattern.
    dset
        Frame holding ``text_col`` and ``target_col`` plus the
        ``discourse_type`` and ``dis_len`` columns, which are appended to the
        TF-IDF matrix as two extra features.
    text_col
        Name of the column with the raw text to vectorize.
    target_col
        Name of the column with the class labels.
    alpha
        Weight of the standard deviation of the fold scores in the returned
        value.

    Returns
    -------
    float
        ``mean(fold log losses) + alpha * std(fold log losses)``.
    """
    # The sampling order is kept stable so a seeded sampler reproduces runs.
    ngram = trial.suggest_categorical("preprocess_vect_ngram_range", [(1, 2), (1, 3), (2, 4), (2, 3), (1, 1)])
    # Integer bounds: the previous float literals (1e4, 1e6) only worked
    # through implicit conversion.
    max_feats = trial.suggest_int("preprocess_vect_max_features", 10_000, 1_000_000)
    lowercase = trial.suggest_categorical("preprocess_vect_lowercase", [True, False])
    stopwords = trial.suggest_categorical("preprocess_vect_stop_words", [None, "english"])

    # Same log-uniform range as before, expressed without the deprecated
    # ``suggest_loguniform``.
    C = trial.suggest_float("model_lr_C", 0.1234, 1, log=True)
    class_weight = trial.suggest_categorical("model_lr_class_weight", ["balanced", None])
    penalty = trial.suggest_categorical("model_lr_penalty", ["elasticnet"])
    solver = trial.suggest_categorical("model_lr_solver", ["saga"])
    l1_ratio = trial.suggest_float("model_lr_l1_ratio", 0.0, 1.0)

    scores = []
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    for fit_ix, val_ix in splitter.split(dset[target_col], dset[target_col]):
        fit_df, val_df = dset.iloc[fit_ix], dset.iloc[val_ix]

        # The vocabulary and IDF weights come from the fitting split only.
        vect = TfidfVectorizer(ngram_range=ngram,
                               max_features=max_feats,
                               lowercase=lowercase,
                               stop_words=stopwords)
        fit_vecs = _append_meta_features(vect.fit_transform(fit_df[text_col]), fit_df)
        val_vecs = _append_meta_features(vect.transform(val_df[text_col]), val_df)

        model = LogisticRegression(C=C,
                                   class_weight=class_weight,
                                   penalty=penalty,
                                   l1_ratio=l1_ratio,
                                   solver=solver,
                                   n_jobs=-1, max_iter=5000)
        model.fit(fit_vecs, fit_df[target_col])

        probas = model.predict_proba(val_vecs)
        scores.append(log_loss(val_df[target_col], probas))

    # Penalise unstable configurations. The spread must be taken over all
    # fold scores; the std of a single scalar is always 0.
    return np.mean(scores) + alpha * np.std(scores)

def parse_logs(params: dict):
    """Group flat optimizer parameter names into per-stage keyword dicts.

    Every key is split on its first two underscores
    (``"<group>_<component>_<hyperparameter>"``, e.g. ``"model_lr_C"``).
    The first piece selects the group and the last piece is used as the
    keyword name inside that group.

    Parameters
    ----------
    params
        Flat mapping of parameter name to value, such as ``study.best_params``.

    Returns
    -------
    dict
        Mapping of group name to ``{hyperparameter: value}``. The groups
        ``"model"``, ``"preprocess"`` and ``"decompose"`` are always present
        (possibly empty); any other group prefix found in *params* gets its
        own entry.
    """
    grouped = {"model": {}, "preprocess": {}, "decompose": {}}
    for key, value in params.items():
        identifiers = key.split("_", 2)
        group, hyperparam = identifiers[0], identifiers[-1]
        # setdefault keeps unknown group prefixes instead of raising KeyError.
        grouped.setdefault(group, {})[hyperparam] = value
    return grouped

In [None]:
# Create a study whose objective value is minimised.
study = optuna.create_study(direction="minimize")
# Bind the training frame so the callable handed to the study only needs the
# trial argument. Note that this rebinds the name `objective` to the partial,
# so re-running the cell wraps the previous partial again.
objective = partial(objective, dset=tr_df)
# Evaluate 30 sampled hyperparameter configurations.
study.optimize(objective, n_trials=30)

In [None]:
# Split the best trial's flat parameters into per-stage keyword dictionaries.
opt_params = parse_logs(study.best_params)
model_params, preprocess_params, decompose_params = (
    opt_params[group] for group in ("model", "preprocess", "decompose")
)

In [None]:
# Train one model per CV training split with the tuned hyperparameters and
# collect each model's class probabilities for the test set.
test_preds = []

# The extra test-side feature columns do not depend on the fold, so they are
# built once instead of on every iteration.
test_type_col = scipy.sparse.csr_matrix(ts_df.discourse_type.values).T
test_len_col = scipy.sparse.csr_matrix(ts_df.dis_len.values).T

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42).split(tr_df[target_col], tr_df[target_col])
for tr_ix, _ in skf:
    # Only the training part of each split is used; the held-out rows are
    # never scored here, so they are not materialised.
    train_df = tr_df.iloc[tr_ix]

    # Vocabulary and IDF weights come from this fold's training rows only.
    vect = TfidfVectorizer(**preprocess_params)
    train_vecs = vect.fit_transform(train_df[text_col])
    test_vecs = vect.transform(ts_df[text_col])

    # Append discourse type and text length as two extra feature columns.
    train_vecs = scipy.sparse.hstack([train_vecs, scipy.sparse.csr_matrix(train_df.discourse_type.values).T, scipy.sparse.csr_matrix(train_df.dis_len.values).T])
    test_vecs = scipy.sparse.hstack([test_vecs, test_type_col, test_len_col])

    train_y = train_df[target_col]

    model = LogisticRegression(**model_params, max_iter=5000)
    model.fit(train_vecs, train_y)
    probas = model.predict_proba(test_vecs)

    test_preds.append(probas)

In [None]:
# Average the per-fold probability matrices element-wise across folds.
prob_preds = np.stack(test_preds).mean(axis=0)

In [None]:
# One probability column per original class label, plus the test ids; the
# columns are then ordered like the submission template before writing.
sub = pd.DataFrame(prob_preds, columns=target_enc.classes_).assign(
    discourse_id=ts_df["discourse_id"]
)
sub[sub_df.columns].to_csv("submission.csv", index=False)