In [53]:
import pandas as pd
import numpy as np
import gc
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import scipy

## Read the data

In [55]:
warnings.filterwarnings('ignore')

In [56]:
# name of target variable
target = 'grades'
# list of date base features names
date_cols = ['month', 'day', 'year', 'day_of_week', 'hour', 'time_of_day', 'season']

In [87]:
# original data
train = pd.read_csv('data/train_ml.csv')
test = pd.read_csv('data/new_test_ml.csv', index_col='Unnamed: 0')

# preprocessed data
preprocessed_train = pd.read_csv("data/preprocessed_train.csv")
preprocessed_test = pd.read_csv("data/preprocessed_test.csv")

In [88]:
# drop nans

train = train.dropna()

assert train.isna().sum().sum() == 0

In [89]:
# target

y_train = train[target]

## Feature engineering

### Numerical features

In [78]:
# log the text_len and num_words features

preprocessed_train[['text_len', 'num_words']] = np.log(preprocessed_train[['text_len', 'num_words']])
preprocessed_test[['text_len', 'num_words']] = np.log(preprocessed_test[['text_len', 'num_words']])

In [79]:
# date feature dtype -> datetime

train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

In [80]:
def extract_date_features(data: pd.DataFrame, date_col: str):
    """
    The function extracts date based features from datetime feature

    :param data: pd.DataFrame extract features to
    :param date_col: str name datetime columns
    :return: pd.DataFrame with new features
    """
    data['month'] = data[date_col].dt.month
    data['day'] = data[date_col].dt.day
    data['year'] = data[date_col].dt.year
    data['day_of_week'] = data[date_col].dt.day_of_week
    data['hour'] = data[date_col].dt.hour
    # create time of day feature
    # 1 - 12am-05am, 2 - 06am-11am, 3 - 12pm-5pm, 4 - 6pm-11pm
    time_of_day = [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
                   3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4]
    hour_to_time_of_day = dict(zip(range(0, 24), time_of_day))
    data['time_of_day'] = data['hour'].map(hour_to_time_of_day)
    # create season feature
    # 1 - winter, 2 - spring, 3 - summer, 4 - autumn
    seasons = [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1]
    month_to_season = dict(zip(range(1, 13), seasons))
    data['season'] = data['month'].map(month_to_season)

    return data

In [81]:
# extract date features

train = extract_date_features(data=train, date_col='date')
test = extract_date_features(data=test, date_col='date')

In [82]:
# concat text_len & num_words with date based features

train_num = pd.concat([preprocessed_train[['text_len', 'num_words']].reset_index(drop=True),
                       train[date_cols].reset_index(drop=True)], axis=1).astype('float32')
test_num = pd.concat([preprocessed_test[['text_len', 'num_words']].reset_index(drop=True),
                      test[date_cols].reset_index(drop=True)], axis=1).astype('float32')

In [85]:
# create sparse matriсes from numerical features

train_num = scipy.sparse.csr_matrix(train_num.values)
test_num = scipy.sparse.csr_matrix(test_num.values)

### Text data transformation

In [66]:
# tfidf transformation of text data

vec = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1)

bow_train_old = vec.fit_transform(preprocessed_train['lemmas'])
bow_test_old = vec.transform(preprocessed_test['lemmas'])

In [67]:
# CountVectorizer transformation of text data

cv = CountVectorizer()

train_cv = cv.fit_transform(preprocessed_train['lemmas'])
test_cv = cv.transform(preprocessed_test['lemmas'])

In [68]:
# CountVectorizer with ngram transformation

ngram_cv = CountVectorizer(ngram_range=(1, 2))

train_ngram_cv = ngram_cv.fit_transform(preprocessed_train['lemmas'])
test_ngram_cv = ngram_cv.transform(preprocessed_test['lemmas'])

### Join trasformed text data to numerical data

In [69]:
# tf idf
full_train_bow = scipy.sparse.hstack([bow_train_old, train_num]).tocsr()
full_test_bow = scipy.sparse.hstack([bow_test_old, test_num]).tocsr()

# countvectorizer
full_train_cv = scipy.sparse.hstack([train_cv, train_num]).tocsr()
full_test_cv = scipy.sparse.hstack([test_cv, test_num]).tocsr()

# count vectorizer with ngram
full_train_ngram = scipy.sparse.hstack([train_ngram_cv, train_num]).tocsr()
full_test_ngram = scipy.sparse.hstack([test_ngram_cv, test_num]).tocsr()

In [70]:
del train, test, preprocessed_test, preprocessed_train, bow_test_old, bow_train_old, \
    train_cv, test_cv, train_ngram_cv, test_ngram_cv, train_num, test_num

gc.collect()

20930

## Modeling

In [71]:
from lightgbm import LGBMClassifier

In [72]:
lgbm_params = {
    'n_estimators': 2000,
    'early_stopping_rounds': 100,
    'boosting_type': 'dart',
    'max_depth': 8,
    'random_state': 42,
    'bagging_fraction': 0.5,
    'feature_fraction': 0.5,
    'num_class': 5
}

In [38]:
%%time

model = LGBMClassifier(**lgbm_params)

model.fit(full_train_bow, y_train)

CPU times: user 5h 3min 16s, sys: 26min 4s, total: 5h 29min 21s
Wall time: 50min 13s


LGBMClassifier(bagging_fraction=0.5, boosting_type='dart',
               early_stopping_rounds=100, feature_fraction=0.5, max_depth=8,
               n_estimators=2000, num_class=5, random_state=42)

In [39]:
%%time

model_cv = LGBMClassifier(**lgbm_params)

model_cv.fit(full_train_cv, y_train)

CPU times: user 1h 50min 15s, sys: 7min 36s, total: 1h 57min 51s
Wall time: 17min 17s


LGBMClassifier(bagging_fraction=0.5, boosting_type='dart',
               early_stopping_rounds=100, feature_fraction=0.5, max_depth=8,
               n_estimators=2000, num_class=5, random_state=42)

In [40]:
%%time

model_ngram = LGBMClassifier(**lgbm_params)

model_ngram.fit(full_train_ngram, y_train)

CPU times: user 3h 46min 32s, sys: 12min 53s, total: 3h 59min 26s
Wall time: 35min 14s


LGBMClassifier(bagging_fraction=0.5, boosting_type='dart',
               early_stopping_rounds=100, feature_fraction=0.5, max_depth=8,
               n_estimators=2000, num_class=5, random_state=42)

In [52]:
# save the models

model.booster_.save_model('models/tfidf_model.txt', num_iteration=model.best_iteration_)
model_cv.booster_.save_model('models/cv_model.txt', num_iteration=model_cv.best_iteration_)
model_ngram.booster_.save_model('models/cv_ngram_model.txt', num_iteration=model_ngram.best_iteration_)

<lightgbm.basic.Booster at 0x7f84bb49b370>

In [41]:
# predict

p = model_ngram.predict_proba(full_test_ngram) + model_cv.predict_proba(full_test_cv) + model.predict_proba(full_test_bow)

In [42]:
# create dataframe from predictions

preds = pd.DataFrame(p, columns=[1, 2, 3, 4, 5]).idxmax(axis=1).reset_index().rename(columns={'index': 'inds', 0: 'grades'})

In [44]:
preds['grades'].value_counts()

1    12069
5     4920
2      162
4       37
3       32
Name: grades, dtype: int64

In [45]:
preds.to_csv('submission_13.csv', index=False)