In [0]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from scipy import sparse
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

In [0]:
train = pd.read_csv('Predictive_Data/train_file.csv')
test = pd.read_csv('Predictive_Data/test_file.csv')
train.columns

Index(['ID', 'UsageClass', 'CheckoutType', 'CheckoutYear', 'CheckoutMonth',
       'Checkouts', 'Title', 'Creator', 'Subjects', 'Publisher',
       'PublicationYear', 'MaterialType'],
      dtype='object')

In [0]:
def write_submission_file(prediction, file_name):
    sub = pd.DataFrame()
    sub['ID'] = pd.read_csv('Predictive_Data/test_file.csv')['ID']
    sub['MaterialType'] = prediction
    sub.MaterialType = sub.MaterialType.astype(int).replace(numbers_mt)
    sub.to_csv(file_name, index=False, header=True)

In [0]:
train.MaterialType.value_counts()

BOOK         21707
SOUNDDISC     4149
VIDEOCASS     2751
VIDEODISC     1420
SOUNDCASS     1020
MIXED          347
MUSIC          165
CR              94
Name: MaterialType, dtype: int64

In [0]:
mt_numbers = {'BOOK':0, 'SOUNDDISC':1, 'VIDEOCASS':2, 'VIDEODISC':3, 'SOUNDCASS':4, 'MIXED':5, 'MUSIC':6, 'CR':7}
numbers_mt = {0:'BOOK', 1:'SOUNDDISC', 2:'VIDEOCASS', 3:'VIDEODISC', 4:'SOUNDCASS', 5:'MIXED', 6:'MUSIC', 7:'CR'}
train.MaterialType = train.MaterialType.replace(mt_numbers)

In [0]:
y = train.MaterialType
train = train.drop('MaterialType', axis=1)

In [0]:
f1 = make_scorer(f1_score, greater_is_better=True, average='weighted')
acc = make_scorer(accuracy_score, greater_is_better=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [0]:
train['SubjectPresent'] = train.Subjects.isna().astype(int)
test['SubjectPresent'] = test.Subjects.isna().astype(int)

train['PublisherPresent'] = train.Publisher.isna().astype(int)
test['PublisherPresent'] = test.Publisher.isna().astype(int)

train['CreatorPresent'] = train.Creator.isna().astype(int)
test['CreatorPresent'] = test.Creator.isna().astype(int)

In [0]:
train.Subjects.fillna(' ', inplace=True)
test.Subjects.fillna(' ', inplace=True)
train['text'] = train.Title +' '+ train.Subjects
test['text'] = test.Title +' '+ test.Subjects

In [0]:
train['Title_Sub'] = train.Title +' '+ train.Subjects
test['Title_Sub'] = test.Title +' '+ test.Subjects

In [0]:
def train_logit(X, y, test_df):
    logit = LogisticRegression(penalty='l1')
    cv_score = cross_val_score(logit, X, y, scoring=f1, n_jobs=-1, verbose=2, cv=skf)
    print(f'CV score : {cv_score}')
    print(f'CV mean : {cv_score.mean()}')
    train_score = f1_score(cross_val_predict(logit, X, y, cv=skf), y, average='weighted')
    print(f'Train Score : {train_score}')

In [0]:
def train_lgb(X, y, test_df):
    clf = LGBMClassifier()
    cv_score = cross_val_score(clf, X, y, scoring=f1, n_jobs=-1, verbose=2, cv=skf)
    print(f'CV score : {cv_score}')
    print(f'CV mean : {cv_score.mean()}')
    train_score = f1_score(cross_val_predict(clf, X, y, cv=skf), y, average='weighted')
    print(f'Train Score : {train_score}') 

In [0]:
def lgb_submission(X, y, test_df, file_name):
    clf = LGBMClassifier()
    clf.fit(X, y)
    prediction = clf.predict(test_df)
    write_submission_file(prediction, file_name)

In [0]:
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

In [0]:
import nltk
# nltk.download('all');

In [0]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
def clean_data(train, test):
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    
    stop = stopwords.words('english')
    st = PorterStemmer()

    train['word_count'] = train['text'].apply(lambda x: len(str(x).split(" ")))
    test['word_count'] = test['text'].apply(lambda x: len(str(x).split(" ")))
    
    train['char_count'] = train['text'].str.len()
    test['char_count'] = test['text'].str.len()
    
    train['avg_word'] = train['text'].apply(lambda x: avg_word(x))
    test['avg_word'] = test['text'].apply(lambda x: avg_word(x))
    
    train['stopwords'] = train['text'].apply(lambda x: len([x for x in x.split() if x in stop]))
    test['stopwords'] = test['text'].apply(lambda x: len([x for x in x.split() if x in stop]))
    
    train['text'] = train['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
    test['text'] = test['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
    
    train['text'] = train['text'].str.replace('[^\w\s]','')
    test['text'] = test['text'].str.replace('[^\w\s]','')
    
    train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    test['text'] = test['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    
    freq = pd.Series(' '.join(train['text']).split()).value_counts()[:10]
    freq = list(freq.index)
    train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
    freq = pd.Series(' '.join(test['text']).split()).value_counts()[:10]
    freq = list(freq.index)
    test['text'] = test['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
    
    freq = pd.Series(' '.join(train['text']).split()).value_counts()[-10:]
    freq = list(freq.index)
    train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
    freq = pd.Series(' '.join(test['text']).split()).value_counts()[-10:]
    freq = list(freq.index)
    test['text'] = test['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

    train['text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
    test['text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
    
    return train, test

In [0]:
def make_data(train_df, test_df):
    tfidf = TfidfVectorizer()
    train_tfidf = tfidf.fit_transform(train.Title_Sub)
    test_tfidf = tfidf.transform(test.Title_Sub)
    
    X = sparse.hstack((train_tfidf, np.array(train.Checkouts)[:, None], np.array(train.PublisherPresent)[:, None], np.array(train.CreatorPresent)[:, None]))
    X = sparse.hstack((X, np.array(train_df.word_count)[:, None], np.array(train_df.char_count)[:, None], np.array(train_df.stopwords)[:, None], np.array(train_df.avg_word)[:, None]))
    test_matrix = sparse.hstack((test_tfidf, np.array(test_df.Checkouts)[:, None], np.array(test_df.PublisherPresent)[:, None], np.array(test_df.CreatorPresent)[:, None]))
    test_matrix = sparse.hstack((test_matrix, np.array(test_df.word_count)[:, None], np.array(test_df.char_count)[:, None], np.array(test_df.stopwords)[:, None], np.array(test_df.avg_word)[:, None]))
    
    return X, test_matrix

In [0]:
train_df, test_df = clean_data(train, test)

In [0]:
X_train, X_test = make_data(train_df, test_df)

In [0]:
def train_xgb(X, y):
  clf = XGBClassifier(**param_xgb, n_jobs=-1)
  
  cv_score = cross_val_score(clf, X, y, scoring=f1, n_jobs=-1, verbose=5, cv=skf)
  print(f'CV score : {cv_score}')
  print(f'CV mean : {cv_score.mean()}')
#   train_score = f1_score(cross_val_predict(clf, X, y, cv=skf), y, average='weighted')
#   print(f'Train Score : {train_score}') 

In [0]:
param_xgb = {'learning_rate': 0.06610098295419149,
 'max_depth': 36,
 'n_estimators': 283,
 'subsample': 0.32264159755217825}

In [0]:
train_xgb(X_train, y)

In [0]:
clf = XGBClassifier(**param_xgb, objective='multi:softmax', n_jobs=-1)
%time clf.fit(X_train, y)

CPU times: user 29min 53s, sys: 1.47 s, total: 29min 55s
Wall time: 15min 9s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       learning_rate=0.06610098295419149, max_delta_step=0, max_depth=36,
       min_child_weight=1, missing=None, n_estimators=283, n_jobs=-1,
       nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.32264159755217825, verbosity=1)

In [0]:
# preds = clf.predict(X_test)
write_submission_file(preds, 'XGB_tuned.csv')
# LB : 0.89151

In [0]:
proba_xgb = clf.predict_proba(X_test)

In [0]:
train_logit(X_train, y, X_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.8min finished


CV score : [0.87447933 0.8727062  0.87380148 0.868889   0.86585168 0.88189711
 0.86651208 0.88323933 0.87735641 0.87650097]
CV mean : 0.874123357969245
Train Score : 0.9079833055281712


In [0]:
train_lgb(X_train, y, X_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  5.6min finished


CV score : [0.88833257 0.88166676 0.8890405  0.87272716 0.88637954 0.88683627
 0.87754947 0.88902222 0.88341457 0.88681396]
CV mean : 0.8841783016647383
Train Score : 0.9054320382259826


In [0]:
lgb_submission(X_train, y, X_test, 'TextwithFeatures1.csv')

In [0]:
train_xgb(X_train, y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total= 1.3min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s


[CV] ................................................. , total= 1.3min
[CV]  ................................................................
[CV] ................................................. , total= 1.4min
[CV]  ................................................................
[CV] ................................................. , total= 1.3min
[CV]  ................................................................
[CV] ................................................. , total= 1.4min
CV score : [0.87213373 0.87334636 0.8708427  0.87130875 0.87117021]
CV mean : 0.8717603477680713


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.6min finished


Train Score : 0.9062553154290376


In [0]:
clf = BaggingClassifier(lgb)
cv_score = cross_val_score(clf, X_train, y, scoring=f1, n_jobs=-1, verbose=2, cv=skf)
print(f'CV score : {cv_score}')
print(f'CV mean : {cv_score.mean()}')
train_score = f1_score(cross_val_predict(clf, X_train, y, cv=skf), y, average='weighted')
print(f'Train Score : {train_score}')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 62.5min finished


CV score : [0.88121538 0.89182372 0.88306223 0.87607335 0.87959142 0.87499966
 0.88567183 0.88733508 0.88036139 0.88249595]
CV mean : 0.8822630000724819
Train Score : 0.9126848248626827


In [0]:
lgb = LGBMClassifier()
clf = BaggingClassifier(lgb, n_estimators=17)
%time clf.fit(X_train, y)
preds = clf.predict(X_test)
write_submission_file(preds, 'Bagging_LGB17.csv')
#0.88937

CPU times: user 23min 56s, sys: 5.97 s, total: 24min 2s
Wall time: 12min 4s


In [0]:
param_lgb = {'learning_rate': 0.06610098295419149,
 'max_depth': 15,
 'n_estimators': 281,
 'num_leaves': 17,
 'subsample': 0.6734464687862453}

In [0]:
clf_lgb = LGBMClassifier(**param_lgb)
%time clf_lgb.fit(X_train, y)

CPU times: user 2min 15s, sys: 551 ms, total: 2min 16s
Wall time: 1min 9s


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.06610098295419149,
        max_depth=15, min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=281, n_jobs=-1, num_leaves=17,
        objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
        silent=True, subsample=0.6734464687862453,
        subsample_for_bin=200000, subsample_freq=0)

In [0]:
preds = clf_lgb.predict(X_test)

In [0]:
write_submission_file(preds, 'LGB_tuned.csv')
# LB 0.89381

In [0]:
proba_lgb = clf_lgb.predict_proba(X_test)

In [0]:
proba_ = proba_lgb*0.6 + proba_xgb*0.4

In [0]:
proba_ = np.argmax(proba_, axis=1)

In [0]:
write_submission_file(proba_, 'LGB6_XGB4_tuned.csv')
# LB 0.89514