In [None]:
import pandas as pd
import numpy as np
import re

# Load the labelled training set and the unlabelled test set from CSV files
# located in the current working directory.
# Later cells read the 'text' column of both frames, the 'author' column of
# `train` and the 'index' column of both frames.
train=pd.read_csv('train.csv', encoding='utf-8')
test=pd.read_csv('test_x.csv', encoding='utf-8')

In [None]:
!pip install textstat
!pip install fasttext

from textstat import flesch_reading_ease
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics, model_selection, naive_bayes
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk import word_tokenize, pos_tag, ne_chunk, tree2conlltags
import fasttext
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# Download the NLTK data packages one by one (same packages, same order).
for resource in (
    'stopwords',
    'averaged_perceptron_tagger',
    'punkt',
    'vader_lexicon',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(resource)

# English stop words as a set, so membership tests are cheap.
eng_stopwords = set(stopwords.words("english"))
# Every character considered "known": ASCII letters, digits and punctuation.
symbols_knowns = string.ascii_letters + string.digits + string.punctuation

In [None]:
def sentiment_nltk(text):
    """Return the VADER 'compound' sentiment score of ``text``.

    The analyzer is created lazily on the first call and cached on the
    function object, so applying this function row by row does not rebuild
    the analyzer for every text.
    """
    analyzer = getattr(sentiment_nltk, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        sentiment_nltk._analyzer = analyzer
    return analyzer.polarity_scores(text)['compound']

def get_words(text):
    """Tokenize ``text`` with NLTK and drop punctuation tokens.

    A token is dropped when it occurs as a substring of
    ``string.punctuation`` (which covers every single punctuation character).
    """
    tokens = nltk.tokenize.word_tokenize(text)
    return list(filter(lambda token: token not in string.punctuation, tokens))
    
def count_tokens(text, tokens):
    """Count how many words of ``text`` (per ``get_words``) appear in ``tokens``."""
    hits = 0
    for word in get_words(text):
        if word in tokens:
            hits += 1
    return hits

def first_word_len(text):
    """Return the length of the first word of ``text``, or 0 if it has no words.

    The text is tokenized once (the previous version tokenized it twice).
    """
    words = get_words(text)
    if not words:
        return 0
    return len(words[0])

def last_word_len(text):
    """Return the length of the last word of ``text``, or 0 if it has no words.

    The text is tokenized once (the previous version tokenized it twice).
    """
    words = get_words(text)
    if not words:
        return 0
    return len(words[-1])

def symbol_id(x):
    """Return the position of character ``x`` in ``symbols_knowns``, or -1.

    -1 is returned for anything that is not a single character contained in
    ``symbols_knowns`` (unknown characters, empty or multi-character strings,
    non-string values).

    Uses ``str.find`` instead of rebuilding a list and a numpy array on every
    call; the result is a plain ``int``.
    """
    # Guard against '' and multi-character strings, for which str.find would
    # report a substring match rather than a character match.
    if not isinstance(x, str) or len(x) != 1:
        return -1
    return symbols_knowns.find(x)

In [None]:
def _pos_tag_fraction(text, tags):
    """Return the share of words in ``text`` whose POS tag is in ``tags``.

    Words are obtained by splitting on single spaces, stripping punctuation
    characters from each piece and discarding pieces that end up empty.
    Returns 0 when no words remain.
    """
    pieces = text.split(' ')
    pieces = [''.join(c for c in s if c not in string.punctuation) for s in pieces]
    words = [s for s in pieces if s]
    word_count = len(words)
    if word_count == 0:
        return 0
    tagged = nltk.pos_tag(words)
    matching = sum(1 for _, tag in tagged if tag in tags)
    return matching / word_count


def fraction_noun(text):
    """Fraction of words in ``text`` tagged as nouns (NN, NNP, NNPS, NNS)."""
    return _pos_tag_fraction(text, ('NN', 'NNP', 'NNPS', 'NNS'))


def fraction_adj(text):
    """Fraction of words in ``text`` tagged as adjectives (JJ, JJR, JJS)."""
    return _pos_tag_fraction(text, ('JJ', 'JJR', 'JJS'))


def fraction_verbs(text):
    """Fraction of words in ``text`` tagged as verbs (VB, VBD, VBG, VBN, VBP, VBZ)."""
    return _pos_tag_fraction(text, ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'))

In [None]:
# Hand-crafted meta features, computed identically for train and test.
# Previously the test frame computed 'num_words' with str.split() while the
# train frame used get_words(); every ratio divided by 'num_words' therefore
# had a different meaning at prediction time. Both frames now use get_words().
singular_tokens = ['is', 'was', 'has', 'he', 'she', 'it', 'her', 'his']
plural_tokens = ['are', 'were', 'have', 'we', 'they']

for frame in (train, test):
    text = frame['text']

    # Basic size statistics.
    frame['num_words'] = text.apply(lambda x: len(get_words(x)))
    frame['mean_word_len'] = text.apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    frame["num_unique_words"] = text.apply(lambda x: len(set(str(x).split())))
    frame["num_chars"] = text.apply(lambda x: len(str(x)))
    frame["num_stopwords"] = text.apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))
    frame["num_punctuations"] = text.apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

    # Ratios relative to the number of words / characters.
    frame["num_words_upper"] = text.apply(lambda x: len([w for w in str(x).split() if w.isupper()])) / frame["num_words"]
    frame["num_words_title"] = text.apply(lambda x: len([w for w in str(x).split() if w.istitle()])) / frame["num_words"]
    frame["chars_between_comma"] = text.apply(lambda x: np.mean([len(chunk) for chunk in str(x).split(",")])) / frame["num_chars"]
    frame["symbols_unknowns"] = text.apply(lambda x: np.sum([w not in symbols_knowns for w in str(x)])) / frame["num_chars"]

    # Part-of-speech shares and sentiment.
    frame['noun'] = text.apply(fraction_noun)
    frame['adj'] = text.apply(fraction_adj)
    frame['verbs'] = text.apply(fraction_verbs)
    frame["sentiment"] = text.apply(sentiment_nltk)

    # Share of singular / plural marker words.
    frame['single_frac'] = text.apply(lambda x: count_tokens(x, singular_tokens)) / frame["num_words"]
    frame['plural_frac'] = text.apply(lambda x: count_tokens(x, plural_tokens)) / frame["num_words"]

    # First / last word length relative to the text length.
    frame['first_word_len'] = text.apply(first_word_len) / frame["num_chars"]
    frame['last_word_len'] = text.apply(last_word_len) / frame["num_chars"]

    # Id of the first / last non-whitespace character; -1 for blank text
    # (previously a blank text raised IndexError).
    frame["first_word_id"] = text.apply(lambda x: symbol_id(x.strip()[0]) if x.strip() else -1)
    frame["last_word_id"] = text.apply(lambda x: symbol_id(x.strip()[-1]) if x.strip() else -1)

    # Flesch reading-ease score from textstat.
    frame['ease'] = text.apply(flesch_reading_ease)


In [None]:
def get_persons(text):
    """Return the person names NLTK's named-entity chunker finds in ``text``.

    Consecutive tokens whose chunk tag contains "PERSON" are joined with a
    single space into one name; names are returned in order of appearance.
    """
    def bind_names(tagged_words):
        # Walk the (word, pos, chunk-tag) rows and flush the running name
        # whenever a non-PERSON token is reached.
        names = []
        current = []
        for row in tagged_words:
            if "PERSON" in row[2]:
                current.append(row[0])
                continue
            if current:
                names.append(" ".join(current))
            current = []
        # A name that runs up to the very last token still has to be flushed.
        if current:
            names.append(" ".join(current))
        return names

    conll_rows = tree2conlltags(ne_chunk(pos_tag(word_tokenize(text))))
    return bind_names([list(row) for row in conll_rows])


# All training sentences of each author (labels 0..4) joined into one string.
text_author_0, text_author_1, text_author_2, text_author_3, text_author_4 = (
    " ".join(train.loc[train['author'] == author_id, 'text'].tolist())
    for author_id in range(5)
)

# Distinct person names detected in each author's combined text.
persons_author_0, persons_author_1, persons_author_2, persons_author_3, persons_author_4 = (
    set(get_persons(author_text))
    for author_text in (text_author_0, text_author_1, text_author_2, text_author_3, text_author_4)
)

def jaccard(a,b):
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    Returns 0.0 when both sets are empty instead of raising
    ZeroDivisionError.
    """
    union_size = len(a | b)
    if union_size == 0:
        return 0.0
    return len(a & b) / union_size

# Jaccard similarity between the person names found in each text and the
# person names of every author. Named-entity extraction is the expensive
# step, so it now runs once per text instead of once per text *and* author.
author_person_sets = [
    persons_author_0,
    persons_author_1,
    persons_author_2,
    persons_author_3,
    persons_author_4,
]

for frame in (train, test):
    text_persons = frame["text"].apply(lambda x: set(get_persons(x)))
    for author_id, author_persons in enumerate(author_person_sets):
        frame["persons_" + str(author_id)] = text_persons.apply(
            lambda names, reference=author_persons: jaccard(names, reference)
        )

In [None]:
# Write the training texts to 'sample_file.txt' (no index column, no header
# row) so fastText can read them from disk.
# NOTE(review): to_csv may quote texts containing special characters; confirm
# this is acceptable for the embedding corpus.
train['text'].to_csv('sample_file.txt',index=False, header=None, sep="\t")
# Train unsupervised fastText embeddings on that file: 300-dimensional
# vectors, words seen fewer than 2 times ignored, character n-grams of
# length 2 to 10.
model_ft = fasttext.train_unsupervised('sample_file.txt', minCount=2, minn=2, maxn=10,dim=300)

def sent2vec(s):
    """Return the sum of the fastText vectors of all tokens in ``s``.

    The sentence is tokenized with NLTK and every token is looked up in the
    module-level ``model_ft``. Tokens whose lookup raises are skipped. When
    no vector could be collected, a zero vector of the model's dimension is
    returned.
    """
    words = nltk.tokenize.word_tokenize(s)
    vectors = []
    for w in words:
        try:
            vectors.append(model_ft[w])
        except Exception:
            # Best effort: skip tokens the model cannot embed, but do not
            # swallow KeyboardInterrupt/SystemExit like a bare except would.
            continue
    if not vectors:
        # Use the model's own dimension instead of a hard-coded 300.
        return np.zeros(model_ft.get_dimension())
    return np.array(vectors).sum(axis=0)

# Sentence vectors for every train / test text, one row per text.
xtrain_ft = np.array([sent2vec(sentence) for sentence in train['text']])
xtest_ft = np.array([sent2vec(sentence) for sentence in test['text']])

# Shared column names: one 'ft_vector_<i>' column per embedding dimension.
ft_columns = ['ft_vector_'+str(i) for i in range(xtrain_ft.shape[1])]
train_ft = pd.DataFrame(xtrain_ft, columns=ft_columns)
test_ft = pd.DataFrame(xtest_ft, columns=ft_columns)

# Append the embedding columns to the existing feature frames.
train = pd.concat([train, train_ft], axis=1)
test = pd.concat([test, test_ft], axis=1)

In [None]:
# Level-1 stacking with logistic regression.
# For each sparse text representation, 5-fold stratified CV yields
# out-of-fold class probabilities for train and fold-averaged probabilities
# for test, stored as "<prefix>_<class>" columns.
# The three formerly copy-pasted sections are driven by one loop.

# Word-level TF-IDF, 1-3 grams, NLTK tokenizer, terms in >= 50 documents.
tfidf_vec = TfidfVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'), ngram_range=(1, 3), min_df=50)
train_tfidf = tfidf_vec.fit_transform(train['text'].values.tolist())
test_tfidf = tfidf_vec.transform(test['text'].values.tolist())

# Word-level counts with the same settings.
cvec_vec=CountVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'), ngram_range=(1, 3), min_df=50)
cvec_vec.fit(train['text'].values.tolist())
train_cvec = cvec_vec.transform(train['text'].values.tolist())
test_cvec = cvec_vec.transform(test['text'].values.tolist())

# Character-level counts, 1-7 grams.
cvec_char_vec = CountVectorizer(ngram_range=(1,7), analyzer='char')
cvec_char_vec.fit(train['text'].values.tolist())
train_cvec_char = cvec_char_vec.transform(train['text'].values.tolist())
test_cvec_char = cvec_char_vec.transform(test['text'].values.tolist())


def runLR(train_X,train_y,test_X,test_y,test_X2):
    """Fit a LogisticRegression and return probabilities for two matrices.

    ``test_y`` is accepted for signature compatibility but not used.
    Returns ``(proba(test_X), proba(test_X2), fitted_model)``.
    """
    model=LogisticRegression()
    model.fit(train_X,train_y)
    pred_test_y=model.predict_proba(test_X)
    pred_test_y2=model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model


n_classes = 5
n_folds = 5
cols_to_drop=['text','index']

for prefix, train_matrix, test_matrix in (
    ("tfidf_LR", train_tfidf, test_tfidf),
    ("cvec_LR", train_cvec, test_cvec),
    ("cvec_char_LR", train_cvec_char, test_cvec_char),
):
    cv_scores=[]
    # train_X / test_X are only needed to drive the splitter here.
    train_X = train.drop(cols_to_drop+['author'], axis=1)
    train_y=train['author']
    test_X = test.drop(cols_to_drop, axis=1)
    pred_train=np.zeros([train.shape[0],n_classes])
    pred_full_test = 0

    cv = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2020)

    for dev_index, val_index in cv.split(train_X,train_y):
        dev_X, val_X = train_matrix[dev_index], train_matrix[val_index]
        # Fold indices are positional, so index the labels positionally too.
        dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]
        pred_val_y, pred_test_y, model = runLR(dev_X, dev_y, val_X, val_y, test_matrix)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("Mean cv score : ", np.mean(cv_scores))
    pred_full_test = pred_full_test / float(n_folds)

    for class_id in range(n_classes):
        train[prefix + "_" + str(class_id)] = pred_train[:,class_id]
    for class_id in range(n_classes):
        test[prefix + "_" + str(class_id)] = pred_full_test[:,class_id]

In [None]:
# Level-1 stacking with an SGD-trained logistic model on word TF-IDF and
# character count features. The two formerly copy-pasted CV sections are
# driven by one loop; column names and their order are unchanged.

# Word-level TF-IDF, 1-3 grams, NLTK tokenizer, terms in >= 50 documents.
tfidf_vec = TfidfVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'), ngram_range=(1, 3), min_df=50)
train_tfidf = tfidf_vec.fit_transform(train['text'].values.tolist())
test_tfidf = tfidf_vec.transform(test['text'].values.tolist())

# Character-level counts, 1-7 grams.
cvec_char_vec = CountVectorizer(ngram_range=(1,7), analyzer='char')
cvec_char_vec.fit(train['text'].values.tolist())
train_cvec_char = cvec_char_vec.transform(train['text'].values.tolist())
test_cvec_char = cvec_char_vec.transform(test['text'].values.tolist())


def runSGD(train_X,train_y,test_X,test_y,test_X2):
    """Fit an SGDClassifier with logistic loss and return probabilities.

    ``test_y`` is accepted for signature compatibility but not used.
    Returns ``(proba(test_X), proba(test_X2), fitted_model)``.
    """
    # random_state makes the stacked features reproducible between runs.
    # NOTE(review): recent scikit-learn releases spell this loss 'log_loss';
    # confirm against the installed version.
    model=SGDClassifier(loss='log', random_state=2020)
    model.fit(train_X,train_y)
    pred_test_y=model.predict_proba(test_X)
    pred_test_y2=model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model


n_classes = 5
n_folds = 5
cols_to_drop=['text','index']

for prefix, train_matrix, test_matrix in (
    ("tfidf_SGD", train_tfidf, test_tfidf),
    ("cvec_char_SGD", train_cvec_char, test_cvec_char),
):
    cv_scores=[]
    # train_X / test_X are only needed to drive the splitter here.
    train_X = train.drop(cols_to_drop+['author'], axis=1)
    train_y=train['author']
    test_X = test.drop(cols_to_drop, axis=1)
    pred_train=np.zeros([train.shape[0],n_classes])
    pred_full_test = 0

    cv = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2020)

    for dev_index, val_index in cv.split(train_X,train_y):
        dev_X, val_X = train_matrix[dev_index], train_matrix[val_index]
        # Fold indices are positional, so index the labels positionally too.
        dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]
        pred_val_y, pred_test_y, model = runSGD(dev_X, dev_y, val_X, val_y, test_matrix)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("Mean cv score : ", np.mean(cv_scores))
    pred_full_test = pred_full_test / float(n_folds)

    for class_id in range(n_classes):
        train[prefix + "_" + str(class_id)] = pred_train[:,class_id]
    for class_id in range(n_classes):
        test[prefix + "_" + str(class_id)] = pred_full_test[:,class_id]


In [None]:
# Level-1 stacking with a random forest on word TF-IDF and character count
# features. The two formerly copy-pasted CV sections are driven by one loop;
# column names and their order are unchanged.

# Word-level TF-IDF, 1-3 grams, NLTK tokenizer, terms in >= 50 documents.
tfidf_vec = TfidfVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'), ngram_range=(1, 3), min_df=50)
train_tfidf = tfidf_vec.fit_transform(train['text'].values.tolist())
test_tfidf = tfidf_vec.transform(test['text'].values.tolist())

# Character-level counts, 1-7 grams.
cvec_char_vec = CountVectorizer(ngram_range=(1,7), analyzer='char')
cvec_char_vec.fit(train['text'].values.tolist())
train_cvec_char = cvec_char_vec.transform(train['text'].values.tolist())
test_cvec_char = cvec_char_vec.transform(test['text'].values.tolist())


def runRF(train_X,train_y,test_X,test_y,test_X2):
    """Fit a RandomForestClassifier and return probabilities for two matrices.

    ``test_y`` is accepted for signature compatibility but not used.
    Returns ``(proba(test_X), proba(test_X2), fitted_model)``.
    """
    # random_state makes the stacked features reproducible between runs.
    model=RandomForestClassifier(random_state=2020)
    model.fit(train_X,train_y)
    pred_test_y=model.predict_proba(test_X)
    pred_test_y2=model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model


n_classes = 5
n_folds = 5
cols_to_drop=['text','index']

for prefix, train_matrix, test_matrix in (
    ("tfidf_RF", train_tfidf, test_tfidf),
    ("cvec_char_RF", train_cvec_char, test_cvec_char),
):
    cv_scores=[]
    # train_X / test_X are only needed to drive the splitter here.
    train_X = train.drop(cols_to_drop+['author'], axis=1)
    train_y=train['author']
    test_X = test.drop(cols_to_drop, axis=1)
    pred_train=np.zeros([train.shape[0],n_classes])
    pred_full_test = 0

    cv = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2020)

    for dev_index, val_index in cv.split(train_X,train_y):
        dev_X, val_X = train_matrix[dev_index], train_matrix[val_index]
        # Fold indices are positional, so index the labels positionally too.
        dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]
        pred_val_y, pred_test_y, model = runRF(dev_X, dev_y, val_X, val_y, test_matrix)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("Mean cv score : ", np.mean(cv_scores))
    pred_full_test = pred_full_test / float(n_folds)

    for class_id in range(n_classes):
        train[prefix + "_" + str(class_id)] = pred_train[:,class_id]
    for class_id in range(n_classes):
        test[prefix + "_" + str(class_id)] = pred_full_test[:,class_id]

In [None]:
# Level-1 stacking with a multi-layer perceptron on word TF-IDF and
# character count features. The two formerly copy-pasted CV sections are
# driven by one loop; column names and their order are unchanged.
# The unused `kf` splitter that used to be created here was dropped.

# Word-level TF-IDF, 1-3 grams, NLTK tokenizer, terms in >= 50 documents.
tfidf_vec = TfidfVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'), ngram_range=(1, 3), min_df=50)
train_tfidf = tfidf_vec.fit_transform(train['text'].values.tolist())
test_tfidf = tfidf_vec.transform(test['text'].values.tolist())

# Character-level counts; this cell uses 1-5 grams (not 1-7).
cvec_char_vec = CountVectorizer(ngram_range=(1,5), analyzer='char')
cvec_char_vec.fit(train['text'].values.tolist())
train_cvec_char = cvec_char_vec.transform(train['text'].values.tolist())
test_cvec_char = cvec_char_vec.transform(test['text'].values.tolist())


def runMLP(train_X,train_y,test_X,test_y,test_X2):
    """Fit an MLPClassifier and return probabilities for two matrices.

    ``test_y`` is accepted for signature compatibility but not used.
    Returns ``(proba(test_X), proba(test_X2), fitted_model)``.
    """
    # random_state makes the stacked features reproducible between runs.
    model=MLPClassifier(random_state=2020)
    model.fit(train_X,train_y)
    pred_test_y=model.predict_proba(test_X)
    pred_test_y2=model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model


n_classes = 5
n_folds = 5
cols_to_drop=['text','index']

for prefix, train_matrix, test_matrix in (
    ("tfidf_MLP", train_tfidf, test_tfidf),
    ("cvec_char_MLP", train_cvec_char, test_cvec_char),
):
    cv_scores=[]
    # train_X / test_X are only needed to drive the splitter here.
    train_X = train.drop(cols_to_drop+['author'], axis=1)
    train_y=train['author']
    test_X = test.drop(cols_to_drop, axis=1)
    pred_train=np.zeros([train.shape[0],n_classes])
    pred_full_test = 0

    cv = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2020)

    for dev_index, val_index in cv.split(train_X,train_y):
        dev_X, val_X = train_matrix[dev_index], train_matrix[val_index]
        # Fold indices are positional, so index the labels positionally too.
        dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]
        pred_val_y, pred_test_y, model = runMLP(dev_X, dev_y, val_X, val_y, test_matrix)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("Mean cv score : ", np.mean(cv_scores))
    pred_full_test = pred_full_test / float(n_folds)

    for class_id in range(n_classes):
        train[prefix + "_" + str(class_id)] = pred_train[:,class_id]
    for class_id in range(n_classes):
        test[prefix + "_" + str(class_id)] = pred_full_test[:,class_id]


In [None]:
# Level-1 stacking with a decision tree on word TF-IDF and character count
# features. The two formerly copy-pasted CV sections are driven by one loop;
# column names and their order are unchanged.
# The unused `kf` splitter that used to be created here was dropped.

# Word-level TF-IDF, 1-3 grams, NLTK tokenizer, terms in >= 50 documents.
tfidf_vec = TfidfVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'), ngram_range=(1, 3), min_df=50)
train_tfidf = tfidf_vec.fit_transform(train['text'].values.tolist())
test_tfidf = tfidf_vec.transform(test['text'].values.tolist())

# Character-level counts, 1-7 grams.
cvec_char_vec = CountVectorizer(ngram_range=(1,7), analyzer='char')
cvec_char_vec.fit(train['text'].values.tolist())
train_cvec_char = cvec_char_vec.transform(train['text'].values.tolist())
test_cvec_char = cvec_char_vec.transform(test['text'].values.tolist())


def runDT(train_X,train_y,test_X,test_y,test_X2):
    """Fit a DecisionTreeClassifier and return probabilities for two matrices.

    ``test_y`` is accepted for signature compatibility but not used.
    Returns ``(proba(test_X), proba(test_X2), fitted_model)``.
    """
    # random_state makes the stacked features reproducible between runs.
    model=DecisionTreeClassifier(random_state=2020)
    model.fit(train_X,train_y)
    pred_test_y=model.predict_proba(test_X)
    pred_test_y2=model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model


n_classes = 5
n_folds = 5
cols_to_drop=['text','index']

for prefix, train_matrix, test_matrix in (
    ("tfidf_DT", train_tfidf, test_tfidf),
    ("cvec_char_DT", train_cvec_char, test_cvec_char),
):
    cv_scores=[]
    # train_X / test_X are only needed to drive the splitter here.
    train_X = train.drop(cols_to_drop+['author'], axis=1)
    train_y=train['author']
    test_X = test.drop(cols_to_drop, axis=1)
    pred_train=np.zeros([train.shape[0],n_classes])
    pred_full_test = 0

    cv = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2020)

    for dev_index, val_index in cv.split(train_X,train_y):
        dev_X, val_X = train_matrix[dev_index], train_matrix[val_index]
        # Fold indices are positional, so index the labels positionally too.
        dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]
        pred_val_y, pred_test_y, model = runDT(dev_X, dev_y, val_X, val_y, test_matrix)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("Mean cv score : ", np.mean(cv_scores))
    pred_full_test = pred_full_test / float(n_folds)

    for class_id in range(n_classes):
        train[prefix + "_" + str(class_id)] = pred_train[:,class_id]
    for class_id in range(n_classes):
        test[prefix + "_" + str(class_id)] = pred_full_test[:,class_id]

In [None]:
# Word-level TF-IDF (1-3 grams, built-in English stop-word list) compressed
# to 20 SVD components, standardized and appended as 'svd_word_<i>' columns.
tfidf_vec = TfidfVectorizer(ngram_range=(1,3), stop_words='english')
train_tfidf = tfidf_vec.fit_transform(train['text'].values.tolist())
test_tfidf = tfidf_vec.transform(test['text'].values.tolist())

n_comp = 20
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(train_tfidf)
train_svd = svd_obj.transform(train_tfidf)
test_svd = svd_obj.transform(test_tfidf)

from sklearn import preprocessing
# Scaling statistics come from the training components only.
scl = preprocessing.StandardScaler()
scl.fit(train_svd)

svd_word_columns = ['svd_word_'+str(i) for i in range(n_comp)]
train_svd_scl = pd.DataFrame(scl.transform(train_svd), columns=svd_word_columns)
test_svd_scl = pd.DataFrame(scl.transform(test_svd), columns=svd_word_columns)

train = pd.concat([train, train_svd_scl], axis=1)
test = pd.concat([test, test_svd_scl], axis=1)

In [None]:
def runMNB(train_X,train_y,test_X,test_y,test_X2):
    """Fit a multinomial naive Bayes model and score two matrices.

    ``test_y`` is accepted but not used. Returns the class probabilities for
    ``test_X``, the class probabilities for ``test_X2`` and the fitted model,
    in that order.
    """
    classifier = naive_bayes.MultinomialNB()
    classifier.fit(train_X, train_y)
    holdout_proba = classifier.predict_proba(test_X)
    extra_proba = classifier.predict_proba(test_X2)
    return holdout_proba, extra_proba, classifier

# Naive Bayes on word counts (1-3 grams, built-in English stop-word list):
# out-of-fold probabilities for train, fold-averaged probabilities for test.
Count_vec = CountVectorizer(ngram_range=(1,3), stop_words='english')

Count_vec.fit(train['text'].values.tolist())
train_Count = Count_vec.transform(train['text'].values.tolist())
test_Count = Count_vec.transform(test['text'].values.tolist())

cv_scores = []
pred_train = np.zeros([train.shape[0], 5])
pred_full_test = 0

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2020)

# train_X and train_y are the objects left behind by the preceding cell.
for dev_index, val_index in kf.split(train_X):
    dev_X = train_Count[dev_index]
    val_X = train_Count[val_index]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]
    pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_Count)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index, :] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5.

# One probability column per class, train columns first, then test columns.
for class_id in range(5):
    train["nb_cvec_" + str(class_id)] = pred_train[:, class_id]
for class_id in range(5):
    test["nb_cvec_" + str(class_id)] = pred_full_test[:, class_id]

In [None]:
# Naive Bayes on character counts (1-7 grams): out-of-fold probabilities
# for train, fold-averaged probabilities for test.
cvec_char_vec = CountVectorizer(analyzer='char', ngram_range=(1,7))
cvec_char_vec.fit(train['text'].values.tolist())
train_cvec_char = cvec_char_vec.transform(train['text'].values.tolist())
test_cvec_char = cvec_char_vec.transform(test['text'].values.tolist())

cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2020)

# train_X and train_y are the objects left behind by an earlier cell.
for dev_index, val_index in kf.split(train_X):
    dev_X = train_cvec_char[dev_index]
    val_X = train_cvec_char[val_index]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]
    pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_cvec_char)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index, :] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5.

# One probability column per class, train columns first, then test columns.
for class_id in range(5):
    train["nb_cvec_char_" + str(class_id)] = pred_train[:, class_id]
for class_id in range(5):
    test["nb_cvec_char_" + str(class_id)] = pred_full_test[:, class_id]

In [None]:
# Naive Bayes on character TF-IDF (1-5 grams): out-of-fold probabilities
# for train, fold-averaged probabilities for test.
tfidf_vec = TfidfVectorizer(analyzer='char', ngram_range=(1,5))

train_tfidf = tfidf_vec.fit_transform(train['text'].values.tolist())
test_tfidf = tfidf_vec.transform(test['text'].values.tolist())

cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2020)

# train_X and train_y are the objects left behind by an earlier cell.
for dev_index, val_index in kf.split(train_X):
    dev_X = train_tfidf[dev_index]
    val_X = train_tfidf[val_index]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]
    pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_tfidf)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index, :] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5.

# One probability column per class, train columns first, then test columns.
for class_id in range(5):
    train["nb_tfidf_char_" + str(class_id)] = pred_train[:, class_id]
for class_id in range(5):
    test["nb_tfidf_char_" + str(class_id)] = pred_full_test[:, class_id]

In [None]:
# Compress the current train_tfidf / test_tfidf matrices (left by the
# preceding cell) to 20 SVD components, standardize them and append them as
# 'svd_char_<i>' columns.
n_comp = 20
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(train_tfidf)
train_svd = svd_obj.transform(train_tfidf)
test_svd = svd_obj.transform(test_tfidf)

from sklearn import preprocessing
# Scaling statistics come from the training components only.
scl = preprocessing.StandardScaler()
scl.fit(train_svd)

svd_char_columns = ['svd_char_'+str(i) for i in range(n_comp)]
train_svd_scl = pd.DataFrame(scl.transform(train_svd), columns=svd_char_columns)
test_svd_scl = pd.DataFrame(scl.transform(test_svd), columns=svd_char_columns)

train = pd.concat([train, train_svd_scl], axis=1)
test = pd.concat([test, test_svd_scl], axis=1)

In [None]:
# Final feature matrices: drop the identifier and raw text (and the target for train).
cols_to_drop = ['index', 'text']
train_X = train.drop(cols_to_drop+['author'], axis=1)
train_y=train['author']
test_index = test['index'].values
test_X = test.drop(cols_to_drop, axis=1)

# Booster settings are the same for every fold, so they are built once.
param = {
    'objective': 'multi:softprob',   # predict() returns one probability per class
    'eta': 0.1,
    'max_depth': 3,
    'verbosity': 0,                  # replaces the deprecated `silent` flag
    'num_class': 5,
    'eval_metric': "mlogloss",
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.3,
    'seed': 0,
    'tree_method': 'gpu_hist',       # needs a GPU-enabled XGBoost build
}

# The test matrix does not change between folds either.
xgtest2 = xgb.DMatrix(test_X)

# One entry per fold: that fold's class probabilities for every test row.
xgb_preds=[]
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2020)

for dev_index, val_index in kf.split(train_X):
    # KFold yields positional indices, so rows are selected with .iloc rather
    # than by label.
    dev_X, val_X = train_X.iloc[dev_index], train_X.iloc[val_index]
    dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]

    dtrain = xgb.DMatrix(dev_X,label=dev_y)
    dvalid = xgb.DMatrix(val_X, label=val_y)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    # Up to 2000 rounds; stop once the validation mlogloss has not improved for 50 rounds.
    model = xgb.train(param, dtrain, 2000, watchlist, early_stopping_rounds=50, verbose_eval=20)

    # NOTE(review): ntree_limit / best_ntree_limit belong to the older XGBoost
    # API; newer releases appear to use iteration_range instead. Confirm against
    # the installed version before upgrading.
    xgb_pred = model.predict(xgtest2, ntree_limit = model.best_ntree_limit)
    xgb_preds.append(list(xgb_pred))

In [None]:
# Feature-importance chart (top 80 features) for `model`, which at this point is
# the booster trained in the last cross-validation fold.
fig, ax = plt.subplots(figsize=(12, 12))
xgb.plot_importance(model, ax=ax, height=0.8, max_num_features=80)
plt.show()

In [None]:
# Average the per-fold XGBoost test predictions into a single table with one
# row per test sample and one column per class.
# Folds are added in their original order, one whole array at a time. This
# avoids rebinding the builtin `sum` (which would break later calls to sum())
# and avoids growing the result with np.vstack row by row.
fold_total = 0
for fold_pred in xgb_preds:
    fold_total = fold_total + np.asarray(fold_pred)

# Divide by the actual number of folds instead of a hard-coded 5.
preds = pd.DataFrame(fold_total / len(xgb_preds))

# NOTE(review): the CSV's first column is the positional row number with an
# empty header; `test_index` is not written. Confirm this matches the expected
# submission format.
preds.to_csv('submission.csv')