# About this notebook

- Feature engineering: 40 features created, most of them with Textstat and spaCy
- XGBoost regressor training
- Feature importance analysis
- Submission

# Imports

In [None]:
import numpy as np
import pandas as pd

import sys
from sklearn import preprocessing
import collections
from tqdm import tqdm

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

import warnings
warnings.filterwarnings('ignore')

In [None]:
from __future__ import unicode_literals, print_function
from spacy.lang.en import English
import spacy

In [None]:
# needs internet or package as input data
# pip install textstat

In [None]:
# not using internet

# Make the local copies of textstat and pyphen under ../input importable.
# Order is kept: textstat directories first, then pyphen.
for package_dir in (
    "../input/textstat",
    "../input/textstat/textstat-master",
    "../input/pyphen",
    "../input/pyphen/Pyphen-master",
):
    sys.path.append(package_dir)

In [None]:
import pyphen
import textstat

# Train data

In [None]:
# Load the competition training set and preview the first rows.
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
train_df.head()

In [None]:
# Keep only the raw text and the regression target.
train_original = train_df.loc[:, ['excerpt', 'target']]
train_original

# Test data

In [None]:
# Load the competition test set; its 'id' column is reused when the
# submission file is written.
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test_df

# Feature engineering

[Textstat package](https://pypi.org/project/textstat/)

In [None]:
train_original.shape

In [None]:
%%time
def feat_eng(df):
    """Return a copy of ``df`` extended with 40 readability features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``excerpt`` column of strings. All other columns are
        carried over unchanged. Rows are processed by position, so the index
        does not have to be a default ``RangeIndex``.

    Returns
    -------
    pandas.DataFrame
        A new frame with these columns appended, in this order:

        * whole-excerpt textstat scores (``syllable_count`` ... ``crawford``)
          and ``text_standard`` as a numeric grade level;
        * ``word_count``: number of pieces when splitting on single spaces;
        * ``word_count_ns``, ``punctuation_count``, ``complicated_signals``,
          ``vocab_len``: token statistics from a blank spaCy English pipeline;
        * ``n_<TAG>``: part-of-speech counts from ``en_core_web_sm``;
        * ``n_sentences``, ``avg_words_per_sentence``,
          ``avg_words_ns_per_sentence`` and the max/min per-sentence Flesch
          reading ease and Flesch-Kincaid grade;
        * ``percentage_stopwords``:
          ``100 * (word_count - word_count_ns) / word_count``.

        Per-sentence statistics are NaN for an excerpt with no sentences.

    Notes
    -----
    The model must be trained and applied on features produced by the same
    version of this function. Feature files cached from an earlier revision
    (label-encoded ``text_standard``, per-sentence max/min taken from the last
    sentence only, ``complicated_signals`` always 0) are not comparable and
    should be regenerated.
    """
    df = df.copy() # .head(3) # head for testing/debugging
    texts = df['excerpt'].tolist()

    # Whole-excerpt textstat scores; every column is named after the textstat
    # function that produces it.
    textstat_scores = (
        'syllable_count', 'lexicon_count', 'sentence_count',
        'flesch_reading_ease', 'flesch_kincaid_grade', 'gunning_fog',
        'smog_index', 'automated_readability_index', 'coleman_liau_index',
        'linsear_write_formula', 'dale_chall_readability_score', 'crawford',
    )
    for score_name in textstat_scores:
        scorer = getattr(textstat, score_name)
        df[score_name] = [scorer(text) for text in texts]
    df['linsear_write_formula'] = df['linsear_write_formula'].round(3)

    # Ask textstat for the numeric grade directly. Label-encoding the string
    # form would assign different integers on every call (train vs. test).
    df['text_standard'] = [textstat.text_standard(text, float_output=True) for text in texts]

    word_counts = [len(text.split(' ')) for text in texts]
    df['word_count'] = word_counts

    # Loading the statistical model is expensive, so do it once per call
    # rather than once per row.
    pos_nlp = spacy.load('en_core_web_sm')
    pos_tags = ('ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN',
                'NUM', 'PART', 'PRON', 'PROPN', 'SCONJ', 'VERB')
    # PUNCT is left out on purpose: the original author found it to be the
    # same as 'punctuation_count'.
    signal_chars = {':', ';', '-', '&'}
    nan = float('nan')

    row_features = (
        ['word_count_ns', 'punctuation_count', 'complicated_signals', 'vocab_len']
        + ['n_' + tag for tag in pos_tags]
        + ['n_sentences', 'avg_words_per_sentence', 'avg_words_ns_per_sentence',
           'max_flesch_per_sentence', 'min_flesch_per_sentence',
           'max_flesch_kincaid_per_sentence', 'min_flesch_kincaid_per_sentence']
    )

    rows = []
    for text, n_words in tqdm(zip(texts, word_counts), total=len(texts)):
        # A fresh blank pipeline per excerpt keeps 'vocab_len' a per-excerpt
        # quantity: it is the size of this pipeline's vocab after tokenizing
        # only this text. A shared pipeline would make it grow with row order.
        blank_nlp = English()
        tokens = blank_nlp(text)
        n_words_ns = sum(1 for tok in tokens if not tok.is_stop and not tok.is_punct) # not stopwords & not punct

        row = {
            'word_count_ns': n_words_ns,
            'punctuation_count': sum(1 for tok in tokens if tok.is_punct),
            # Compare token text: Token objects from different Docs never
            # compare equal, so a membership test against another Doc is
            # always False.
            'complicated_signals': sum(1 for tok in tokens if tok.text in signal_chars),
            'vocab_len': len(tokens.vocab),
        }

        # POS: Parts of Speech
        parsed = pos_nlp(text)
        pos_counts = collections.Counter(tok.pos_ for tok in parsed)
        for tag in pos_tags:
            row['n_' + tag] = pos_counts[tag]

        # Sentences, with boundaries as set by the en_core_web_sm pipeline.
        sentences = [sent.text.strip() for sent in parsed.sents]
        n_sentences = len(sentences)
        row['n_sentences'] = n_sentences

        if n_sentences:
            # Score every sentence first, then take the extremes over the
            # whole excerpt.
            flesch = [textstat.flesch_reading_ease(sentence) for sentence in sentences]
            flesch_kincaid = [textstat.flesch_kincaid_grade(sentence) for sentence in sentences]
            row['avg_words_per_sentence'] = n_words / n_sentences
            row['avg_words_ns_per_sentence'] = n_words_ns / n_sentences
            row['max_flesch_per_sentence'] = max(flesch)
            row['min_flesch_per_sentence'] = min(flesch)
            row['max_flesch_kincaid_per_sentence'] = max(flesch_kincaid)
            row['min_flesch_kincaid_per_sentence'] = min(flesch_kincaid)
        else:
            # No sentence found: leave the statistics missing rather than
            # dividing by zero.
            row['avg_words_per_sentence'] = nan
            row['avg_words_ns_per_sentence'] = nan
            row['max_flesch_per_sentence'] = nan
            row['min_flesch_per_sentence'] = nan
            row['max_flesch_kincaid_per_sentence'] = nan
            row['min_flesch_kincaid_per_sentence'] = nan

        rows.append(row)

    # Attach the per-row features positionally, one float column each.
    for feature in row_features:
        df[feature] = np.array([row[feature] for row in rows], dtype=float)

    df['avg_words_per_sentence'] = df['avg_words_per_sentence'].round(2)
    df['avg_words_ns_per_sentence'] = df['avg_words_ns_per_sentence'].round(2)

    df['percentage_stopwords'] = round(100*(df['word_count'] - df['word_count_ns'])/df['word_count'], 2)

    ### add more features?
    # percentage of verbs, nouns...?
    # max ADJ per sentence...
    # avg, max, min words
    # max/min flesch (ok)
    # max/min other textstat features

    return df
    

#### Uncomment these 3 lines below to generate the features
# train = feat_eng(train_original)
# train.to_csv('./train_features.csv', index=False) # export train features to csv file
# train.shape

#### ~39 min to generate train features

In [None]:
# reading train features created in previous run (saves ~40 min)
# NOTE: this cached file must have been produced by the same version of
# feat_eng() that is applied to the test set further down; otherwise the train
# and test feature values are not comparable.
train = pd.read_csv('../input/train-features-40/train_features_40.csv')

In [None]:
train.columns

In [None]:
train
# train[['min_flesch_per_sentence', 'min_flesch_kincaid_per_sentence']]
# train[['punctuation_count', 'n_PUNCT']]

In [None]:
# Every column except the raw text and the label is used as a model input.
non_feature_columns = ('excerpt', 'target')
features = [column for column in train.columns if column not in non_feature_columns]
len(features)

In [None]:
train.groupby(['text_standard'])['target'].count() # imbalance?? still need to check these categories...

# XGB Regressor

In [None]:
# Hold out 10% of the training rows for validation. Despite their names,
# X_test / y_test are this hold-out split, not the competition test set.
X_train, X_test, y_train, y_test = train_test_split(train[features], train['target'], test_size=0.1, random_state=42)

In [None]:
## check data types
# train.info()

In [None]:
%%time
# XGBoost regressor with squared-error objective and a fixed seed
xgb_reg = xgb.XGBRegressor(objective ='reg:squarederror', random_state = 42)

# Fit on the training split only, so the hold-out rows stay unseen
xgb_reg.fit(X_train, y_train)

preds = xgb_reg.predict(X_test)

# `mse` is sklearn's mean_squared_error; take the root to report RMSE
rmse = np.sqrt(mse(y_test, preds))
print("RMSE: %f" % (rmse))

In [None]:
%%time
# Refit the same regressor on every training row (hold-out split included)
# before predicting the competition test set.
xgb_reg.fit(train[features], train['target'])

# Test features are computed on the fly; selecting `features` by name keeps
# the columns in the same order the model was trained on.
test = feat_eng(test_df)
test_pred = xgb_reg.predict(test[features])
test_pred

In [None]:
test.shape, test_pred.shape

In [None]:
from xgboost import plot_importance
from matplotlib import pyplot as plt

# NOTE: this changes the default figure size for every figure drawn after
# this cell, not only for the importance plot.
plt.rcParams["figure.figsize"] = (22, 16)
# Plot the importances of the regressor fitted above
plot_importance(xgb_reg)
plt.show()

In [None]:
xgb_reg.feature_importances_

# Submission

In [None]:
# Pair each test id with its predicted target and write the submission file.
predictions = pd.DataFrame({'id': test_df['id'], 'target': test_pred})
predictions.to_csv("/kaggle/working/submission.csv", index=False)
predictions