# INFO ABOUT COMPETITION
![books](https://cdn.pixabay.com/photo/2016/08/24/16/20/books-1617327_960_720.jpg)
<br>Image by Marisa Sias from Pixabay

### Goal 
The goal is to build algorithms to rate the complexity of reading passages for grade 3-12 classroom use. 

### Data
Files <br>
* train.csv - the training set
* test.csv - the test set
* sample_submission.csv - a sample submission file in the correct format<br>

Columns<br>
* id - unique ID for excerpt
* url_legal - URL of source - this is blank in the test set.
* license - license of source material - this is blank in the test set.
* excerpt - text to predict reading ease of
* target - reading ease
* standard_error - measure of spread of scores among multiple raters for each excerpt. Not included for test data.<br>

Note: 
Data includes excerpts from several time periods and a wide range of reading ease scores.<br>
Test set includes a slightly larger proportion of modern texts (the type of texts model should generalize to) than the training set.<br>
While licensing information is provided for the public test set, the hidden private test set includes only blank license/legal information.

### Evaluation
Submissions are scored on the RMSE - root mean squared error. <br>
* example of submission file:<br>
id,target<br>
eaf8e7355,0.0<br>
60ecc9777,0.5<br>
c0f722661,-2.0<br>
etc.<br>

# IMPORTS

In [None]:
# basic imports 
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt # visualization
import seaborn as sns # visualization
sns.set()

In [None]:
# image manipulation for word cloud
from PIL import Image 
from wordcloud import WordCloud
from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))

In [None]:
# neural networks
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

In [None]:
# getting image from url
from io import BytesIO 
import requests

In [None]:
#  Vectorizer for text data - Counts and Tfidf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Importing re for text preprocessing
import re

In [None]:
# spacy for text preprocessing (lemmatization, tokenization, NER, POS)
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
# Extreme Gradient Boosting Models
import xgboost as xgb

In [None]:
# Imports from sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# CONSTANTS

#### Paths to files with data and image for mask in wordcloud. Dictionary with NER and POS Tags for Spacy

In [None]:
# data paths
train_path = '../input/commonlitreadabilityprize/train.csv'
test_path = '../input/commonlitreadabilityprize/test.csv'

In [None]:
# images to use as mask in wordclouds generator
book_path = 'https://cdn.pixabay.com/photo/2013/04/01/21/30/book-99132_960_720.png'
book_path_2 = 'https://cdn.icon-icons.com/icons2/2622/PNG/512/book_icon_158035.png'

In [None]:
# Dictionary of POS Tags and NER Tags along with explanation used by spacy
GLOSSARY_POS = {
    # POS tags
    # Universal POS Tags
    # http://universaldependencies.org/u/pos/
    "ADJ": "adjective",
    "ADP": "adposition",
    "ADV": "adverb",
    "AUX": "auxiliary",
    "CONJ": "conjunction",
    "CCONJ": "coordinating conjunction",
    "DET": "determiner",
    "INTJ": "interjection",
    "NOUN": "noun",
    "NUM": "numeral",
    "PART": "particle",
    "PRON": "pronoun",
    "PROPN": "proper noun",
    "PUNCT": "punctuation",
    "SCONJ": "subordinating conjunction",
    "SYM": "symbol",
    "VERB": "verb",
    "X": "other",
    "EOL": "end of line",
    "SPACE": "space"}

GLOSSARY_NER = {
    # Named Entity Recognition
    # OntoNotes 5
    # https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
    "PERSON": "People, including fictional",
    "NORP": "Nationalities or religious or political groups",
    "FACILITY": "Buildings, airports, highways, bridges, etc.",
    "FAC": "Buildings, airports, highways, bridges, etc.",
    "ORG": "Companies, agencies, institutions, etc.",
    "GPE": "Countries, cities, states",
    "LOC": "Non-GPE locations, mountain ranges, bodies of water",
    "PRODUCT": "Objects, vehicles, foods, etc. (not services)",
    "EVENT": "Named hurricanes, battles, wars, sports events, etc.",
    "WORK_OF_ART": "Titles of books, songs, etc.",
    "LAW": "Named documents made into laws.",
    "LANGUAGE": "Any named language",
    "DATE": "Absolute or relative dates or periods",
    "TIME": "Times smaller than a day",
    "PERCENT": 'Percentage, including "%"',
    "MONEY": "Monetary values, including unit",
    "QUANTITY": "Measurements, as of weight or distance",
    "ORDINAL": '"first", "second", etc.',
    "CARDINAL": "Numerals that do not fall under another type",
}

In [None]:
# Dictionary with English contractions (don't, isn't, ...) used by the
# contraction-expanding function.
# NOTE: ambiguous contractions map to every alternative separated by " / ";
# that literal text is what gets substituted into the sentence.
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

# Regex matching any key of `contractions` (the key is both group 0 and group 1).
# - Keys are tried longest-first: regex alternation is ordered, so "can't" would
#   otherwise win over "can't've" and leave a dangling "'ve".
# - The lookarounds keep a key from matching inside a longer word, e.g. "it's"
#   inside the possessive "rabbit's" or "let's" inside "bracelet's".
# - Keys are escaped so they are always treated as literal text.
# Matching is case-sensitive, exactly like the dictionary lookup that follows it.
contractions_re = re.compile(
    r"(?<!\w)(%s)(?!\w)"
    % "|".join(re.escape(key) for key in sorted(contractions, key=len, reverse=True))
)


# FUNCTIONS

#### Preparing functions that will make text processing and data exploration easier

In [None]:
# Extracting the host name from the url_legal column
def clean_link(link):
    """Return the part of ``link`` that precedes the first '/'.

    Every "https://" and "http://" occurrence is removed first, so for an
    ordinary URL the result is its host name. Missing values (None / NaN)
    are handed back unchanged.
    """
    if pd.isnull(link):
        return link

    without_scheme = link.replace("https://", '').replace("http://", '')
    # str.split always yields a list with at least one element
    host, *_ = without_scheme.split('/')
    return host

In [None]:
# Reading image from url path for wordcloud generation
def read_img_from_url(url, timeout=30):
    """Download the image at ``url`` and return its pixels as an np.array.

    Parameters:
        url: address of the image file.
        timeout: seconds to wait for the server before giving up; without it
            a stalled connection would block the notebook indefinitely.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            HTML error page is never handed to the image decoder.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # The context manager releases the decoder once the pixels are copied out.
    with Image.open(BytesIO(response.content)) as img:
        return np.array(img)

In [None]:
# Regex heuristics used to estimate the number of syllables in a word

# Each maximal run of vowel letters (y included) is one candidate syllable
VOWEL_RUNS = re.compile("[aeiouy]+", flags=re.I)

# Endings whose vowel run is subtracted from the estimate
EXCEPTIONS = re.compile(
    "[^aeiou]e[sd]?$|"  # trailing e: smite, scared
    "[^e]ely$",         # adverbs: nicely
    flags=re.I
)

# Patterns that add one syllable to the estimate
ADDITIONAL = re.compile(
    # undo wrong subtractions made by EXCEPTIONS: smile, scarred, raises, fated
    "[^aeioulr][lr]e[sd]?$|[csgz]es$|[td]ed$|"
    # miscellaneous cases: flying, piano, video, prism, fire, evaluate
    ".y[aeiou]|ia(?!n$)|eo|ism$|[^aeiou]ire$|[^gq]ua",
    flags=re.I
)

def count_syllables(word):
    """Estimate the syllable count of ``word``; the result is never below 1.

    The estimate is: vowel runs, minus EXCEPTIONS matches, plus ADDITIONAL matches.
    """
    estimate = len(VOWEL_RUNS.findall(word))
    estimate -= len(EXCEPTIONS.findall(word))
    estimate += len(ADDITIONAL.findall(word))
    return estimate if estimate > 1 else 1

In [None]:
# Cleaning text
def lemma_txt(doc):
    """Join the lemmas of the alphabetic tokens of ``doc`` with single spaces.

    Tokens whose ``is_alpha`` flag is false are skipped, and so are tokens whose
    lemma is the '-PRON-' placeholder. Lemmas are used exactly as the tokens
    provide them; no additional case folding is applied here.
    """
    kept = []
    for token in doc:
        if not token.is_alpha or token.lemma_ == '-PRON-':
            continue
        kept.append(token.lemma_)
    return ' '.join(kept)

# Calculating basic text statistics
def counter(doc):
    """Return a dictionary with basic readability statistics of a parsed text.

    Only tokens flagged ``is_alpha`` count as words; sentences are taken from
    ``doc.sents``.

    Keys of the returned dictionary:
        - Words: count of words
        - Sentences: count of sentences
        - Chars: count of chars in the words
        - Syllables: count of syllables (estimated with count_syllables)
        - Diff_Words: count of difficult words (with more than 2 syllables)
        - Diff_Words_Perc: percentage of words with more than two syllables
        - Words_Per_Sent: avg. count of words per sentence
        - Syll_Per_Word: avg. count of syllables per word

    A ratio is reported as 0.0 when its denominator is zero (a text without
    words or without sentences) instead of raising or producing NaN.
    """
    words = [token.text for token in doc if token.is_alpha]
    # dtype=int keeps the sums integral even when the text has no words
    char_list = np.array([len(word) for word in words], dtype=int)
    syll_list = np.array([count_syllables(word) for word in words], dtype=int)
    word_count = len(words)
    sent_count = len(list(doc.sents))
    char_count = char_list.sum()
    syll_count = syll_list.sum()
    # "difficult" means more than two syllables, as documented above
    diff_word_count = np.sum(syll_list > 2)

    if word_count:
        diff_word_perc = round(diff_word_count/word_count,2)*100
        syll_per_word = round(syll_count/word_count,2)
    else:
        diff_word_perc = 0.0
        syll_per_word = 0.0
    word_per_sent = round(word_count/sent_count,2) if sent_count else 0.0

    counter_dict = {
        'Words' : word_count,
        'Sentences' : sent_count,
        'Chars' : char_count,
        'Syllables' : syll_count,
        'Diff_Words' : diff_word_count,
        'Diff_Words_Perc' : diff_word_perc,
        'Words_Per_Sent' : word_per_sent,
        'Syll_Per_Word' : syll_per_word,
    }

    return counter_dict

# Counting different Parts of Speech in text
def pos_counter(doc):
    """Return a dictionary with the count of each part-of-speech tag in the text.

    Keys are the human-readable explanations that ``spacy.explain`` gives for
    the tags of the loaded pipeline's tag map (the '_SP' tag is not
    pre-registered). Every known explanation starts at 0, so all documents
    share the same base set of keys.

    NOTE(review): ``nlp.tokenizer.vocab.morphology.tag_map`` looks like a
    spaCy v2 API - confirm the pinned spaCy version before upgrading.
    """
    pos_dict = {spacy.explain(tag): 0
                for tag in nlp.tokenizer.vocab.morphology.tag_map.keys()
                if tag != '_SP'}

    counts_dict = doc.count_by(spacy.attrs.IDS['TAG'])

    for tag_id, count in counts_dict.items():
        label = spacy.explain(doc.vocab[tag_id].text)
        # Several tags can map to one explanation (or to None when a tag has
        # no glossary entry), so add to the running total instead of
        # overwriting what an earlier tag already contributed.
        pos_dict[label] = pos_dict.get(label, 0) + count

    return pos_dict

# Counting named entities of each type in the text
def ner_counter(doc):
    """Return a dictionary mapping every GLOSSARY_NER label to its number of entities in ``doc``.

    Labels that do not occur in the text keep the value 0. An entity whose label
    is missing from GLOSSARY_NER raises KeyError.
    """
    ner_dict = dict.fromkeys(GLOSSARY_NER, 0)

    for entity in doc.ents:
        label = entity.label_
        ner_dict[label] = ner_dict[label] + 1

    return ner_dict

In [None]:
# Function that expands contractions (e.g. "don't" -> "do not")
def expand_contractions(s, contractions = contractions):
    """Return ``s`` with every contraction replaced by its expansion.

    Parameters:
        s: text to process.
        contractions: mapping contraction -> replacement text. The matching
            pattern is built from THIS mapping, so a custom dictionary is
            honoured (a pattern built from another dictionary could match a
            key the mapping does not contain and raise KeyError).

    Matching is case-sensitive. Keys are tried longest-first so "can't've" is
    not cut short by "can't", and a key is only replaced when it is not part of
    a longer word (so "it's" inside "rabbit's" is left alone).
    """
    if not contractions:
        return s

    # The re module caches compiled patterns, so repeated calls with the same
    # dictionary do not recompile.
    pattern = re.compile(
        r"(?<!\w)(%s)(?!\w)"
        % "|".join(re.escape(key) for key in sorted(contractions, key=len, reverse=True))
    )

    def replace(match):
        return contractions[match.group(0)]

    return pattern.sub(replace, s)


In [None]:
# Function that expands a dataframe holding a text column with "Clean_Text", text statistics, POS counts and NER counts
def preprocess_text(df_data, txt_col='excerpt'):
    """Return a copy of ``df_data`` extended with text-derived columns.

    Parameters:
        df_data: DataFrame containing the texts.
        txt_col: name of the column that holds the raw text.

    Returns:
        A new DataFrame with the original columns, a 'Clean_Text' column
        (lemma_txt), and one column per key returned by counter, pos_counter
        and ner_counter.

    The input frame is not modified. All derived columns carry the index of
    ``df_data``, so rows stay aligned even when that index is not the default
    0..n-1 range (e.g. after filtering or shuffling).
    """
    # Parse every text once and reuse the parsed documents for all statistics.
    docs = list(nlp.pipe(df_data[txt_col].tolist()))
    row_index = df_data.index

    dfs = [df_data.assign(Clean_Text=[lemma_txt(doc) for doc in docs]),
           pd.DataFrame([counter(doc) for doc in docs], index=row_index),
           pd.DataFrame([pos_counter(doc) for doc in docs], index=row_index),
           pd.DataFrame([ner_counter(doc) for doc in docs], index=row_index)]

    return pd.concat(dfs, axis=1)

In [None]:
# Lists with columns created using preprocess_text function
basic_stats = ['Chars','Words','Syllables','Sentences','Syll_Per_Word','Words_Per_Sent','Diff_Words']
POS_stats = [spacy.explain(i) for i in nlp.tokenizer.vocab.morphology.tag_map.keys() if i != '_SP']
NER_stats = list(GLOSSARY_NER.keys())

all_stats = basic_stats + POS_stats + NER_stats

# LOADING DATA

#### Let's load the training data and take a look at first few rows. I will also check the data shape and if we are dealing with any nulls. 

In [None]:
# Loading data
data_train = pd.read_csv(train_path)
display(data_train.head())

In [None]:
# Checking size of data
print('TRAIN DATA')
print('Samples:',data_train.shape[0])
print('Columns:',data_train.shape[1])

In [None]:
# Checking data types and amount of nulls
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would render an extra "None".
data_train.info()

#### Let's load the test data and also take a look in first few rows


In [None]:
# Loading test data and showing first 5 rows
data_test = pd.read_csv(test_path)
display(data_test.head())

In [None]:
# Checking size of test data
print("TEST DATA:")
print('Samples:',data_test.shape[0])
print('Columns:',data_test.shape[1])

#### Preprocessing the train and creating columns with text statistics

In [None]:
# cleaning the text and preparing different statistics
data_train = preprocess_text(data_train)

# Retrieving main website from url link
data_train['website'] = data_train['url_legal'].apply(clean_link)

# EXPLORATORY DATA ANALYSIS

#### I will start the exploratory analysis by reviewing the source website and license of the available texts

In [None]:
# Count of source websites and license for texts in training data
fig, axis = plt.subplots(2, figsize=(10,12))

sns.countplot(y='website',hue='website', data=data_train, dodge=False, ax=axis[0])
axis[0].set_title('Source Website Count',fontsize=16)
axis[0].get_legend().remove()

sns.countplot(y='license', hue='license',data=data_train, dodge=False, ax=axis[1])
axis[1].set_title('License Count',fontsize=16)
axis[1].get_legend().remove()

fig.tight_layout()
plt.show()

#### Then I will explore target distribution and standard error

In [None]:
# Inspecting target variable and standard error
display(data_train[['target','standard_error']].describe())

In [None]:
# Visualizing Target and Standard Error
fig, axis = plt.subplots(1,3,figsize=(14,5))

sns.histplot(x='target',kde=True, data=data_train,bins=100, ax=axis[0])
axis[0].set_title('Target Distribution', fontsize=16)

sns.histplot(x='standard_error',kde=True, data=data_train, ax=axis[1], color='darkred')
axis[1].set_title('Standard Error Distribution', fontsize=16)

sns.histplot(x='standard_error',kde=True, data=data_train.query('standard_error > 0.01'), ax=axis[2], color='darkred')
axis[2].set_title('Standard Error Distribution', fontsize=16)

plt.show()

It seems that the standard error has one atypical observation equal to 0, while the rest lie between 0.4 and 0.7. <br> 
The target has a distribution close to normal with a mean of about -1 and is slightly skewed towards lower values.

In [None]:
print('Std error above 0:',data_train[data_train['standard_error']>0].shape[0], 'samples')
print('Std error equal or below 0:',data_train[data_train['standard_error']<=0].shape[0], 'samples')

#### Let's review relationship between target and std error. It seems like the biggest std error is on the extremes.

In [None]:
# Plot of standard_error versus target - points colored by length of text
fig, ax = plt.subplots(figsize=(7,7))
sns.scatterplot(x='target', y='standard_error',hue='Chars', data=data_train,
                alpha=0.5, ax=ax, palette='viridis_r')
ax.set_title('Standard Error vs Target', fontsize=16)
ax.set_ylim([0.4,0.7])
plt.show()

#### Let's review the examples for the lowest and highest values of target 

In [None]:
# The most difficult to read text
display(data_train.sort_values(by='target')[['target','excerpt']].head(1).values)

In [None]:
# The easiest to read text
display(data_train.sort_values(by='target', ascending=False)[['target','excerpt']].head(1).values)

#### Let's take a look at correlation between target and statistics 

In [None]:
# Correlation between basic text stats and target
fig,ax = plt.subplots(figsize=(4,8))
ax.set_title('Correlation of Text Stats and Target', fontsize=16)
sns.heatmap(data_train[basic_stats+['target']].corr()[['target']].sort_values(by='target'),
            annot=True, fmt='.2f',
            vmin=-1, vmax=1,
            cmap='RdBu_r',
            ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# Correlation between POS stats and target
fig,ax = plt.subplots(figsize=(3,20))
ax.set_title('Correlation of POS and Target', fontsize=16)
sns.heatmap(data_train[POS_stats+['target']].corr()[['target']].sort_values(by='target'),
            annot=True, fmt='.2f',
            vmin=-1, vmax=1,
            cmap='RdBu_r',
            ax=ax)
plt.show()

In [None]:
# Correlation between NER stats and target
fig,ax = plt.subplots(figsize=(3,10))
ax.set_title('Correlation of NER and Target', fontsize=16)
sns.heatmap(data_train[NER_stats+['target']].corr()[['target']].sort_values(by='target'),
            annot=True, fmt='.2f',
            vmin=-1, vmax=1,
            cmap='RdBu_r',
            ax=ax)
plt.show()

#### Let's plot relationship between basic text stats and target and review their distribution

In [None]:
# Plot of avg. sentence length versus avg. word length - points colored by target
fig, ax = plt.subplots( figsize=(5,5))

fig.suptitle("Text Statistics", fontsize=18)

sns.scatterplot(y='Words_Per_Sent', x='Syll_Per_Word',hue='target',
                data=data_train, ax=ax)
ax.set_title('Avg. Word Length vs Avg Sentence Length', fontsize=16)

fig.tight_layout()
plt.show()

In [None]:
# statistics of text data
display(data_train[basic_stats].describe())

In [None]:
# Variance over mean - index of dispersion
(data_train[basic_stats].var()/data_train[basic_stats].mean()).abs()

In [None]:
# Variance over mean - index of dispersion
fig, ax = plt.subplots()
sns.barplot(x=(data_train[basic_stats].var()/data_train[basic_stats].mean()).abs(),ax=ax,
                  y=basic_stats)
fig.suptitle("Index of Dispersion - Variance Over Mean", fontsize=16)
ax.set_yticklabels(labels=basic_stats)
plt.show()

In [None]:
fig,ax = plt.subplots(1,2, figsize=(10,5))

fig.suptitle('Distribution of Avg. Word and Sentence Length in Texts', fontsize=18)
ax[0].set_title('Avg. Sentence Length', fontsize=16)
sns.histplot(x='Words_Per_Sent', data=data_train, bins=100,kde=True, ax=ax[0], color='green')

ax[1].set_title('Avg. Word Length', fontsize=16)
sns.histplot(x='Syll_Per_Word', data=data_train,bins=100,kde=True, ax=ax[1], color='blue')

fig.tight_layout()
plt.show()

In [None]:
fig,ax = plt.subplots(1,4, figsize=(20,5))

fig.suptitle('Distribution of Chars, Words and Sentences Count in Texts', fontsize=18)

ax[0].set_title('Word Count', fontsize=16)
sns.histplot(x='Words', data=data_train, bins=40,kde=True, ax=ax[0], color='red')

ax[1].set_title('Sentence Count', fontsize=16)
sns.histplot(x='Sentences', data=data_train, bins=40,kde=True, ax=ax[1], color='yellow')

ax[2].set_title('Syllable Count', fontsize=16)
sns.histplot(x='Syllables', data=data_train, bins=40,kde=True, ax=ax[2], color='violet')

ax[3].set_title('Char Count', fontsize=16)
sns.histplot(x='Chars', data=data_train, bins=40,kde=True, ax=ax[3], color='blue')

fig.tight_layout()
plt.show()

# TEXT CONTENT

#### Now I'm going to take a closer look at the content of the texts - calculating word frequency and POS / NER frequency for the whole corpus

## Word Clouds with word frequency

A word cloud is a useful and visually appealing way to see the most common words.

In [None]:
# Joining whole corpus to generate wordcloud
wc_data = ' '.join(data_train['Clean_Text'].tolist()).upper()

In [None]:
# instantiate a word cloud object
excerpt_cloud = WordCloud(
    background_color='white',
    max_words=2000,
    # reuse the stop-word set built once in the imports section
    stopwords=STOP_WORDS,
)
# generate the word cloud
excerpt_cloud.generate(wc_data);

In [None]:
# display the word cloud
fig, ax = plt.subplots(figsize=(14,7))
fig.suptitle('Word Cloud with Most Frequent Words', fontsize=20)
plt.imshow(excerpt_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

### Word Frequency

#### Let's calculate frequency for different ngrams

In [None]:
# Calculating word frequency - top 20 unigrams, bigrams and trigrams
def _top_ngrams(text, n, label, top_k=20):
    """Return a DataFrame with the ``top_k`` most frequent n-grams of ``text``.

    Parameters:
        text: the whole corpus as a single string.
        n: n-gram size (1 = unigrams, 2 = bigrams, ...).
        label: name of the column that holds the n-grams.
        top_k: number of n-grams to keep.

    English stop words are removed before the n-grams are built. The second
    column, 'count', holds the frequency of each n-gram.
    """
    vectorizer = CountVectorizer(stop_words=stopwords.words('english'),
                                 ngram_range=(n, n),
                                 max_features=top_k)
    counts = vectorizer.fit_transform([text])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    return pd.DataFrame({label: names, 'count': counts.toarray()[0]})

# Unigrams
top_uni = _top_ngrams(wc_data, 1, 'unigram')

# Bigrams
top_bi = _top_ngrams(wc_data, 2, 'bigram')

# Trigrams
top_tri = _top_ngrams(wc_data, 3, 'trigram')

In [None]:
fig, ax = plt.subplots(1,3, figsize=(16,7))
fig.suptitle('Most Frequent n-grams', fontsize=20)

sns.barplot(y='unigram',x='count', data=top_uni.sort_values(by='count'), palette='viridis', ax=ax[0])
ax[0].set_title('Unigrams', fontsize=16)
sns.barplot(y='bigram',x='count', data=top_bi.sort_values(by='count'), palette='magma', ax=ax[1])
ax[1].set_title('Bigrams', fontsize=16)
sns.barplot(y='trigram',x='count', data=top_tri.sort_values(by='count'), palette='inferno', ax=ax[2])
ax[2].set_title('Trigrams', fontsize=16)

fig.tight_layout()
plt.show()

## Part of Speech Statistics

In [None]:
# Display POS Statistics for dataset
fig, ax = plt.subplots(figsize=(10,14))
ax.set_title('Part of Speech Statistics for Training Data', fontsize=16)
data_train[POS_stats].sum().sort_values().plot.barh(ax=ax)
plt.show()

## Named Entity Statistics

In [None]:
# Display NER Statistics for dataset
fig, ax = plt.subplots(figsize=(10,12))
# This chart sums the named-entity columns, so it is titled accordingly
fig.suptitle('Named Entity Statistics for Training Data')
data_train[NER_stats].sum().sort_values().plot.barh(ax=ax)
plt.show()

In [None]:
# Let's calculate sample weights based on standard error
data_train['sample_weight']= 1.6 - data_train['standard_error']

sns.scatterplot(y='sample_weight', x='standard_error', data=data_train)
plt.show()

# NEURAL NETWORK

#### Building basic neural network that will consider text stats and Tfidf Vectors

In [None]:
# Preparing Tfidf Vectorizer
Tfidf = TfidfVectorizer(stop_words = stopwords.words('english'), max_df=0.995, min_df=0.005)

In [None]:
# Splitting the data into train and test
X = data_train[basic_stats+NER_stats+POS_stats+['Clean_Text']]
y = data_train[['target','sample_weight']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fitting vectorizer on the training split only, then applying it to the hold-out split
tfidf_train = Tfidf.fit_transform(X_train['Clean_Text'])
tfidf_test = Tfidf.transform(X_test['Clean_Text'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(Tfidf, 'get_feature_names_out'):
    tfidf_columns = Tfidf.get_feature_names_out()
else:
    tfidf_columns = Tfidf.get_feature_names()

# Preparing dataframes with features (text statistics followed by the TF-IDF terms).
# ignore_index=True on axis=1 relabels the resulting columns 0..n-1.
tf_train_df = pd.concat([X_train.drop(labels='Clean_Text', axis=1),
                         pd.DataFrame(tfidf_train.toarray(),index=X_train.index, columns=tfidf_columns)],
                         ignore_index=True,axis=1)

tf_test_df = pd.concat([X_test.drop(labels='Clean_Text', axis=1),
                        pd.DataFrame(tfidf_test.toarray(), index=X_test.index, columns=tfidf_columns)],
                       ignore_index=True, axis=1)

In [None]:
scale = StandardScaler()
pca = PCA(n_components=0.99)

In [None]:
train_sc = scale.fit_transform(X_train[all_stats])
train_pca = pca.fit_transform(train_sc)

In [None]:
test_sc = scale.transform(X_test[all_stats])
test_pca = pca.transform(test_sc)

In [None]:
my_train = np.hstack([train_pca, tfidf_train.toarray()])
my_test = np.hstack([test_pca, tfidf_test.toarray()])

In [None]:
plt.bar(x=range(pca.explained_variance_ratio_.shape[0]), height=pca.explained_variance_ratio_)
plt.show()

In [None]:
# Simple fully connected regressor: Dense/ReLU layers, each followed by batch
# normalisation (most also by 25% dropout), ending in a single linear output unit
model = Sequential([
    Dense(256, activation='relu', input_shape=(my_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.25),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.25),

    Dense(128, activation='relu'),
    BatchNormalization(),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.25),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.25),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.25),

    Dense(256, activation='relu'),
    BatchNormalization(),

    Dense(1),
])

In [None]:
model.summary()

In [None]:
# Compiling the model with RMSprop optimizer and huber loss
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
              loss=tf.keras.losses.huber,
              metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')])

In [None]:
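# Callbacks: halve the learning rate when the monitored loss plateaus for 5 epochs, and stop early
# after 15 epochs without val_loss improvement (restoring the best weights)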
callbacks = [tf.keras.callbacks.ReduceLROnPlateau(patience=5, factor=0.5),
             tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)]

In [None]:
my_train.shape

In [None]:
# Training the model
hist = model.fit(my_train, y_train['target'],
          batch_size=200,
          validation_data=(my_test, y_test['target']),
          sample_weight= y_train['sample_weight'],
          callbacks=callbacks,
          verbose=1,
          epochs=200)

In [None]:
metrics_v = pd.DataFrame(hist.history)[['rmse','val_rmse']]
metrics_v.plot()
plt.show()

In [None]:
loss_v = pd.DataFrame(hist.history)[['loss','val_loss']]
loss_v.plot()
plt.show()

In [None]:
y_pred = model.predict(my_test).reshape(-1)
RMSE = np.sqrt(mean_squared_error(y_test['target'], y_pred))
print('RMSE: ',RMSE)

In [None]:
fig,ax = plt.subplots()
sns.scatterplot(x=y_test['target'],y=y_pred, ax =ax , alpha=0.5)
ax.plot([-3.5,0,1.75],[-3.5,0,1.75],color='darkred')
ax.set_xlim([-4,2])
ax.set_ylim([-4,2])
plt.axis('Equal')
plt.show()

In [None]:
sns.jointplot(x=y_test['target'],y=y_pred, kind='resid')
plt.show()

# DUMMY REGRESSOR

#### For comparison I created a dummy regressor which always predicts the mean target of the training split

In [None]:
tr_mean = y_train['target'].mean()
y_brute_pred = np.ones_like(y_pred)*tr_mean

In [None]:
# Score the mean-only baseline against the TRUE hold-out targets; comparing it
# with the network's predictions would measure prediction spread, not error.
RMSE = np.sqrt(mean_squared_error(y_test['target'], y_brute_pred))
print('RMSE: ',RMSE)

# GENERATING SUBMISSION FILES

In [None]:
# Loading test data
data_test = pd.read_csv(test_path)

In [None]:
# predicting based on test set

# Text preprocessing
data_test = preprocess_text(data_test)

# Generating tfidf
X_new = data_test[basic_stats+NER_stats+POS_stats+['Clean_Text']]
new_test = Tfidf.transform(X_new['Clean_Text'])

# Preparing dataframes with features
new_sc = scale.transform(X_new[all_stats])
new_pca = pca.transform(new_sc)

new_feat = np.hstack([new_pca, new_test.toarray()])

preds = model.predict(new_feat).reshape(-1)

In [None]:
# Generating submission file
data_test['target']  = np.round(preds,2)
data_test[['id','target']].to_csv("submission.csv", index=False)

In [None]:
data_test[['id','target']]