In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline


from wordcloud import WordCloud


from collections import Counter
import os
import numpy as np
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams

# Fetch every NLTK resource the notebook relies on so that a fresh kernel can
# run top to bottom: 'punkt' for word_tokenize, 'stopwords' for the stop-word
# list, and 'wordnet' for WordNetLemmatizer (used by the lemmatize_* helpers).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import html
import unicodedata

# English stop-word list from NLTK; normalize_text passes it to remove_stopwords.
stop_words = stopwords.words('english')
# Ask the inline matplotlib backend for 'retina' figure output.
%config InlineBackend.figure_format = 'retina'


In [None]:
def wordcloud(text, ngram=1):
    """Render a word cloud with matplotlib.

    :param text: when ``ngram == 1``, an iterable of strings that is joined
        with spaces and handed to ``WordCloud.generate``; otherwise a mapping
        of term -> frequency handed to ``WordCloud.generate_from_frequencies``.
    :param ngram: 1 selects the raw-text path, any other value selects the
        frequency-mapping path.
    """
    cloud = WordCloud(
        width=1400,
        height=800,
        random_state=2021,
        background_color='black',
    )
    if ngram != 1:
        cloud.generate_from_frequencies(text)
    else:
        cloud.generate(' '.join(text))

    # Black figure background so it blends with the cloud's own background.
    plt.figure(figsize=(12, 6), facecolor='k')
    plt.imshow(cloud)
    plt.axis('off')
    plt.tight_layout(pad=0)
      

def get_n_grans_count(text, n_grams, min_freq):
    """Count the n-grams of ``text`` that occur more than ``min_freq`` times.

    :param text: raw string; it is tokenized with ``nltk.word_tokenize``.
    :param n_grams: size of the n-gram (2 for bigrams, 3 for trigrams, and any
        other integer >= 1).
    :param min_freq: exclusive threshold; only n-grams whose count is strictly
        greater than this value are kept.
    :return: dict mapping the space-joined n-gram to its count.
    :raises ValueError: if ``n_grams`` is smaller than 1.
    """
    if n_grams < 1:
        # Fail loudly instead of handing back an error string that callers
        # would then try to use as a dict.
        raise ValueError(f"n_grams must be a positive integer, got {n_grams!r}")

    tokens = nltk.word_tokenize(text)

    # Frequency distribution over every n-gram (tuples of tokens) in the text.
    fdist = nltk.FreqDist(nltk.ngrams(tokens, n_grams))

    return {
        ' '.join(gram): count
        for gram, count in fdist.items()
        if count > min_freq
    }
    
def remove_special_chars(text):
    """Lower-case ``text`` and repair common markup/escape residue.

    The literal replacements below are applied in order, then HTML entities
    are unescaped and every run of two or more spaces becomes a single space.
    """
    # Order matters: each replacement runs on the output of the previous one.
    substitutions = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),
        ('<unk>', 'u_n'),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),
    )
    cleaned = text.lower()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return re.sub(r'  +', ' ', html.unescape(cleaned))


def remove_non_ascii(text):
    """Return ``text`` (a string) with every non-ASCII character dropped.

    The string is NFKD-normalized first, so accented letters keep their
    ASCII base letter while the combining marks are discarded.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


def to_lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered



def remove_punctuation(text):
    """Return ``text`` (a string) with every ``string.punctuation`` character removed."""
    # Mapping a code point to None makes str.translate delete that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)


def replace_numbers(text):
    """Delete every run of digits from ``text`` (nothing is substituted in its place)."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)


def remove_whitespaces(text):
    """Return ``text`` without leading or trailing whitespace."""
    stripped = text.strip()
    return stripped


def remove_stopwords(words, stop_words):
    """Return the items of ``words`` that are not stop words, order preserved.

    :param words: iterable of tokens.
    :param stop_words: iterable of tokens to drop (e.g. NLTK's English
        stop-word list). It is materialized once, so a one-shot iterator is
        handled correctly and each lookup is a set membership test rather
        than a linear scan.
    :return: list of the kept tokens.
    """
    candidates = list(stop_words)
    try:
        blocked = frozenset(candidates)
    except TypeError:
        # Unhashable entries: fall back to plain list membership.
        blocked = candidates
    return [word for word in words if word not in blocked]


def stem_words(words):
    """Return a list with the Porter stem of every token in ``words``."""
    porter = PorterStemmer()
    stems = []
    for token in words:
        stems.append(porter.stem(token))
    return stems

def lemmatize_words(words):
    """Return a list with the WordNet lemma of every token in ``words``.

    ``lemmatize`` is called without a ``pos`` argument, i.e. with the
    lemmatizer's default part of speech.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, words))

def lemmatize_verbs(words):
    """Lemmatize every token in ``words`` as a verb.

    Unlike the other token helpers, this returns a single string: the
    lemmas joined with one space.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in words:
        lemmas.append(wordnet.lemmatize(token, pos='v'))
    return ' '.join(lemmas)

def text2words(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the tokens."""
    tokens = word_tokenize(text)
    return tokens

def normalize_text(text):
    """Run the full cleaning pipeline on one raw string.

    String-level steps run first (special characters, non-ASCII, punctuation,
    lower-casing, digits); the result is tokenized, stop words are removed and
    the tokens are lemmatized, first with the default part of speech and then
    as verbs.

    :return: the cleaned text as a single string.
    """
    string_steps = (
        remove_special_chars,
        remove_non_ascii,
        remove_punctuation,
        to_lowercase,
        replace_numbers,
    )
    for step in string_steps:
        text = step(text)

    tokens = remove_stopwords(text2words(text), stop_words)
    # Stemming (stem_words) is the alternative to lemmatizing; only one is used.
    lemmas = lemmatize_verbs(lemmatize_words(tokens))

    # lemmatize_verbs already returns a space-joined string, so this join
    # leaves it unchanged.
    return ''.join(lemmas)

**Goal: build algorithms that rate the complexity of reading passages for grade 3-12 classroom use.**

In [None]:
# Directory holding the competition CSV files; change it in this one place to
# run the notebook against a different data location.
INPUT_DIR = '../input/commonlitreadabilityprize'

train_data = pd.read_csv(f'{INPUT_DIR}/train.csv')
test_data = pd.read_csv(f'{INPUT_DIR}/test.csv')
submission = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')

train_data.head()

## EDA

In [None]:
# Summary statistics of the training frame.
train_data.describe()

In [None]:
# Per-column flag: True when the column contains at least one null value.
train_data.isnull().any()

In [None]:
# Number of null values in each column.
train_data.isnull().sum()

## Target and Standard Error Distributions

In [None]:
# Side-by-side histograms (with KDE overlay) of the target and its standard error.
fig, ax = plt.subplots(1, 2, figsize=(12, 7))
for panel, column, heading in zip(
    ax,
    ('target', 'standard_error'),
    ("Target Distribution", "Standard Error Distribution"),
):
    sns.histplot(train_data[column], kde=True, ax=panel)
    panel.set_title(heading)
plt.show();

## license

In [None]:
# Frequency of each distinct license value.
train_data['license'].value_counts()

In [None]:
# Horizontal count plot: one bar per license value.
plt.figure(figsize=(12, 6))
license_axes = sns.countplot(y='license', data=train_data)
license_axes.set_title('License Distribution')
plt.show();

## Url legal

In [None]:
# Peek at the shape of the non-null url_legal values.
urls = train_data['url_legal'].dropna().tolist()
urls[:5]

In [None]:
# Extract the host name from every non-null URL.
# str.extract returns NaN for a value without an http(s):// prefix; those rows
# are dropped instead of raising IndexError as `re.findall(...)[0]` would.
url_list = (
    train_data['url_legal']
    .dropna()
    .str.extract(r'https?://([A-Za-z_0-9.-]+)', expand=False)
    .dropna()
    .tolist()
)
url_list[:10]

In [None]:
# Count how often each site occurs, most frequent first.
urls_counts = Counter(url_list)
urls_counts_sorted = urls_counts.most_common()
urls_counts_df = pd.DataFrame(urls_counts_sorted, columns=['sites', 'counts'])
urls_counts_df

In [None]:
# Horizontal bar chart of occurrences per site.
plt.figure(figsize=(12, 6))
sites_axes = sns.barplot(x='counts', y='sites', data=urls_counts_df)
sites_axes.set_title('Unique Sites count')
plt.show();

## excerpt

**The original text**

In [None]:
# Raw excerpt stored under index label 0.
train_data['excerpt'][0]

**The cleaned text**

In [None]:
# The same excerpt after running it through the normalize_text pipeline.
normalize_text(train_data['excerpt'][0])

**Adding the clean text to the data frame**

In [None]:
# Store the normalized version of every excerpt in a new column.
train_data['clean_text'] = train_data['excerpt'].apply(normalize_text)
train_data.head()

In [None]:
# Apply the same preprocessing to the test excerpts.
# Note: this overwrites the 'excerpt' column of test_data in place.
test_data['excerpt'] = test_data['excerpt'].apply(normalize_text)

**Frequent words**

In [None]:
# Concatenate every cleaned excerpt into one large text, then tokenize it.
# The excerpts are joined with a space so the last word of one excerpt is not
# fused with the first word of the next.
words_list = text2words(' '.join(train_data['clean_text']))
words_list[:10]


In [None]:
# Total number of tokens in words_list (repeated words are counted each time).
len(words_list)

In [None]:
# Frequencies of the 30 most common words.
words_list_freq = Counter(words_list)
words_list_freq_sorted = words_list_freq.most_common()

words_list_freq_sorted_df = pd.DataFrame(
    words_list_freq_sorted, columns=['words', 'counts']
).head(30)
words_list_freq_sorted_df.head()

In [None]:
# Horizontal bar chart of the most frequent words.
plt.figure(figsize=(12, 6))
words_axes = sns.barplot(x='counts', y='words', data=words_list_freq_sorted_df)
words_axes.set_title('Top 30 frequent words')
plt.show();

**Word Cloud for all words**

In [None]:
# Word cloud built from the raw 'excerpt' column (not from 'clean_text').
wordcloud(train_data['excerpt'])

**Bigrams**

In [None]:
# All cleaned excerpts as one space-separated string (input for the n-gram counts).
text = ' '.join(train_data['clean_text'])

In [None]:
# Bigrams occurring more than 10 times, most frequent first.
two_grams = get_n_grans_count(text, n_grams=2, min_freq=10)
two_grams_df = (
    pd.DataFrame(two_grams.items(), columns=['two_grams', 'counts'])
    .sort_values(by='counts', ascending=False)
)
two_grams_df.head()

In [None]:
# Horizontal bar chart of the 30 most frequent bigrams.
plt.figure(figsize=(12, 6))
bigram_axes = sns.barplot(x='counts', y='two_grams', data=two_grams_df.head(30))
bigram_axes.set_title('Top 30 frequent bigram')
plt.show();

In [None]:
# Replace the space inside each bigram with an underscore before drawing the
# frequency-based word cloud.
two_grams_wordcloud = {
    phrase.replace(' ', '_'): count
    for phrase, count in two_grams.items()
}
wordcloud(two_grams_wordcloud, ngram=2)

### Modeling

Before going further it is important that we split the data into training and validation sets. We can do it using train_test_split from the model_selection module of scikit-learn.

In [None]:
# Features are the cleaned excerpts; the label is the readability target.
X, y = train_data['clean_text'], train_data['target']

# 70/30 shuffled split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)

In [None]:
# Sizes of the training and validation splits, one per line.
print(X_train.shape, X_valid.shape, sep='\n')

**Our first model is a simple TF-IDF (Term Frequency - Inverse Document Frequency) vectorizer followed by a simple Ridge regression.**

In [None]:
# Scikit-learn pipeline: TF-IDF features followed by Ridge regression.
# `normalize` is no longer passed: it was deprecated and then removed from
# Ridge, and its old default (False) matches the previous behaviour here.
ridge = Ridge(fit_intercept=True)

ridge_pipline = make_pipeline(
    # Word 1- to 3-grams seen in at least 3 documents; the boolean options are
    # given as real booleans, not ints.
    TfidfVectorizer(binary=True, min_df=3, max_features=None,
                    strip_accents='unicode', analyzer='word',
                    ngram_range=(1, 3), use_idf=True, smooth_idf=True,
                    sublinear_tf=True),
    ridge,
)

# training
ridge_pipline.fit(X_train, y_train)

# Evaluation (mean_squared_error expects y_true first, then y_pred)
y_pred = ridge_pipline.predict(X_valid)
mse_loss = mean_squared_error(y_valid, y_pred)

print(f"MSE Loss using Ridge and TfIdfVectorizer: {mse_loss}")

In [None]:
# Scikit-learn pipeline: TF-IDF features followed by an XGBoost regressor.
xgboost = xgb.XGBRegressor(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                           subsample=0.8, nthread=10, learning_rate=0.1)

xgboost_pipline = make_pipeline(
    # Same vectorizer settings as the Ridge pipeline; the boolean options are
    # given as real booleans, not ints.
    TfidfVectorizer(binary=True, min_df=3, max_features=None,
                    strip_accents='unicode', analyzer='word',
                    ngram_range=(1, 3), use_idf=True, smooth_idf=True,
                    sublinear_tf=True),
    xgboost,
)

# training
xgboost_pipline.fit(X_train, y_train)

# Evaluation (mean_squared_error expects y_true first, then y_pred)
y_pred = xgboost_pipline.predict(X_valid)
mse_loss = mean_squared_error(y_valid, y_pred)

print(f"MSE Loss using xgboost and TfIdfVectorizer: {mse_loss}")

**Seems like no luck with XGBoost!**

### Submission

In [None]:
# Predict on the (already normalized) test excerpts with the Ridge pipeline
# and write the id/target pairs to submission.csv.
test_text = test_data['excerpt']
test_pred = ridge_pipline.predict(test_text)

submission = pd.DataFrame({'id': test_data['id'], 'target': test_pred})
submission.to_csv("submission.csv", index=False)
submission