In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import lightgbm as lgb
import matplotlib.pyplot as plt
import nltk
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import shap
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from stop_words import get_stop_words

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#all functions 
def syllable_count(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Every maximal run of vowels (``a e i o u y``) counts as one syllable, a
    trailing ``e`` is treated as silent, and any non-empty word is credited
    with at least one syllable.

    Parameters
    ----------
    word : str
        A single token; matching is case-insensitive.

    Returns
    -------
    int
        Estimated syllable count, or ``0`` for an empty string.
    """
    word = word.lower()
    if not word:
        # Guard: indexing the first character of an empty token would raise.
        return 0
    vowels = "aeiouy"
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a run starts a new syllable.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel
    if word.endswith("e"):
        # Trailing "e" is assumed silent.
        count -= 1
    # Vowel-less tokens (and words like "the") still count as one syllable.
    return max(count, 1)

def NounCount(string):
    """Return how many tokens of ``string`` NLTK tags as ``NOUN`` (universal tagset)."""
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(string), tagset='universal')
    return sum(1 for _token, tag in tagged_tokens if tag == 'NOUN')

def VerbCount(string):
    """Return how many tokens of ``string`` NLTK tags as ``VERB`` (universal tagset)."""
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(string), tagset='universal')
    return sum(1 for _token, tag in tagged_tokens if tag == 'VERB')

def AdjCount(string):
    """Return how many tokens of ``string`` NLTK tags as ``ADJ`` (universal tagset)."""
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(string), tagset='universal')
    return sum(1 for _token, tag in tagged_tokens if tag == 'ADJ')

def PronounCount(string):
    """Return how many tokens of ``string`` NLTK tags as ``PRON`` (universal tagset)."""
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(string), tagset='universal')
    return sum(1 for _token, tag in tagged_tokens if tag == 'PRON')

def IndependentClause(string):
    """Count adjacent NOUN -> VERB tag pairs in ``string``.

    The text is tokenized and tagged with NLTK's universal tagset; every
    position where a ``NOUN`` token is immediately followed by a ``VERB``
    token adds one to the returned count.
    """
    tags = [
        tag
        for _word, tag in nltk.pos_tag(nltk.word_tokenize(string), tagset='universal')
    ]
    return sum(
        1
        for current, following in zip(tags, tags[1:])
        if current == 'NOUN' and following == 'VERB'
    )

def leaves(tree):
    """Count the NP and VP chunks in a chunk tree.

    Parameters
    ----------
    tree
        A chunk tree such as the one returned by ``nltk.RegexpParser.parse``.
        It only needs a ``subtrees(filter=...)`` method whose nodes expose
        ``label()``.

    Returns
    -------
    tuple of (int, int)
        ``(NPCount, VPCount)``: the number of subtrees labelled ``'NP'`` and
        ``'VP'`` respectively.
    """
    NPCount = sum(1 for _ in tree.subtrees(filter=lambda t: t.label() == 'NP'))
    VPCount = sum(1 for _ in tree.subtrees(filter=lambda t: t.label() == 'VP'))
    return (NPCount, VPCount)

def Phrases(excerpt):
    '''
    Summarise the noun-phrase (NP) / verb-phrase (VP) structure of a passage.

    The passage is split into sentences on '.', each sentence is POS-tagged
    and chunked with a small regular-expression grammar, and the NP / VP
    chunks are counted with ``leaves``.
    Idea from this paper: https://www.cs.utexas.edu/~ml/papers/kate.coling10.pdf

    Parameters
    ----------
    excerpt : str
        The passage to analyse.

    Returns
    -------
    pd.Series
        Four values, in this order: mean NP per sentence, mean VP per
        sentence, proportion of sentences without any VP, and mean number of
        phrases (NP + VP) per sentence. All four are NaN when the passage
        contains no usable sentence.
    '''
    # Line breaks become spaces so the words on either side are not glued
    # into a single token; whitespace-only fragments are not sentences.
    document = [
        x for x in excerpt.replace('\n', ' ').split('.')
        if len(x) > 1 and x.strip()
    ]
    if not document:
        # No sentence to average over: the means are undefined.
        return pd.Series([np.nan] * 4)

    # Rule for NP chunk and VB Chunk
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        VP:
            {<RB.?>*<VB.?>*<JJ>*<VB.?>+<VB>?} # Verbs and Verb Phrases
    """
    # The grammar never changes, so the chunker is built once per passage.
    cp = nltk.RegexpParser(grammar)

    # One (NP count, VP count) pair per sentence.
    Results = [
        leaves(cp.parse(nltk.pos_tag(nltk.word_tokenize(doc))))
        for doc in document
    ]
    sents = len(Results)
    NP = sum(np_count for np_count, _ in Results)
    VP = sum(vp_count for _, vp_count in Results)
    Miss = sum(1 for _, vp_count in Results if vp_count == 0)
    # Total phrases is simply NP + VP; accumulating the running totals inside
    # the loop would count earlier sentences again and again.
    Total = NP + VP
    return pd.Series([NP/sents, VP/sents, Miss/sents, Total/sents])

def GrunningFog(excerpt):
    """
    Estimate the grade level of a passage with the Gunning fog index.

    Sentences are the '.'-separated pieces of the excerpt (newlines are turned
    into spaces and pieces of length <= 1 are skipped). Every NLTK token of a
    sentence counts as a word; a token is "complex" when its lemmatized form
    has three or more syllables according to ``syllable_count``.

    Returns 0.4 * (mean tokens per sentence + 100 * complex tokens / all tokens).
    """
    sentences = [piece for piece in excerpt.replace('\n', ' ').split('.') if len(piece) > 1]
    lemmatizer = nltk.stem.WordNetLemmatizer()
    sentence_lengths = []
    complex_per_sentence = []
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        sentence_lengths.append(len(tokens))
        lemmas = [lemmatizer.lemmatize(token) for token in tokens]
        complex_flags = [int(syllable_count(lemma) >= 3) for lemma in lemmas]
        complex_per_sentence.append(np.sum(complex_flags))
    mean_sentence_length = np.mean(sentence_lengths)  # average words per sentence
    # proportion of complex words (>= 3 syllables)
    complex_share = np.sum(complex_per_sentence) / np.sum(sentence_lengths)
    return 0.4 * (mean_sentence_length + (100 * complex_share))

def SMOG(excerpt):
    """
    Compute the SMOG readability grade of a passage.

    Sentences are the '.'-separated pieces of the excerpt (newlines are turned
    into spaces and pieces of length <= 1 are skipped). A token is
    polysyllabic when ``syllable_count`` reports three or more syllables.

    Parameters
    ----------
    excerpt : str
        The passage to score.

    Returns
    -------
    float
        ``1.0430 * sqrt(polysyllables * 30 / sentences) + 3.1291``, or NaN
        when the passage contains no usable sentence.
    """
    document = [x for x in excerpt.replace('\n', ' ').split('.') if len(x) > 1]
    if not document:
        # Without sentences the 30 / sentences normalisation is undefined.
        return np.nan
    polysyllables = 0
    for sentence in document:
        tokens = nltk.word_tokenize(sentence)
        polysyllables += sum(1 for token in tokens if syllable_count(token) >= 3)
    SMOGScore = (1.0430 * np.sqrt(polysyllables * (30 / len(document)))) + 3.1291
    return SMOGScore

In [None]:
# Read the competition's training set, test set and sample submission.
trainSet, testSet, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
        '../input/commonlitreadabilityprize/sample_submission.csv',
    )
)

In [None]:
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line per frame.
trainSet.info()
testSet.info()
sample.info()

In [None]:
# Histograms of the target (top) and of its standard error (bottom).
fig, ax = plt.subplots(nrows=2)
target_axis, error_axis = ax
trainSet['target'].hist(ax=target_axis)
trainSet['standard_error'].hist(ax=error_axis)
plt.show()

In [None]:
# Peek at the first five training rows (5 is head()'s default).
trainSet.head()

In [None]:
# Feature engineering on the training excerpts.
# Stop words are upper-cased because the excerpts are upper-cased before matching.
stop_words = [word.upper() for word in get_stop_words('english')]
train_text = trainSet['excerpt']

# Layout features: paragraphs are '\n'-separated, words ' '-separated, sentences '.'-separated.
trainSet['Paragraphs'] = train_text.map(lambda text: len(text.split('\n')))
trainSet['Words'] = train_text.map(lambda text: len(text.replace('\n', ' ').split(' ')))
trainSet['AvgWordsPerPar'] = train_text.map(
    lambda text: np.mean([len(par.split(' ')) for par in text.split('\n')]))
trainSet['AvgSentPerPar'] = train_text.map(
    lambda text: np.mean([len(par.split('.')) for par in text.split('\n')]))

# Flesch reading ease from average sentence length (ASL) and average syllables per word (ASW).
trainSet['ASL'] = train_text.map(
    lambda text: np.mean([len(sent.split(' ')) for sent in text.replace('\n', '').split('.')]))
trainSet['ASW'] = train_text.map(
    lambda text: np.mean([
        syllable_count(tok) if tok else 0
        for tok in text.replace('\n', '').replace('.', '').split(' ')
    ]))
trainSet['FleschEase'] = 206.835 - (1.015 * trainSet['ASL']) - (84.6 * trainSet['ASW'])

# Stop-word features.
trainSet['NonStopWords'] = train_text.map(
    lambda text: sum(tok not in stop_words for tok in text.upper().replace('\n', ' ').split(' ')))
trainSet['StopWords'] = train_text.map(
    lambda text: sum(tok in stop_words for tok in text.upper().replace('\n', ' ').split(' ')))
trainSet['ratio'] = trainSet['NonStopWords'].div(trainSet['StopWords'])

# Part-of-speech counts.
trainSet['Nouns'] = train_text.map(NounCount)
trainSet['Verbs'] = train_text.map(VerbCount)
trainSet['Adjectives'] = train_text.map(AdjCount)
trainSet['Pronouns'] = train_text.map(PronounCount)

# Clause / phrase structure and classic readability indices.
trainSet['ClausePerSentence'] = train_text.map(
    lambda text: np.mean([IndependentClause(sent) for sent in text.replace('\n', '').split('.')]))
trainSet[['AvgNP','AvgVP','AvgMissingVP','AvgAllPhrase']] = train_text.apply(Phrases)
trainSet['GrunFog'] = train_text.map(GrunningFog)
trainSet['SMOG'] = train_text.map(SMOG)
print(trainSet.head(10))

In [None]:
# Pairwise scatter plots / histograms of the columns of the engineered training frame.
sns.pairplot(data=trainSet)

In [None]:
# Try to model something: gradient-boosted trees on the engineered features.
RANDOM_STATE = 42  # fixed seed so the hold-out split (and the scores) are reproducible

x_train,x_test,y_train,y_test = train_test_split(
    trainSet.drop(columns = ['id','url_legal','license','excerpt','target','standard_error']),
    trainSet['target'],
    train_size = 0.75,
    random_state = RANDOM_STATE)
# Not consumed by the scikit-learn style estimator below; kept available for
# experiments with the native lightgbm training API.
lgb_train = lgb.Dataset(x_train,y_train, free_raw_data=False)

lgbm = lgb.LGBMRegressor(boosting_type='gbdt',objective='regression',importance_type= 'gain',n_estimators = 500)
parameters = {'max_depth':[4],'learning_rate':[0.01],'num_leaves':[2**4]}

def rmse_metric(actual,pred):
    """Root mean squared error between ``actual`` and ``pred``."""
    return np.sqrt(metrics.mean_squared_error(actual,pred))

# greater_is_better=False makes scikit-learn negate the metric, so every
# "score" printed below is -RMSE (closer to zero is better).
rmse = metrics.make_scorer(rmse_metric,greater_is_better=False)
gbm = GridSearchCV(lgbm,parameters,cv = 5,scoring = rmse)
# No eval_set / early stopping here: monitoring the training data itself can
# never signal over-fitting, and the early_stopping_rounds / verbose fit
# keywords were removed in LightGBM 4. To add real early stopping, hold out a
# validation split and pass it through eval_set with an early-stopping callback.
gbm.fit(x_train,y_train)
print(f'Best model score is {gbm.best_score_}')
print(f'Best model parameters are\n{gbm.best_params_}')
print(f'Score on the test hold out is {gbm.score(x_test,y_test)}')


In [None]:
# Model evaluation: gain-based feature importance, then SHAP explanations.
def _shrink_tick_labels(figure, labelsize=7):
    """Set the tick-label font size on every axes of ``figure``."""
    for axes in figure.get_axes():
        axes.tick_params(axis='both', labelsize=labelsize)

fig,ax = plt.subplots()
lgb.plot_importance(gbm.best_estimator_.booster_,ax = ax,
                    importance_type='gain',
                    max_num_features=15)
ax.set_yticklabels(ax.get_yticklabels(),fontsize = 7)
plt.title('Variable Importance')
plt.show()

explainer = shap.TreeExplainer(model = gbm.best_estimator_,
                               data = None,
                               model_output = 'raw',
                               feature_perturbation='tree_path_dependent')
# Calling the explainer once yields the Explanation object that both plots
# consume; a second explainer.shap_values(...) call would only repeat the work.
shap_values = explainer(x_train)

# visualize the first prediction's explanation
shap.plots.waterfall(shap_values[0],show = False)
f1 = plt.gcf()
_shrink_tick_labels(f1)
plt.show()

# global view: distribution of SHAP values per feature
shap.plots.beeswarm(shap_values,show = False)
f2 = plt.gcf()
_shrink_tick_labels(f2)
plt.show()

In [None]:
# Predict on test set: rebuild exactly the same features for the test excerpts.
test_text = testSet['excerpt']

# Layout features: paragraphs are '\n'-separated, words ' '-separated, sentences '.'-separated.
testSet['Paragraphs'] = test_text.map(lambda text: len(text.split('\n')))
testSet['Words'] = test_text.map(lambda text: len(text.replace('\n', ' ').split(' ')))
testSet['AvgWordsPerPar'] = test_text.map(
    lambda text: np.mean([len(par.split(' ')) for par in text.split('\n')]))
testSet['AvgSentPerPar'] = test_text.map(
    lambda text: np.mean([len(par.split('.')) for par in text.split('\n')]))

# Flesch reading ease from average sentence length (ASL) and average syllables per word (ASW).
testSet['ASL'] = test_text.map(
    lambda text: np.mean([len(sent.split(' ')) for sent in text.replace('\n', '').split('.')]))
testSet['ASW'] = test_text.map(
    lambda text: np.mean([
        syllable_count(tok) if tok else 0
        for tok in text.replace('\n', '').replace('.', '').split(' ')
    ]))
testSet['FleschEase'] = 206.835 - (1.015 * testSet['ASL']) - (84.6 * testSet['ASW'])

# Stop-word features (stop_words is the upper-cased list built for the training set).
testSet['NonStopWords'] = test_text.map(
    lambda text: sum(tok not in stop_words for tok in text.upper().replace('\n', ' ').split(' ')))
testSet['StopWords'] = test_text.map(
    lambda text: sum(tok in stop_words for tok in text.upper().replace('\n', ' ').split(' ')))
testSet['ratio'] = testSet['NonStopWords'].div(testSet['StopWords'])

# Part-of-speech counts.
testSet['Nouns'] = test_text.map(NounCount)
testSet['Verbs'] = test_text.map(VerbCount)
testSet['Adjectives'] = test_text.map(AdjCount)
testSet['Pronouns'] = test_text.map(PronounCount)

# Clause / phrase structure and classic readability indices.
testSet['ClausePerSentence'] = test_text.map(
    lambda text: np.mean([IndependentClause(sent) for sent in text.replace('\n', '').split('.')]))
testSet[['AvgNP','AvgVP','AvgMissingVP','AvgAllPhrase']] = test_text.apply(Phrases)
testSet['GrunFog'] = test_text.map(GrunningFog)
testSet['SMOG'] = test_text.map(SMOG)

In [None]:
# Inspect the first ten test rows with their engineered features.
testSet.head(n=10)

In [None]:
# Score the test excerpts and write the submission file.
# Features are selected by the training column list so the model always sees
# the same columns in the same order it was fitted on.
y_pred = gbm.predict(testSet[x_train.columns])
# Built once and paired positionally with the ids, so the result does not
# depend on testSet having a default 0..n-1 index.
submission = pd.DataFrame({'id': testSet['id'], 'target': y_pred})
print(submission)
submission.to_csv('submission.csv',index = False)