In [None]:
! pip install --no-index --no-deps   ../input/textstat-pypi/textstat-0.7.0-py3-none-any.whl
! pip install --no-index --no-deps   ../input/textstat-pypi/Pyphen-0.9.3-py2.py3-none-any.whl 

In [None]:
import pandas as pd
import math
import spacy
from textstat.textstat import textstatistics, legacy_round
import numpy as np

In [None]:
## Load the spaCy English pipeline and the competition CSV files
nlp = spacy.load('en_core_web_sm')
sample_df = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

In [None]:
# nlp features
def nlp_text(text):
    """Parse ``text`` with the spaCy ``en_core_web_sm`` pipeline.

    The pipeline is loaded on the first call only and then stored on the
    function object, so repeated calls no longer reload the model each time.

    Parameters
    ----------
    text : str
        Raw text to parse.

    Returns
    -------
    The object produced by calling the loaded pipeline on ``text``.
    """
    pipeline = getattr(nlp_text, "_pipeline", None)
    if pipeline is None:
        # Loading the model is the expensive part: do it once and reuse it.
        pipeline = spacy.load('en_core_web_sm')
        nlp_text._pipeline = pipeline
    return pipeline(text)

def break_sentences(doc):
    """Return the items of ``doc.sents`` collected into a list.

    Parameters
    ----------
    doc :
        Any object exposing an iterable ``sents`` attribute.

    Returns
    -------
    list
        The sentences in iteration order.
    """
    sentences = [sentence for sentence in doc.sents]
    return sentences

# Returns the total number of tokens across the given sentences
def word_count(sentences):
    """Count the items obtained by iterating every sentence in ``sentences``.

    Every item yielded by a sentence counts as one word.
    NOTE(review): for spaCy sentence spans this presumably includes
    punctuation tokens - confirm whether that is intended.

    Parameters
    ----------
    sentences : iterable
        Iterable of sentences, each itself iterable over its tokens.

    Returns
    -------
    int
        Total number of tokens; 0 when ``sentences`` is empty.
    """
    return sum(1 for sentence in sentences for _token in sentence)

# Returns the number of sentences in the text
def sentence_count(sentences):
    """Return how many sentences ``sentences`` holds.

    Parameters
    ----------
    sentences : sized collection
        Collection of sentences; must support ``len``.

    Returns
    -------
    int
        ``len(sentences)``.
    """
    total = len(sentences)
    return total

# Returns average sentence length
def avg_sentence_length(sentences):
    """Return the mean number of tokens per sentence.

    Parameters
    ----------
    sentences : sized collection
        Collection of sentences, each iterable over its tokens.

    Returns
    -------
    float
        ``word_count(sentences) / sentence_count(sentences)``, or ``0.0``
        when there are no sentences (previously a ZeroDivisionError).
    """
    n_sentences = sentence_count(sentences)
    if n_sentences == 0:
        # Nothing to average over: avoid dividing by zero.
        return 0.0
    return float(word_count(sentences) / n_sentences)

# Returns max sentence length
def max_sentence_length(sentences):
    """Return the number of tokens in the longest sentence.

    Parameters
    ----------
    sentences : iterable
        Iterable of sentences, each itself iterable over its tokens.

    Returns
    -------
    int
        Token count of the longest sentence, or 0 when there are none.
    """
    # Count the tokens of each sentence directly. The previous version passed
    # str(sentence) to the token counter, which iterates over the characters
    # of that string and therefore measured character length, not word count.
    return max(
        (sum(1 for _token in sentence) for sentence in sentences),
        default=0,
    )

# Returns the syllable count of the given word or text
def syllables_count(word):
    """Return the syllable count textstat reports for ``word``.

    A single ``textstatistics`` object is created on first use and reused
    afterwards, instead of constructing a new one for every call (this
    function is called once per token by ``difficult_words``).

    Parameters
    ----------
    word : str
        A word, or a longer piece of text.

    Returns
    -------
    The value returned by ``textstatistics().syllable_count(word)``.
    """
    counter = getattr(syllables_count, "_counter", None)
    if counter is None:
        counter = textstatistics()
        syllables_count._counter = counter
    return counter.syllable_count(word)

# Returns the average number of syllables per
# word in the text
def avg_syllables_per_word(text,sentences):
    """Return the average number of syllables per token, rounded to 1 decimal.

    Parameters
    ----------
    text : str
        Raw text whose syllables are counted with ``syllables_count``.
    sentences : iterable
        The sentences of that text, used to count its tokens with
        ``word_count``.

    Returns
    -------
    float
        ``legacy_round(syllables / tokens, 1)``, or ``0.0`` when there are
        no tokens (previously a ZeroDivisionError).
    """
    words = word_count(sentences)
    if words == 0:
        # No tokens to average over: avoid dividing by zero.
        return 0.0
    syllables = syllables_count(text)
    return legacy_round(float(syllables) / float(words), 1)

# Returns the average number of characters per token
def avg_word_length_count(sentences):
    """Return the average character length of the tokens in ``sentences``.

    Each token is converted with ``str`` and its length measured; the mean is
    taken over all tokens of all sentences.

    Parameters
    ----------
    sentences : iterable
        Iterable of sentences, each itself iterable over its tokens.

    Returns
    -------
    float
        Mean token length, or ``0.0`` when there are no tokens (previously a
        ZeroDivisionError).
    """
    lengths = [len(str(token)) for sentence in sentences for token in sentence]
    if not lengths:
        return 0.0
    # len(lengths) is the token count, so no second pass over the sentences
    # is needed to obtain the denominator.
    return sum(lengths) / len(lengths)

# Returns the number of distinct non-stop-word tokens with at least `difficulty_set` syllables
def difficult_words(sentences, nlp,difficulty_set):
    """Count the distinct "difficult" words in ``sentences``.

    A word is difficult when it is not in ``nlp.Defaults.stop_words`` and
    ``syllables_count`` reports at least ``difficulty_set`` syllables for it.

    Tokens are lowercased before the stop-word lookup and before
    de-duplication. The lookup is an exact string match, so without this a
    capitalised occurrence of a stop word would not be filtered, and the same
    word in two casings would be counted twice.

    Parameters
    ----------
    sentences : iterable
        Iterable of sentences, each itself iterable over its tokens.
    nlp :
        Object exposing ``Defaults.stop_words`` (a container of words to skip).
    difficulty_set : int
        Minimum syllable count for a word to be difficult (callers use 2 or 3).

    Returns
    -------
    int
        Number of distinct difficult words.
    """
    stop_words = nlp.Defaults.stop_words
    # De-duplicate first so syllables are computed once per distinct word.
    candidates = {
        str(token).lower()
        for sentence in sentences
        for token in sentence
    }
    return sum(
        1
        for word in candidates
        if word not in stop_words and syllables_count(word) >= difficulty_set
    )

In [None]:
train_data=[]
for text in train_df.excerpt:
    doc = nlp(text)
    sentences = list(doc.sents)
    total_sentence = len(sentences)
    avg_sentence_len = avg_sentence_length(sentences)
    # max_sentence_len = max_sentence_length(text)
    total_sylablle = syllables_count(text)
    total_words = word_count(sentences)
    avg_syllable = float(total_sylablle) / float(total_words) #avg_syllables_per_word(text,sentences)
    avg_letter = avg_word_length_count(sentences)
    total_polysyllables = difficult_words(sentences, nlp,2)
    dif_word_3=difficult_words(sentences, nlp,3)
    #dif_word_4=difficult_words(text,4)
    fun_smog_index=1.043*math.sqrt(total_polysyllables*30/total_sentence)+3.129
    fun_kincaid=0.39*total_words/total_sentence+11.8*total_polysyllables/total_words-15.59
    fun_readability=4.71*avg_letter+0.5*total_words/total_sentence-21.4
    fun_coleman_liau=5.88*avg_letter+29.6*total_sentence/total_words-15.8
    fun_dale_call=15.8*dif_word_3/total_words+0.0496*total_words/total_sentence
    fun_flesch=206.8-1.02*total_words/total_sentence-84.6*total_sylablle/total_words
    fun_gunning_fog=0.4*total_words/total_sentence-100*dif_word_3/total_words

    text_feature=[total_sentence,avg_sentence_len,total_sylablle,total_words,avg_syllable,avg_letter,total_polysyllables,dif_word_3,fun_smog_index,fun_kincaid,fun_readability,fun_coleman_liau,fun_dale_call,fun_flesch,fun_gunning_fog]
    train_data.append(text_feature)
print("All success!")

In [None]:
test_data=[]
for text in test_df.excerpt:
    doc = nlp(text)
    sentences = list(doc.sents)
    total_sentence = len(sentences)
    avg_sentence_len = avg_sentence_length(sentences)
    # max_sentence_len = max_sentence_length(text)
    total_sylablle = syllables_count(text)
    total_words = word_count(sentences)
    avg_syllable = float(total_sylablle) / float(total_words) #avg_syllables_per_word(text,sentences)
    avg_letter = avg_word_length_count(sentences)
    total_polysyllables = difficult_words(sentences, nlp,2)
    dif_word_3=difficult_words(sentences, nlp,3)
    #dif_word_4=difficult_words(text,4)
    fun_smog_index=1.043*math.sqrt(total_polysyllables*30/total_sentence)+3.129
    fun_kincaid=0.39*total_words/total_sentence+11.8*total_polysyllables/total_words-15.59
    fun_readability=4.71*avg_letter+0.5*total_words/total_sentence-21.4
    fun_coleman_liau=5.88*avg_letter+29.6*total_sentence/total_words-15.8
    fun_dale_call=15.8*dif_word_3/total_words+0.0496*total_words/total_sentence
    fun_flesch=206.8-1.02*total_words/total_sentence-84.6*total_sylablle/total_words
    fun_gunning_fog=0.4*total_words/total_sentence-100*dif_word_3/total_words

    text_feature=[total_sentence,avg_sentence_len,total_sylablle,total_words,avg_syllable,avg_letter,total_polysyllables,dif_word_3,fun_smog_index,fun_kincaid,fun_readability,fun_coleman_liau,fun_dale_call,fun_flesch,fun_gunning_fog]
    test_data.append(text_feature)
print("success!")

In [None]:
# Design matrix (one feature row per training excerpt), targets, test matrix.
X = np.array(train_data)
Y = np.array(train_df['target'])
X_test = np.array(test_data)

# Ordinary least-squares fit of Y on X.
# Several feature columns are exact linear combinations of others (for
# example the fun_readability and fun_flesch columns are both affine functions
# of columns that are already present), so X.T @ X is singular or nearly so.
# Solving the normal equations with np.linalg.solve then either raises
# LinAlgError or returns numerically unstable coefficients. lstsq solves the
# same least-squares problem directly on X and returns the minimum-norm
# solution when X is rank-deficient.
beta = np.linalg.lstsq(X, Y, rcond=None)[0]

In [None]:
# Score every test row with the fitted coefficients, echoing each value.
predictions = []
for feature_row in X_test:
    score = np.dot(feature_row, beta)
    print(score)
    predictions.append(score)


In [None]:
# Write the predictions into the submission template and save it.
# Item assignment is used on purpose: attribute assignment
# (sample_df.target = ...) only updates a column that already exists and
# otherwise just sets a plain Python attribute, leaving the CSV without it.
sample_df["target"] = predictions
print(sample_df)
sample_df.to_csv("submission.csv", index=False)