In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import csv
import torch
import transformers
import sys
import os
import pickle
import spacy
import seaborn as sns
import textblob
import string
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings(action='ignore')


import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from sklearn.metrics import mean_squared_error
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.preprocessing import MinMaxScaler
from sklearn import model_selection
from sklearn import linear_model
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import sklearn
from sklearn.model_selection import train_test_split

In [None]:
# Vocabulary cap passed as `max_features` to every TfidfVectorizer built from TFIDF_settings.
TFIDF_MAX_FEATURES = 5000
# GloVe embedding configuration. NOTE(review): none of the visible cells read these
# four values yet — presumably they are used further down the notebook; confirm.
GLOVE_DIMENSIONS = 100
GLOVE_INPUT_PATH = "../input/glove6b"
GLOVE_OUTPUT_PATH = "./glove6b"
GLOVE_VERSION = "glove.6B.100d"

In [None]:
# Load the competition's train and test CSV files from the read-only Kaggle input directory.
full_dataset_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [None]:
# Quick sanity check of the training frame: column names, dtypes and non-null counts.
full_dataset_df.info()

In [None]:
# Add word-level and sentence-level NLTK tokenizations of every excerpt to both
# frames (training frame first, then the test frame).
for frame in (full_dataset_df, test_df):
    frame["tokenized_excerpt"] = frame["excerpt"].apply(word_tokenize)
    frame["sentences_in_excerpt"] = frame["excerpt"].apply(sent_tokenize)

In [None]:
# Shared word-level bag-of-words vectorizer; the generate_count_* helpers below read this global.
count_vectorizer = CountVectorizer(analyzer='word')

def generate_count_vectors(dataset):
    """Fit the shared ``count_vectorizer`` on the excerpts and return their count matrix.

    Every call re-fits the module-level ``count_vectorizer`` on
    ``dataset['excerpt']``, so its vocabulary always reflects the most recent
    dataset passed in.

    Parameters
    ----------
    dataset : mapping / DataFrame with an ``excerpt`` entry holding the raw texts.

    Returns
    -------
    The document-term matrix produced by the vectorizer for those same texts.
    """
    # Learning the vocabulary and encoding the same texts in one step.
    return count_vectorizer.fit_transform(dataset['excerpt'])

def generate_count_df(dataset, count_vectors):
    """Turn a count matrix into a dense DataFrame labelled by excerpt id and token.

    Parameters
    ----------
    dataset : DataFrame with an ``id`` column; its values become the row index.
    count_vectors : matrix exposing ``toarray()`` (as returned by
        ``generate_count_vectors``), one row per entry of ``dataset``.

    Returns
    -------
    pandas.DataFrame
        Dense counts with one column per vocabulary entry of the shared
        ``count_vectorizer``.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new accessor and fall back for old releases.
    feature_names_getter = getattr(count_vectorizer, "get_feature_names_out", None)
    if feature_names_getter is None:
        feature_names_getter = count_vectorizer.get_feature_names
    return pd.DataFrame(
        count_vectors.toarray(),
        index=dataset['id'].values,
        columns=feature_names_getter(),
    )

In [None]:
class TFIDF_settings:
    # Namespace of TF-IDF presets. Each nested class carries the three attributes
    # that generate_tfidf_vectors reads: analyzer, ngram_range and max_features.

    class word_level:
        # Single word tokens.
        analyzer = "word"
        ngram_range = (1, 1)
        max_features = TFIDF_MAX_FEATURES

    class ngram_level:
        # Word bigrams and trigrams.
        analyzer = "word"
        ngram_range = (2, 3)
        max_features = TFIDF_MAX_FEATURES

    class character_level:
        # Character bigrams and trigrams.
        analyzer = "char"
        ngram_range = (2, 3)
        max_features = TFIDF_MAX_FEATURES

        
def generate_tfidf_vectors(dataset, settings):
    """Fit a fresh TF-IDF vectorizer on the excerpts and return their TF-IDF matrix.

    Parameters
    ----------
    dataset : mapping / DataFrame with an ``excerpt`` entry holding the raw texts.
    settings : object exposing ``analyzer``, ``ngram_range`` and ``max_features``
        (for example one of the ``TFIDF_settings`` presets).

    Returns
    -------
    The TF-IDF matrix for ``dataset['excerpt']``. The fitted vectorizer itself
    is local to this call and is not returned.
    """
    vectorizer_options = {
        "analyzer": settings.analyzer,
        "ngram_range": settings.ngram_range,
        "max_features": settings.max_features,
    }
    # Learn the vocabulary/IDF weights and encode the same texts in one step.
    return TfidfVectorizer(**vectorizer_options).fit_transform(dataset['excerpt'])

In [None]:
# Build three TF-IDF views of the training excerpts: word unigrams, word 2-3-grams
# and character 2-3-grams (see TFIDF_settings for the exact presets).
# NOTE(review): none of the later visible cells consume these matrices — confirm they are still needed.
tfidf_vectors_word_level = generate_tfidf_vectors(dataset=full_dataset_df, settings=TFIDF_settings.word_level)
tfidf_vectors_ngram_level = generate_tfidf_vectors(dataset=full_dataset_df, settings=TFIDF_settings.ngram_level)
tfidf_vectors_character_level = generate_tfidf_vectors(dataset=full_dataset_df, settings=TFIDF_settings.character_level)

In [None]:
def add_experimental_features(dataset):
    """Add hand-crafted surface and part-of-speech count features to ``dataset``.

    The frame is modified in place (new columns are assigned on it) and the
    same object is returned, so ``df = add_experimental_features(df)`` and a
    bare ``add_experimental_features(df)`` are equivalent.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide an ``excerpt`` column of strings and a
        ``sentences_in_excerpt`` column of sized objects (``len`` is applied).

    Returns
    -------
    pandas.DataFrame
        ``dataset`` itself with these columns added: ``char_count``,
        ``word_count``, ``word_density``, ``sentence_count``,
        ``sentence_average_lenght``, ``punctuation_count``,
        ``title_word_count``, ``upper_case_word_count``, ``noun_count``,
        ``verb_count``, ``adj_count``, ``adv_count`` and ``pron_count``.

    Notes
    -----
    Part-of-speech tagging is best-effort: if tagging an excerpt raises, all of
    its POS counts stay at 0 and a one-line summary is printed after the run
    instead of the failure being silently discarded.
    """
    excerpts = dataset['excerpt']

    dataset['char_count'] = excerpts.apply(len)
    dataset['word_count'] = excerpts.apply(lambda x: len(x.split()))
    # The +1 in both denominators guards against division by zero.
    dataset['word_density'] = dataset['char_count'] / (dataset['word_count']+1)
    dataset['sentence_count'] = dataset['sentences_in_excerpt'].apply(len)
    # The misspelled column name is kept on purpose: other cells select it by this exact key.
    dataset['sentence_average_lenght'] = dataset['char_count'] / (dataset['sentence_count']+1)
    dataset['punctuation_count'] = excerpts.apply(lambda x: sum(ch in string.punctuation for ch in x))
    dataset['title_word_count'] = excerpts.apply(lambda x: sum(wrd.istitle() for wrd in x.split()))
    dataset['upper_case_word_count'] = excerpts.apply(lambda x: sum(wrd.isupper() for wrd in x.split()))

    # Penn Treebank tags grouped into the coarse families that get counted.
    pos_family = {
        'noun' : ['NN','NNS','NNP','NNPS'],
        'pron' : ['PRP','PRP$','WP','WP$'],
        'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
        'adj' :  ['JJ','JJR','JJS'],
        'adv' : ['RB','RBR','RBS','WRB']
    }
    tagging_errors = []

    def count_pos_families(text):
        """Tag ``text`` once and return ``{family: count}`` for every POS family."""
        counts = dict.fromkeys(pos_family, 0)
        try:
            tags = textblob.TextBlob(text).tags
        except Exception as exc:  # best-effort: keep zeros, but remember the failure
            tagging_errors.append(exc)
            return counts
        for tagged_word in tags:
            tag = tagged_word[1]
            for family, family_tags in pos_family.items():
                if tag in family_tags:
                    counts[family] += 1
        return counts

    # Tagging is the expensive step, so run it a single time per excerpt and
    # derive all five family counts from that one pass.
    pos_counts = pd.DataFrame(
        [count_pos_families(text) for text in excerpts],
        index=dataset.index,
        columns=list(pos_family),
    )
    for family in ('noun', 'verb', 'adj', 'adv', 'pron'):
        dataset[family + '_count'] = pos_counts[family].to_numpy()

    if tagging_errors:
        print("POS tagging failed for %d excerpt(s); their POS counts are left at 0. First error: %r"
              % (len(tagging_errors), tagging_errors[0]))

    return dataset
    
# Enrich the training frame with the hand-crafted features (the function assigns
# the new columns on the frame it receives and returns that same frame).
full_dataset_df = add_experimental_features(full_dataset_df)

In [None]:
# Same feature engineering for the test frame so both frames expose identical feature columns.
test_df = add_experimental_features(test_df)

In [None]:
# Preview only the first rows: the frame now carries token lists and many feature
# columns, so rendering all of it adds noise without adding information.
full_dataset_df.head()

In [None]:
# Hand-crafted numeric features fed to the linear models. The order is significant:
# the scaled arrays built later keep exactly this column order.
FEATURE_COLUMNS = [
    'char_count', 'word_count', 'word_density',
    'punctuation_count', 'title_word_count', 'upper_case_word_count',
    'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count',
    'sentence_count', 'sentence_average_lenght',
]

# Feature matrix / target for training and feature matrix for the test set,
# each re-indexed by the excerpt id of its source frame.
training_dataset_X = full_dataset_df.loc[:, FEATURE_COLUMNS].set_index(full_dataset_df["id"])
training_dataset_Y = full_dataset_df.loc[:, ['target']].set_index(full_dataset_df["id"])
test_dataset_X = test_df.loc[:, FEATURE_COLUMNS].set_index(test_df["id"])

In [None]:
# TODO (open question from the original author): is min-max the right scaler here?
min_max_scaler_X = MinMaxScaler()
min_max_scaler_Y = MinMaxScaler()

# Both scalers learn their ranges from the training data only.
training_dataset_X_scaled = min_max_scaler_X.fit(training_dataset_X).transform(training_dataset_X)
training_dataset_Y_scaled = min_max_scaler_Y.fit(training_dataset_Y).transform(training_dataset_Y)

# Test features reuse the ranges learned on the training features.
test_dataset_X_scaled = min_max_scaler_X.transform(test_dataset_X)

In [None]:
# The scalers return bare arrays; wrap them back into DataFrames that carry the
# excerpt ids as index and the original feature names as columns.
training_dataset_X_scaled = pd.DataFrame(
    training_dataset_X_scaled,
    index=full_dataset_df["id"],
    columns=training_dataset_X.columns,
)
test_dataset_X_scaled = pd.DataFrame(
    test_dataset_X_scaled,
    index=test_df["id"],
    columns=test_dataset_X.columns,
)

In [None]:
def tune_hyperparameters(model, X, y, hyperparameters_grid):
    """Grid-search ``model`` over ``hyperparameters_grid`` and return the fitted search.

    Parameters
    ----------
    model : estimator handed to ``GridSearchCV``.
    X, y : training features and targets passed to ``fit``.
    hyperparameters_grid : parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    The ``GridSearchCV`` object after fitting; callers read ``best_estimator_``
    and ``best_score_`` from it.
    """
    # fit() hands back the search object itself, so construction and fitting chain.
    return GridSearchCV(model, hyperparameters_grid).fit(X, y)

In [None]:
# Grid shared by the Lasso and Ridge searches: 80 evenly spaced alphas in [0, 4]
# crossed with 10 iteration caps (500, 700, ..., 2300).
# NOTE(review): the grid includes alpha=0, which the scikit-learn docs advise
# against for Lasso (use LinearRegression instead) — confirm this is intended.
parameters = {
    'alpha': np.linspace(0, 4, 80),
    'max_iter': np.arange(500, 2500, 200),
}

lasso_clf = tune_hyperparameters(
    linear_model.Lasso(), training_dataset_X_scaled, training_dataset_Y_scaled, parameters
)
ridge_clf = tune_hyperparameters(
    linear_model.Ridge(), training_dataset_X_scaled, training_dataset_Y_scaled, parameters
)

In [None]:
# Grid search over 50 evenly spaced alphas in [0, 5] for a LassoLars model.
# NOTE(review): the variable names say "elastic", but the estimator being tuned is
# linear_model.LassoLars, not ElasticNet — confirm which one was intended.
parameters_elastic = {"alpha" : np.linspace(0, 5, 50)}

elastic_clf = tune_hyperparameters(linear_model.LassoLars(), training_dataset_X_scaled, training_dataset_Y_scaled, parameters_elastic) 

In [None]:
# Report the winning estimator and its mean cross-validated score for each search.
for clf in (ridge_clf, lasso_clf, elastic_clf):
    print(clf.best_estimator_, clf.best_score_, sep="\n")

In [None]:
# Candidate regressors compared with 10-fold cross-validation. Each label names
# the estimator that is actually run, so the printed summary and the plot tick
# labels are truthful (the LassoLars model was previously labelled "ElasticNet").
# NOTE(review): alpha=0 disables the L1 penalty; the scikit-learn docs advise
# LinearRegression over Lasso(alpha=0) — kept because it is the tuned value.
models = [
    ("Linear", linear_model.LinearRegression()),
    ("Lasso", linear_model.Lasso(alpha=0)),
    ('Ridge', linear_model.Ridge(alpha=0.759)),
    # SGD shuffles the data internally; a fixed seed keeps its scores reproducible.
    ('SGD', linear_model.SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)),
    ("LassoLars", linear_model.LassoLars(alpha=0)),
]


def _root_mean_squared_error(y_true, y_pred):
    """Return the root of the mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# The scorer now really is an RMSE (it used to wrap the plain MSE). It is only
# used for reporting below, so the default "greater is better" sign is left
# alone; lower values are better when reading the output.
rmse_scorer = make_scorer(_root_mean_squared_error)

# evaluate each model in turn
results = []
names = []

# The splitter does not depend on the model, so build it once.
kfold = model_selection.KFold(n_splits=10)

for name, model in models:
    cv_results = model_selection.cross_val_score(
        model,
        training_dataset_X_scaled,
        training_dataset_Y_scaled.ravel(),
        cv=kfold,
        scoring=rmse_scorer,
    )
    results.append(cv_results)
    names.append(name)
    # Mean and standard deviation of the per-fold RMSE (on the min-max scaled target).
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

    # boxplot algorithm comparison
    
# Box plot of the per-fold cross-validation scores, one box per model.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
# Hold out 25% of the scaled training data for validation. The seed is fixed so
# that a Restart & Run All reproduces the same split (and the same scores below).
X_train, X_test, y_train, y_test = train_test_split(
    training_dataset_X_scaled,
    training_dataset_Y_scaled,
    test_size=0.25,
    random_state=42,
)

In [None]:
def validate_model(model):
    """Fit ``model`` on the hold-out split and return its MSE in original target units.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test`` and
    ``min_max_scaler_Y``. Predictions and true targets are both mapped back
    through ``min_max_scaler_Y.inverse_transform`` before the error is computed.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    The mean squared error between the de-normalized targets and predictions.
    """
    model.fit(X_train, y_train)
    # Some estimators (e.g. Lasso) return 1-D predictions even for a single-column
    # 2-D target, while the scaler only accepts 2-D input; normalise to a column.
    prediction = np.asarray(model.predict(X_test)).reshape(-1, 1)
    denormalized_prediction = min_max_scaler_Y.inverse_transform(prediction)
    denormalized_y = min_max_scaler_Y.inverse_transform(y_test)
    # scikit-learn's signature is (y_true, y_pred); MSE is symmetric, so the value is unchanged.
    return mean_squared_error(denormalized_y, denormalized_prediction)

In [None]:
# Hold-out error of the plain and the ridge-regularised linear models.
lin_result = validate_model(linear_model.LinearRegression())
ridge_result = validate_model(linear_model.Ridge(alpha=0.759, max_iter=500))

# Lasso stays disabled, as in the original run:
# lasso_result = validate_model(linear_model.Lasso())

print(lin_result, ridge_result)