In [None]:
import numpy as np 
import pandas as pd 
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, Input, Dropout
from tensorflow.python.keras.layers.wrappers import TimeDistributed
from tensorflow.python.keras.layers.recurrent import LSTM

In [None]:
# Load the three competition tables: the training set, the test set and the
# sample submission file.
df_train, df_test, df_sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/commonlitreadabilityprize/train.csv",
        "../input/commonlitreadabilityprize/test.csv",
        "../input/commonlitreadabilityprize/sample_submission.csv",
    )
)

# Visualize some data

In [None]:
# Display the raw excerpt text of the training row labelled 0.
df_train.loc[0, 'excerpt']

In [None]:
# Drop the url_legal / license columns; none of the cells shown below read them.
# errors='ignore' keeps this cell idempotent: re-running it once the columns
# are already gone no longer raises a KeyError.
df_train = df_train.drop(columns=['url_legal', 'license'], errors='ignore')
df_train.head()

In [None]:
# Drop the url_legal / license columns from the test set as well.
# errors='ignore' keeps this cell idempotent: re-running it once the columns
# are already gone no longer raises a KeyError.
df_test = df_test.drop(columns=['url_legal', 'license'], errors='ignore')
df_test.head()

In [None]:
# Longest excerpt in each split, measured in characters (len of the raw string).
max_length_training = max(len(text) for text in df_train["excerpt"])
max_length_testing = max(len(text) for text in df_test["excerpt"])

print("Max length of the sentences :")
print("Training : ", max_length_training, " - Testing : ", max_length_testing)

# Preprocess the data

In order to preprocess the data, we are going to :

- Word tokenization: break each excerpt down into the words that compose it.
- Lowercasing: normalize each word to lower case.
- Remove punctuation and digits.
- Remove stop words: drop very common words that carry little meaning.
- Stemming: reduce each word to its stem, the root form of the word. (Example: fishing, fished, fisher => fish)
- Lemmatization: reduce each word to its lemma.

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# NLTK resources shared by every call to preprocess_text (built once per kernel).
stop_words = set(stopwords.words("english"))
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Turn a raw excerpt into a space-separated string of normalized words.

    Each token produced by ``word_tokenize`` is lowercased, kept only if it is
    purely alphabetic and not an English stop word, then Porter-stemmed and
    finally passed through the WordNet lemmatizer.

    Parameters
    ----------
    text : str
        The raw excerpt.

    Returns
    -------
    str
        The surviving, normalized words joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # Skip punctuation / digits / mixed tokens and stop words.
        if lowered.isalpha() and lowered not in stop_words:
            # Stem first, then lemmatize the stem (same order as before).
            kept.append(lemmatizer.lemmatize(porter.stem(lowered)))
    return " ".join(kept)

In [None]:
# Store the cleaned text in a new 'preprocess_text' column on both splits.
for frame in (df_train, df_test):
    frame['preprocess_text'] = frame['excerpt'].apply(preprocess_text)

In [None]:
# Preview the first rows of the training frame, including the preprocess_text column.
df_train.head()

# TF-IDF 

The idea of TF-IDF (term frequency-inverse document frequency) is to describe a document by its vocabulary. For example, we can assume that the more complex the vocabulary used, the more complex the document is likely to be. TF-IDF combines two quantities:

- Term frequency: for a given document, the number of occurrences of the word in that document.
- Inverse document frequency: indicates how common or rare a word is in the entire document set. The closer it is to 0, the more common the word is. It can be computed by dividing the total number of documents by the number of documents that contain the word, and taking the logarithm of the result. So if a word is very common across the documents, we don't really want to keep it.

We get a TF-IDF score by multiplying these two results.

* https://monkeylearn.com/blog/what-is-tf-idf/

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error

In [None]:
from sklearn.model_selection import train_test_split

# Learn the vocabulary and IDF weights from the train and test texts together,
# then vectorize each split with that same fitted vectorizer.
X_all = pd.concat([df_train["preprocess_text"], df_test["preprocess_text"]])
tfidf = TfidfVectorizer(stop_words='english').fit(X_all)

X, X_test = (
    tfidf.transform(frame["preprocess_text"]) for frame in (df_train, df_test)
)

# Hold out 10% of the training rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    df_train["target"],
    test_size=0.1,
    random_state=42,
)

## SVM - Hyperparameters of SVR 

In this approach, we are going to use an SVM. In order to tune this model, we need to understand its hyperparameters.

> Note : For a regression, we use SVR, but for a classification, we have to use SVC !

- The C parameter adds a penalty for each training error (a misclassified data point in classification; in SVR, a point whose prediction falls outside the epsilon margin). If C is small, the penalty is low, so a decision boundary with a large margin is chosen at the expense of a greater number of errors.

- Gamma controls the distance of influence of a single training point. Low values indicate a large similarity radius, which results in more points being grouped together. For high values of gamma, the points need to be very close to each other to be considered in the same group.

- Epsilon defines a margin of tolerance within which no penalty is given to errors.


0.1 < C < 100

0.0001 < gamma < 10

* https://towardsdatascience.com/hyperparameter-tuning-for-support-vector-machines-c-and-gamma-parameters-6a5097416167
* https://stats.stackexchange.com/questions/259018/meaning-of-epsilon-in-svm-regression

In [None]:
from sklearn.svm import SVR

# Set to True to run the full hyperparameter grid search instead of reusing
# the best combination found so far.
FULL_PARAMETER_SEARCH = False

# Best combination obtained so far from the full parameter search:
# {'C': 100, 'epsilon': 0.1, 'gamma': 0.005, 'kernel': 'rbf'}
parameters = (
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.0001, 0.001, 0.01, 0.1],
        'gamma': [0.0001, 0.001, 0.005, 0.1, 1, 3, 5],
    }
    if FULL_PARAMETER_SEARCH
    else {
        'kernel': ['rbf'],
        'C': [100],
        'epsilon': [0.1],
        'gamma': [0.005],
    }
)

In [None]:
# With the full search, tune on the training split and score on the held-out
# validation split; otherwise fit on all of the labelled data.
if FULL_PARAMETER_SEARCH:
    fit_features, fit_targets = X_train, y_train
else:
    fit_features, fit_targets = X, df_train["target"]

search = GridSearchCV(
    estimator=SVR(),
    param_grid=parameters,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
model = search.fit(fit_features, fit_targets)

if FULL_PARAMETER_SEARCH:
    # Mean squared error on the rows held out from the search.
    y_val_pred = model.predict(X_val)
    print("Error on validation set : ", mean_squared_error(y_val, y_val_pred))

# Parameter combination selected by the search.
print(model.best_params_)

In [None]:
# Predict the test targets with the fitted search and write the submission file.
y_pred = model.predict(X_test)
df_sample = df_sample.assign(target=y_pred)
df_sample.to_csv("submission.csv", index=False)

## Bayesian Ridge model

With our TF-IDF representation, we can use models other than SVM. Here we use Bayesian Ridge.

In [None]:
from sklearn.linear_model import BayesianRidge

# Fit on the training split (converted to a dense array for fit), then report
# the mean squared error on the held-out validation split.
clf = BayesianRidge().fit(X_train.toarray(), y_train)

y_val_pred = clf.predict(X_val)
val_error = mean_squared_error(y_val, y_val_pred)

print("Error on validation : ", val_error)

In [None]:
# Re-read the sample submission file to hold the Bayesian Ridge predictions.
df_sample_bay = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

# Refit on every labelled row, then predict the test set.
clf = BayesianRidge().fit(X.toarray(), df_train["target"])
y_pred = clf.predict(X_test)

# Store the predictions and save the file.
df_sample_bay = df_sample_bay.assign(target=y_pred)
df_sample_bay.to_csv("submission_bayes.csv", index=False)