## CommonLit Readability: simple preprocessing and models to start with

In [None]:
# import libraries
import gzip
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Kaggle mounts the competition data read-only under "../input/".
# Running this cell (Run button or Shift+Enter) prints the full path of every file found there.

import os
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
#!kaggle competitions download -c commonlitreadabilityprize

In [None]:
# Load the competition's training CSV and preview the first rows.
train = pd.read_csv(
    filepath_or_buffer='../input/commonlitreadabilityprize/train.csv',
)
train.head()

In this notebook we will use only the `excerpt` (X) and `target` (y) columns, so at this stage we don't need the other columns.

In [None]:
# Features (X) are the raw excerpts; the label (y) is the `target` column.
X, y = train['excerpt'], train['target']

In [None]:
# Load the competition's test CSV (the rows to predict for the submission) and preview it.
test = pd.read_csv(
    filepath_or_buffer='../input/commonlitreadabilityprize/test.csv',
)
test.head()

In [None]:
# Hold out 30% of the labelled excerpts for evaluation; the fixed seed keeps the split repeatable.
# Note: X_test / y_test are this hold-out part of train.csv, not the competition `test` frame.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Accumulator for the hold-out score of each model; every scoring cell below appends one entry.
scores = list()

**For all models we will use TfidfVectorizer(), an algorithm that converts a collection of text documents into a matrix of TF-IDF features. In this case we keep only words that occur in at least 3 documents (`min_df=3`).**

## Classifier #1 
## with TfidfVectorizer and LinearRegression

In [None]:
# Model 1: TF-IDF features (terms present in at least 3 documents) fed into ordinary least squares.
# NOTE: despite the "logreg" label, this step is LinearRegression, not logistic regression.
classifier_1 = Pipeline(steps=[
    ("vect", TfidfVectorizer(min_df=3)),
    ("logreg", LinearRegression()),
])

In [None]:
# Train the whole pipeline (vectorizer + regressor) on the training split.
classifier_1.fit(X=X_train, y=y_train)

In [None]:
# Evaluate on the hold-out split; scikit-learn regressors report R^2 from `score`.
# The value is also recorded in the shared `scores` list.
score_1 = classifier_1.score(X_test, y_test)
scores += [score_1]
score_1

## Classifier #2 
## with TfidfVectorizer (+ n-grams) and LinearRegression

In [None]:
# Model 2: same as model 1, but the TF-IDF vocabulary covers unigrams and bigrams.
# NOTE: despite the "logreg" label, this step is LinearRegression, not logistic regression.
classifier_2 = Pipeline(steps=[
    ("vect", TfidfVectorizer(ngram_range=(1, 2), min_df=3)),
    ("logreg", LinearRegression()),
])

In [None]:
# Train the whole pipeline (vectorizer + regressor) on the training split.
classifier_2.fit(X=X_train, y=y_train)

In [None]:
# Evaluate on the hold-out split; scikit-learn regressors report R^2 from `score`.
# The value is also recorded in the shared `scores` list.
score_2 = classifier_2.score(X_test, y_test)
scores += [score_2]
score_2

## Classifier #3
## with TfidfVectorizer and Ridge Regression

In [None]:
# Model 3: TF-IDF features (terms present in at least 3 documents) fed into Ridge with its default settings.
classifier_3 = Pipeline(steps=[
    ("vect", TfidfVectorizer(min_df=3)),
    ("ridgereg", Ridge()),
])

In [None]:
# Train the whole pipeline (vectorizer + regressor) on the training split.
classifier_3.fit(X=X_train, y=y_train)

In [None]:
# Evaluate on the hold-out split; scikit-learn regressors report R^2 from `score`.
# The value is also recorded in the shared `scores` list.
score_3 = classifier_3.score(X_test, y_test)
scores += [score_3]
score_3

## Classifier #4
## with TfidfVectorizer and AdaBoostRegressor

In [None]:
# define the classifier
# Model 4: TF-IDF features fed into AdaBoost with Ridge base learners.
# AdaBoostRegressor draws random samples during boosting, so without a fixed
# seed the fitted model and its score differ between runs. The seed matches the
# one used for the train/hold-out split.
# NOTE: the step keeps its original "ridgereg" label although the estimator is
# AdaBoostRegressor wrapping Ridge.
classifier_4 = Pipeline([
    ("vect", TfidfVectorizer(min_df=3)),
    ("ridgereg", AdaBoostRegressor(Ridge(), random_state=42))
])

In [None]:
# Train the whole pipeline (vectorizer + regressor) on the training split.
classifier_4.fit(X=X_train, y=y_train)

In [None]:
# Evaluate on the hold-out split; scikit-learn regressors report R^2 from `score`.
# The value is also recorded in the shared `scores` list.
score_4 = classifier_4.score(X_test, y_test)
scores += [score_4]
score_4

## Classifier #5
## with TfidfVectorizer and Ridge Regression (with alpha 0.95)

In [None]:
# Model 5: same as model 3, but with the Ridge regularisation strength set to alpha=0.95.
classifier_5 = Pipeline(steps=[
    ("vect", TfidfVectorizer(min_df=3)),
    ("ridgereg", Ridge(alpha=0.95)),
])

In [None]:
# Train the whole pipeline (vectorizer + regressor) on the training split.
classifier_5.fit(X=X_train, y=y_train)

In [None]:
# Evaluate on the hold-out split; scikit-learn regressors report R^2 from `score`.
# The value is also recorded in the shared `scores` list.
score_5 = classifier_5.score(X_test, y_test)
scores += [score_5]
score_5

## Classifier #6
## with TfidfVectorizer and RandomForest

In [None]:
# define the classifier
# Model 6: TF-IDF features fed into a random forest.
# The forest is built from random bootstrap samples, so without a fixed seed the
# fitted model and its score differ between runs. The seed matches the one used
# for the train/hold-out split.
classifier_6 = Pipeline([
    ("vect", TfidfVectorizer(min_df=3)),
    ("randomforest", RandomForestRegressor(random_state=42))
])

In [None]:
# Train the whole pipeline (vectorizer + regressor) on the training split.
classifier_6.fit(X=X_train, y=y_train)

In [None]:
# Evaluate on the hold-out split; scikit-learn regressors report R^2 from `score`.
# The value is also recorded in the shared `scores` list.
score_6 = classifier_6.score(X_test, y_test)
scores += [score_6]
score_6

## Classifier #7
## with TfidfVectorizer and XGBRegressor

In [None]:
# Model 7: TF-IDF features fed into gradient-boosted trees (500 rounds, learning rate 0.05).
classifier_7 = Pipeline(steps=[
    ("vect", TfidfVectorizer(min_df=3)),
    ("XGBRegressor", XGBRegressor(learning_rate=0.05, n_estimators=500)),
])

In [None]:
# Train the whole pipeline (vectorizer + regressor) on the training split.
classifier_7.fit(X=X_train, y=y_train)

In [None]:
# Evaluate on the hold-out split; scikit-learn regressors report R^2 from `score`.
# The value is also recorded in the shared `scores` list.
score_7 = classifier_7.score(X_test, y_test)
scores += [score_7]
score_7

## Final results and model selection

In [None]:
# take a look at all results in one place
algorithms = ['LinearRegression','LinearRegression (n-grams)','Ridge Regression','AdaBoostRegressor','Ridge Regression (with alpha 0.95)','RandomForest','XGBRegressor']
# Read each model's score from its own variable rather than from the shared
# `scores` list. That list gains an extra entry whenever a scoring cell is
# re-run, after which it no longer lines up with the 7 labels above and the
# DataFrame constructor fails with a length mismatch.
model_scores = [score_1, score_2, score_3, score_4, score_5, score_6, score_7]
scores_table = pd.DataFrame({'Algorithm' : algorithms, 'Score' : model_scores})
scores_table

In [None]:
# Predict the competition test excerpts with model 5 (Ridge, alpha 0.95),
# using the pipeline as fitted on the training split.
pred_test = classifier_5.predict(X=test['excerpt'])

In [None]:
# Assemble the id/target pairs, write them to the Kaggle working directory
# without the index column, and preview the first rows.
submission = pd.DataFrame(data={'id': test['id'], 'target': pred_test})
submission.to_csv(path_or_buf='/kaggle/working/submission.csv', index=False)
submission.head()