In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition training data into a DataFrame.
df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')

In [None]:
# Preview the first five rows of the raw training data.
df.head(5)

In [None]:
# Summary statistics for the training data.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

We will drop the first three columns (`id`, `url_legal` and `license`), since they are not used as features.

In [None]:
# Discard the identifier and licensing columns.
df = df.drop(columns=['id', 'url_legal', 'license'])

In [None]:
# Preview the data after dropping the unused columns.
df.head()

In [None]:
# Look at the first excerpt before any cleaning.
df['excerpt'][0]

In [None]:
# remove the numbers
# tokenise the excerpt
# remove the stopwords
# expand the contractions
# lemmatization

In [None]:
# remove the numbers
# Python's str.replace treats its first argument as a literal string, so
# x.replace('\d+', '') never matched any digits. The pandas string accessor
# with regex=True applies the pattern as a regular expression instead.
df['excerpt'] = df['excerpt'].str.replace(r'\d+', '', regex=True)

In [None]:
# Look at the first excerpt after the number-removal step.
df['excerpt'][0]

In [None]:
# tokenise the excerpt
from nltk.tokenize import RegexpTokenizer
# The pattern r'\w+' keeps runs of word characters only, so punctuation
# (including apostrophes) is dropped and acts as a token boundary.
tokenizer=RegexpTokenizer(r'\w+')

In [None]:
# Lower-case every excerpt and split it into a list of word tokens.
df['excerpt'] = df['excerpt'].map(lambda text: tokenizer.tokenize(text.lower()))

In [None]:
#remove the stopwords
# NOTE(review): nltk is already imported by the tokenizer cell earlier in the
# notebook, so this install runs after nltk's first use; consider moving it
# to the top of the notebook.
!pip install nltk
import nltk
nltk.download('stopwords')  # fetch the stop-word corpus read on the next lines
from nltk.corpus import stopwords
# Stored as a set so the per-token membership test in the filter step is fast.
stop_words  = set(stopwords.words('english'))

In [None]:
# Keep only the tokens that are not English stop words.
df['excerpt'] = df['excerpt'].apply(
    lambda tokens: list(filter(lambda tok: tok not in stop_words, tokens))
)

In [None]:
# Look at the first excerpt's tokens after stop-word removal.
df['excerpt'][0]

In [None]:
# expand the contractions
# Install and import the third-party `contractions` package used in the next cell.
!pip install contractions
import contractions

In [None]:
# Run contractions.fix on every token of every excerpt.
# NOTE(review): the tokenizer pattern r'\w+' never yields apostrophes, so a word
# such as "don't" has already been split into separate tokens before this step.
# Presumably the expansion should run on the raw text before tokenizing - confirm.
df['excerpt'] = df['excerpt'].apply(lambda x: [contractions.fix(word) for word in x])

In [None]:
# Look at the first excerpt's tokens after the contraction step.
df['excerpt'][0]

In [None]:
# lemmatization

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer reads the WordNet corpus the first time it lemmatizes a word.
# The notebook only ever downloaded 'stopwords', so on a fresh environment the
# lemmatization step would fail with a LookupError. Fetch the corpus up front.
nltk.download('wordnet')
nltk.download('omw-1.4')  # additionally required by some NLTK releases

lemmatizer=WordNetLemmatizer()

In [None]:
def word_lemmatizer(text):
    """Lemmatize every token in `text` and join the results with single spaces.

    `text` is an iterable of string tokens; the return value is one string.
    """
    lemmas = map(lemmatizer.lemmatize, text)
    return ' '.join(lemmas)

In [None]:
# Turn each token list back into a single string of lemmatized words.
df['excerpt'] = df['excerpt'].apply(word_lemmatizer)

In [None]:
# Preview the first five rows after all cleaning steps.
df.head(5)

In [None]:
# Features are the cleaned excerpts; the regression target is the 'target' column.
x_train, y_train = df['excerpt'], df['target']

In [None]:
# Sanity check: features and targets should have the same number of rows.
print(x_train.shape)
y_train.shape

In [None]:
# Load the competition test data into a DataFrame.
test_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
def clean_excerpt(text):
    """Clean one raw excerpt following the notebook's cleaning plan.

    Steps: lower-case and tokenize, drop English stop words, run
    contractions.fix on each token, then lemmatize and re-join into a string.
    Returns the cleaned excerpt as a single space-separated string.
    """
    tokens = tokenizer.tokenize(text.lower())
    tokens = [tok for tok in tokens if tok not in stop_words]
    tokens = [contractions.fix(tok) for tok in tokens]
    return word_lemmatizer(tokens)


# The models are fitted on cleaned, lemmatized training excerpts. Feeding raw
# test text would leave inflected words and stop words unmatched against the
# fitted TF-IDF vocabulary, so the test excerpts get the same treatment here
# (digit runs are stripped first, then the token-level steps are applied).
x_test = (
    test_df['excerpt']
    .str.replace(r'\d+', '', regex=True)
    .apply(clean_excerpt)
)

In [None]:
# Linear Regression
# Bayesian Regression
# Support Vector Machine
# Nearest neighbour regression
# Decision Tree Regressor

In [None]:
import joblib

In [None]:
# Linear Regression
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features (terms in more than 85% of documents are ignored) feeding
# an ordinary least-squares regressor.
pipeline_1 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(norm='l2', max_df=0.85)),
    ('ligreg', LinearRegression()),
])

In [None]:
# Fit the TF-IDF + linear regression pipeline on the training excerpts and targets.
pipeline_1.fit(x_train, y_train)

In [None]:
#save the pipeline
filename = 'submission.csv'
joblib.dump(pipeline_1, filename)

In [None]:
model_1 = joblib.load('submission.csv')
y_pred = model_1.predict(x_test)
print(y_pred)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
class DenseTransformer(TransformerMixin, BaseEstimator):
    """Pipeline step that converts a sparse feature matrix into a dense array.

    Placed between a vectorizer that emits a sparse matrix and an estimator
    that only accepts dense input. Inheriting from BaseEstimator provides
    get_params/set_params so the step behaves like a regular scikit-learn
    estimator inside a Pipeline.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return self so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return X as a dense numpy.ndarray."""
        # toarray() yields an ndarray, whereas todense() yields numpy.matrix,
        # which recent scikit-learn versions refuse as estimator input.
        if hasattr(X, "toarray"):
            return X.toarray()
        # Already-dense input is passed through as an ndarray.
        return np.asarray(X)

In [None]:
# Bayesian Regression
from sklearn.linear_model import BayesianRidge
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features are converted to a dense array before reaching the
# Bayesian ridge regressor.
pipeline_2 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(norm='l2', max_df=0.85)),
    ('to_dense', DenseTransformer()),
    ('basreg', BayesianRidge()),
])

In [None]:
# Fit the TF-IDF + Bayesian ridge pipeline on the training excerpts and targets.
pipeline_2.fit(x_train, y_train)

In [None]:
#save the pipeline
# Serialize the fitted pipeline with joblib so it can be reloaded later.
filename = 'pipeline_2.sav'
joblib.dump(pipeline_2, filename)

In [None]:
# Reload the saved pipeline and predict targets for the test excerpts.
model_2 = joblib.load('pipeline_2.sav')
y_pred_2 = model_2.predict(x_test)
print(y_pred_2)

In [None]:
# Support Vector Machine
from sklearn import svm
from sklearn.svm import LinearSVR

# TF-IDF features feeding a linear support vector regressor.
# random_state is fixed because LinearSVR shuffles the data with a pseudo-random
# generator during optimisation; without a seed, repeated runs of the notebook
# can produce different fitted models and predictions.
pipeline_3 = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.85, norm='l2')),
    ('LinearSvr', svm.LinearSVR(random_state=42)),
])

In [None]:
# Fit the TF-IDF + linear SVR pipeline on the training excerpts and targets.
pipeline_3.fit(x_train, y_train)

In [None]:
#save the pipeline
# Serialize the fitted pipeline with joblib so it can be reloaded later.
filename = 'pipeline_3.sav'
joblib.dump(pipeline_3, filename)

In [None]:
# Reload the saved pipeline and predict targets for the test excerpts.
model_3 = joblib.load('pipeline_3.sav')
y_pred_3 = model_3.predict(x_test)
print(y_pred_3)

In [None]:
# Nearest neighbour regression
from sklearn import neighbors
from sklearn.neighbors import KNeighborsRegressor

# TF-IDF features feeding a k-nearest-neighbours regressor with default settings.
pipeline_4 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(norm='l2', max_df=0.85)),
    ('KNN', KNeighborsRegressor()),
])

In [None]:
# Fit the TF-IDF + k-nearest-neighbours pipeline on the training excerpts and targets.
pipeline_4.fit(x_train, y_train)

In [None]:
#save the pipeline
# Serialize the fitted pipeline with joblib so it can be reloaded later.
filename = 'pipeline_4.sav'
joblib.dump(pipeline_4, filename)

In [None]:
# Reload the saved pipeline and predict targets for the test excerpts.
model_4 = joblib.load('pipeline_4.sav')
y_pred_4 = model_4.predict(x_test)
print(y_pred_4)

In [None]:
# Decision Tree Regressor
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor

# TF-IDF features feeding a decision tree regressor.
# random_state is fixed because the tree builder permutes features at each split;
# without a seed, ties between equally good splits can be broken differently on
# every run, so the fitted tree and its predictions would not be reproducible.
pipeline_5 = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.85, norm='l2')),
    ('DTR', tree.DecisionTreeRegressor(random_state=42)),
])

In [None]:
# Fit the TF-IDF + decision tree pipeline on the training excerpts and targets.
pipeline_5.fit(x_train, y_train)

In [None]:
#save the pipeline
# Serialize the fitted pipeline with joblib so it can be reloaded later.
filename = 'pipeline_5.sav'
joblib.dump(pipeline_5, filename)

In [None]:
# Reload the saved pipeline and predict targets for the test excerpts.
model_5 = joblib.load('pipeline_5.sav')
y_pred_5 = model_5.predict(x_test)
print(y_pred_5)