In [None]:
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize 
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the NLTK data this notebook relies on. The tokenizers (sent_tokenize /
# word_tokenize) need the Punkt model and WordNetLemmatizer needs the WordNet
# corpus; 'stopwords' is kept for the currently disabled stop-word filter.
# NOTE(review): whether 'punkt' or 'punkt_tab', and whether 'omw-1.4', is
# required depends on the installed NLTK version, so all of them are requested.
for nltk_package in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

In [None]:
# Read the competition's training and test tables.
train_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

# Submission frame: starts with the test ids; the predicted 'target' column is
# attached after the model has been fitted.
result = test_df[['id']].copy()

# Keep only the columns used for modelling. The explicit .copy() makes these
# independent frames, so the later in-place assignments to the 'excerpt'
# column do not trigger pandas' SettingWithCopyWarning.
train_df = train_df[['excerpt', 'target']].copy()
test_df = test_df[['excerpt']].copy()

In [None]:
def preprocessing(text):
    """Normalise one excerpt into a space-joined string of lowercase lemmas.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lowercased, split into word tokens with NLTK's ``word_tokenize``
    and each token is passed through ``WordNetLemmatizer.lemmatize`` (called
    without a part-of-speech argument). Stop words are kept.

    Args:
        text: Raw excerpt as a ``str``.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # Digits, punctuation and non-ASCII characters all become spaces here, so
    # only letters ever reach the tokenizer. A separate punctuation filter
    # after tokenizing could never match and is therefore not needed.
    text = re.sub("[^a-zA-Z]", " ", text).lower()
    # word_tokenize performs sentence splitting itself; calling sent_tokenize
    # first would only run the sentence splitter a second time.
    tokens = word_tokenize(text)
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    return ' '.join(lemmatize(token) for token in tokens)

# Replace the raw excerpts of both splits with their preprocessed form.
for split_df in (train_df, test_df):
    split_df['excerpt'] = split_df['excerpt'].apply(preprocessing)

In [None]:
# Show the value in the second row, first column of the training frame.
train_df.iat[1, 0]

In [None]:
# Single-column views of each split, stacked (train rows first) into one frame.
train_text, test_text = train_df[['excerpt']], test_df[['excerpt']]
all_text = pd.concat((train_text, test_text))

In [None]:
# Fit the TF-IDF vocabulary on the combined train+test excerpts, then encode
# each split with that fitted vectorizer.
word_vectorizer = TfidfVectorizer().fit(all_text['excerpt'])
train_word_features, test_word_features = (
    word_vectorizer.transform(split_df['excerpt'])
    for split_df in (train_df, test_df)
)

In [None]:
# Conventional model-input names: TF-IDF matrices as features, the 'target'
# column of the training frame as the regression label.
X_train, X_test = train_word_features, test_word_features
y_train = train_df['target']

In [None]:
# Settings passed to the XGBoost regressor, gathered in one place.
xgb_params = dict(
    learning_rate=0.1,
    n_estimators=200,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    seed=10,
)
xgb_final = XGBRegressor(**xgb_params)

# Train on the training features, predict for the test features and store the
# predictions as the 'target' column of the submission frame.
xgb_final.fit(X_train, y_train)
predictions = xgb_final.predict(X_test)
result['target'] = predictions

In [None]:
# Preview the first five rows of the submission frame.
result.head(n=5)

In [None]:
# Write the submission file without the index column.
result.to_csv(path_or_buf="/kaggle/working/submission.csv", index=False)