In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import re
import string
import xgboost
from xgboost import XGBRegressor

In [None]:
# All competition files live in one input directory; build each path from it.
_INPUT_DIR = "../input/commonlitreadabilityprize"
train_data_file = _INPUT_DIR + "/train.csv"
test_data_file = _INPUT_DIR + "/test.csv"
sample_submission_file = _INPUT_DIR + "/sample_submission.csv"

In [None]:
# Load the training and test CSVs into DataFrames with pandas' default options.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_file, test_data_file)
)

In [None]:
# Show the training frame (Jupyter renders a cell's final expression).
train_data

In [None]:
# Show the test frame (Jupyter renders a cell's final expression).
test_data

In [None]:
# Peek at the raw text of the row labelled 0 before any cleaning is applied.
train_data.loc[0, 'excerpt']

In [None]:
def text_process(texts):
    """Clean the ``excerpt`` column of ``texts`` in place.

    For every excerpt: replace each non-letter character with a space,
    tokenise with ``TweetTokenizer`` (case is not preserved), drop NLTK's
    English stop words, lemmatise the remaining tokens with
    ``WordNetLemmatizer`` and join them back with single spaces.

    Requires the NLTK ``stopwords`` and ``wordnet`` corpora to be available.

    Parameters
    ----------
    texts : pandas.DataFrame
        Frame with an ``excerpt`` column of strings. The column is
        overwritten with the cleaned text; other columns are untouched.

    Returns
    -------
    None
        The frame is modified in place.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    # A set gives constant-time membership tests; the plain list returned by
    # NLTK would be scanned linearly for every single token.
    stopwords_english = set(stopwords.words('english'))
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

    def _clean(text):
        # Turn every non-letter -- newlines included -- into a space.
        # Newlines must NOT be deleted beforehand: doing so glues the last
        # word of one line onto the first word of the next ("end\nstart"
        # would become the single bogus token "endstart").
        text = re.sub('[^a-zA-Z]', ' ', text)
        # Only letters and spaces remain, so no punctuation tokens can occur
        # and a separate punctuation filter is unnecessary.
        return ' '.join(
            lemmatizer.lemmatize(word)
            for word in tokenizer.tokenize(text)
            if word not in stopwords_english
        )

    texts['excerpt'] = np.array([_clean(text) for text in texts['excerpt']])

In [None]:
# Apply the same in-place cleaning to both frames, training data first.
for frame in (train_data, test_data):
    text_process(frame)

In [None]:
# Same training row as inspected earlier, now after cleaning.
train_data.loc[0, 'excerpt']

In [None]:
# Cleaned text of the test row labelled 0.
test_data.loc[0, 'excerpt']

In [None]:
# Columns that must not be used as features: the label itself, its
# accompanying standard error, and the row identifier.
non_feature_columns = ['standard_error', 'target', 'id']

# Y is an independent copy of the label column; X is every remaining column.
Y = train_data['target'].copy()
X = train_data.drop(columns=non_feature_columns)

In [None]:
# Unigram TF-IDF features over the cleaned excerpts.
tf = TfidfVectorizer(binary=True, ngram_range=(1, 1))

# Hold out 40% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
train_frame, val_frame, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.4, random_state=1
)

# Fit the vectoriser on the training split only, then apply the fitted
# vectoriser unchanged to the validation split.
X_train = tf.fit_transform(train_frame['excerpt'])
X_val = tf.transform(val_frame['excerpt'])

In [None]:
# Ridge regression on the TF-IDF features; `fit` returns the estimator itself,
# so the chained form binds the fitted model directly.
model = Ridge(alpha=0.5).fit(X_train, Y_train)
model

In [None]:
# Mean squared error on the held-out validation split.
# scikit-learn's signature is mean_squared_error(y_true, y_pred): pass the
# ground truth first and the predictions second (the value is the same either
# way for MSE, but the documented order keeps the call correct by contract).
val_predictions = model.predict(X_val)
print(mean_squared_error(Y_val, val_predictions))

In [None]:
# Drop the identifier column before building features. `errors='ignore'`
# keeps this cell re-runnable: without it, a second execution raises KeyError
# because `id` has already been removed from `test_data`.
test_data = test_data.drop(columns='id', errors='ignore')

# Encode the test excerpts with the already-fitted vectoriser (transform only,
# never re-fit) so the columns line up with what the model was trained on.
X_test = tf.transform(test_data['excerpt'])

In [None]:
# Predict a target value for every row of the vectorised test set.
predict = model.predict(X_test)

In [None]:
# Start from the sample submission file and overwrite its `target` column
# with the model's predictions.
# NOTE(review): this assumes the sample submission rows are in the same order
# as the test file rows -- confirm.
submission = pd.read_csv(sample_submission_file).assign(target=predict)
submission

In [None]:
# Write the submission to the working directory; index=False leaves the
# DataFrame index out of the file.
submission.to_csv('submission.csv', index=False)