In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error
from math import sqrt

# for ignoring warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))



In [None]:
# Load the competition files. All three paths are built from one base
# directory so they cannot drift apart (the sample submission was previously
# read via a relative '../input/...' path, which only resolves to the same
# place when the working directory is a sibling of /kaggle/input).
DATA_DIR = '/kaggle/input/commonlitreadabilityprize'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def preprocessing_text(text):
    """Normalise a raw excerpt into a space-separated string of cleaned tokens.

    Steps, in order: lowercase, drop digit runs, drop ASCII punctuation,
    strip surrounding whitespace, tokenize, remove English stopwords, then
    Porter-stem and WordNet-lemmatize each remaining token.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The processed tokens joined by single spaces. Each surviving token
        appears exactly once per occurrence in the input.
    """
    text = text.lower()
    # remove digits; the pattern must be r'\d+' -- a bare r'd+' would strip
    # the letter "d" from every word and leave the numbers in place
    text = re.sub(r'\d+', '', text)
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove surrounding whitespace
    text = text.strip()
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    # stemming followed by lemmatization, applied per token so the result
    # replaces the token instead of being appended to the running text
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens]
    return ' '.join(tokens)

In [None]:
# Preprocess every training excerpt and assign the whole column at once.
# Writing row by row with `train_df['process_text'][i] = ...` is chained
# assignment: it raises SettingWithCopyWarning and does not update the frame
# when pandas Copy-on-Write is enabled. `.apply` also works for any index,
# not only a default 0..n-1 RangeIndex.
train_df['process_text'] = train_df['excerpt'].apply(preprocessing_text)

In [None]:
# Preview the first rows of the training frame.
train_df.head(n=5)

In [None]:
# Features are the preprocessed excerpts; the regression target is `target`.
X = train_df['process_text']
y = train_df['target']

# Hold out 5% of the rows for validation; shuffle=False keeps the row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    shuffle=False,
)

In [None]:
# Fit the TF-IDF vectorizer on the training split only and encode that split.
tfidf_vectorizer = TfidfVectorizer()
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Encode the held-out split with the already-fitted vectorizer.
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
# Display the shapes of the encoded matrices next to their input series.
(
    tfidf_train.shape,
    X_train.shape,
    tfidf_test.shape,
    X_test.shape,
)

In [None]:
# Fit a linear regression on the TF-IDF features, then predict the held-out split.
reg = LinearRegression()
reg.fit(tfidf_train, y_train)
y_pred = reg.predict(tfidf_test)

In [None]:
# Square root of the mean squared error between held-out targets and predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
print('Root mean Square is :', rmse)

In [None]:
# Preprocess every test excerpt and assign the whole column at once.
# Writing row by row with `test_df['process_text'][i] = ...` is chained
# assignment: it raises SettingWithCopyWarning and does not update the frame
# when pandas Copy-on-Write is enabled. `.apply` also works for any index,
# not only a default 0..n-1 RangeIndex.
test_df['process_text'] = test_df['excerpt'].apply(preprocessing_text)

In [None]:
# Encode the competition test excerpts with the already-fitted TF-IDF vectorizer.
tX = test_df['process_text']
test_tfidf = tfidf_vectorizer.transform(tX)

# Display both matrix shapes side by side for comparison.
(test_tfidf.shape, tfidf_train.shape)

In [None]:
# Predict targets for the competition test set; this rebinds `y_pred`.
y_pred = reg.predict(X=test_tfidf)

In [None]:
# Assemble the submission frame (id + predicted target) and write it as CSV
# without the index column.
submission = pd.DataFrame({'id': test_df['id'], 'target': y_pred})
submission = submission.reset_index(drop=True)
submission.to_csv('submission.csv', index=False)