# importing libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error
from math import sqrt

# for ignoring warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Read Data

In [None]:
# Load the competition's training split into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../input/commonlitreadabilityprize/train.csv',
)
# Preview the first five rows (the notebook renders the cell's last expression).
df.head(n=5)

# Text Preprocessing

In [None]:
import re
import string
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def preprocessing_text(text):
    """Normalise a raw excerpt into a space-separated string of processed tokens.

    Pipeline: lower-case -> drop digits -> drop ASCII punctuation -> tokenise ->
    remove English stop words -> Porter-stem -> WordNet-lemmatise.

    Requires the NLTK resources used by ``stopwords``, ``word_tokenize`` and
    ``WordNetLemmatizer`` to be available (presumably the 'stopwords', 'punkt'
    and 'wordnet' packages referenced in the commented-out downloads above).

    Args:
        text: The raw text to clean (``str``).

    Returns:
        str: The processed tokens joined by single spaces; an empty string when
        no token survives the filtering.
    """
    text = text.lower()
    # Remove digit runs. The pattern must be r'\d+'; an unescaped r'd+' would
    # strip the letter "d" from every word and leave the digits in place.
    text = re.sub(r'\d+', '', text)
    # Remove punctuation (ASCII punctuation only, per string.punctuation).
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Trim leading/trailing whitespace.
    text = text.strip()

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Each surviving token is stemmed and then lemmatised, and the result
    # *replaces* the token. Appending the stems/lemmas to the running text
    # would duplicate every word, and appending without a separator would fuse
    # them into one long junk token that pollutes the TF-IDF vocabulary.
    processed_tokens = [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(processed_tokens)

## Apply preprocessing on dataset

In [None]:
# Build the cleaned-text column with a single column assignment.
# Writing through `df['process_text'][i] = ...` is chained assignment: it
# triggers SettingWithCopyWarning and, with pandas Copy-on-Write enabled, never
# updates `df` at all. `Series.apply` also avoids relying on the index being
# exactly 0..len(df)-1.
df['process_text'] = df['excerpt'].apply(preprocessing_text)

In [None]:
# Preview the first rows to check that the `process_text` column was filled in.
df.head()

## Separate dependent and independent variables

In [None]:
# Model input is the cleaned text; the value to predict is the `target` column.
X = df['process_text']
y = df['target']

# Hold out 5% of the rows for evaluation. Shuffling is disabled, so the rows
# keep their original order when they are divided into train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    shuffle=False,
)

# Apply TF-IDF

In [None]:
# TF-IDF vectoriser with scikit-learn's default configuration.
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training texts only, and
# encode those texts in the same step.
tfidf_train = tfidf_vectorizer.fit_transform(raw_documents=X_train)

# Encode the held-out texts with the already-fitted vectoriser (no refitting),
# so both matrices share the same feature columns.
tfidf_test = tfidf_vectorizer.transform(raw_documents=X_test)

In [None]:
# Display the shapes side by side: each TF-IDF matrix should have as many rows
# as the text Series it was built from.
tfidf_train.shape, X_train.shape, tfidf_test.shape, X_test.shape

# Models

### Linear Regression

In [None]:
# Linear regression on the TF-IDF features, then predictions for the held-out rows.
reg = LinearRegression()
reg.fit(tfidf_train, y_train)
y_pred = reg.predict(tfidf_test)

### Checking root mean squared error (RMSE)

In [None]:
# RMSE is the square root of the mean squared error between the held-out
# targets and the latest predictions in `y_pred`.
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
print('Root mean Square is :', rmse)

### Random Forest

In [None]:
# Random forest regressor with default settings, fitted on the TF-IDF training
# matrix and scored by RMSE on the held-out split.
rf_reg = RandomForestRegressor().fit(tfidf_train, y_train)
y_pred = rf_reg.predict(tfidf_test)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
print('Root mean Square is :', rmse)

### Decision Tree

In [None]:
# Single decision tree regressor with default settings, evaluated the same way
# as the other models (RMSE on the held-out split).
dt_reg = DecisionTreeRegressor().fit(tfidf_train, y_train)
y_pred = dt_reg.predict(tfidf_test)
rmse = sqrt(
    mean_squared_error(y_test, y_pred)
)
print('Root mean Square is :', rmse)

### XGBoost

In [None]:
# Gradient-boosted trees via XGBoost with default settings. Despite the `_clf`
# suffix in its name, this object is a regressor.
xgb_clf = xgb.XGBRegressor()
xgb_clf.fit(tfidf_train, y_train)

# Score the held-out predictions with RMSE.
y_pred = xgb_clf.predict(tfidf_test)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
print('Root mean Square is :', rmse)

# Submitting Score

In [None]:
# Load the competition's test split and clean its excerpts with the same
# preprocessing function that was applied to the training data.
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
# Assign the whole column at once. The chained form
# `test['process_text'][i] = ...` raises SettingWithCopyWarning and is a silent
# no-op under pandas Copy-on-Write, which would leave every row empty.
test['process_text'] = test['excerpt'].apply(preprocessing_text)

In [None]:
# Apply TF-IDF to the test dataset, reusing the vectoriser that was fitted on
# the training split (transform only, so the feature columns stay the same).
tX = test['process_text']
test_tfidf = tfidf_vectorizer.transform(raw_documents=tX)

In [None]:
# Display both shapes: the number of columns (features) must be identical,
# because the model that predicts on `test_tfidf` was fitted on `tfidf_train`.
test_tfidf.shape, tfidf_train.shape

### Predicting

In [None]:
# Predict targets for the competition test set. `reg` is the LinearRegression
# model fitted earlier, not the random forest, decision tree or XGBoost model.
y_pred = reg.predict(X=test_tfidf)

### Creating submission.csv file 

In [None]:
# Assemble the two-column submission frame: the test ids and their predictions.
a = pd.DataFrame({'id': test['id'], 'target': y_pred})
# Replace the index with a fresh 0..n-1 range, discarding the old one.
a = a.reset_index(drop=True)
# Save as submission.csv without writing the index column.
a.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame that was written to submission.csv.
a