In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer

In [None]:
# Read the competition files: labelled training excerpts, the unlabelled test
# excerpts and the sample submission whose column names are reused at the end.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
ss = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Keep the test ids aside; `test` is rebound to a feature matrix later on.
test_ids = test['id']

In [None]:
def count_words(x):
    """Return the number of whitespace-separated tokens in the string ``x``."""
    tokens = x.split()
    return len(tokens)

def count_sentences(x):
    """Return the number of period-delimited sentences in the string ``x``.

    The text is split on ``'.'`` and only segments that contain something
    other than whitespace are counted. This avoids counting the empty piece
    after a final period (``"A. B."`` is 2 sentences, not 3) and the empty
    pieces produced by an ellipsis. An empty string yields 0.
    """
    return sum(1 for segment in x.split('.') if segment.strip())

def remove_punctuation(x):
    """Return ``x`` with every character listed in ``string.punctuation`` deleted."""
    # A translation table that maps each punctuation character to "delete".
    strip_table = str.maketrans('', '', string.punctuation)
    return x.translate(strip_table)

def preprocessing(x):
    """Normalise one excerpt for bag-of-words modelling.

    Steps, in order:
      1. lower-case the text;
      2. drop NLTK English stopwords;
      3. strip punctuation (via ``remove_punctuation``);
      4. lemmatise every token with the WordNet lemmatiser, applying the
         adjective, noun, verb and adverb tags one after another;
      5. drop stopwords again, since stripping punctuation and lemmatising
         can turn a token into a stopword (e.g. ``"the,"`` -> ``"the"``).

    Parameters
    ----------
    x : str
        Raw excerpt text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # A set gives O(1) membership tests; the stopword list is scanned per token.
    stopw = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()

    # Lower-case *before* the first stopword pass so that capitalised stopwords
    # and contractions ("The", "Don't") are matched against the lower-case
    # stopword list instead of slipping through as "dont" after punctuation
    # has been removed.
    tokens = [word for word in x.lower().split() if word not in stopw]

    # Punctuation is removed on the joined text so that tokens made only of
    # punctuation vanish when the text is split again.
    tokens = remove_punctuation(' '.join(tokens)).split()

    # Same tag order as before: every token as ADJ, then noun, verb, adverb.
    for pos in (wordnet.ADJ, 'n', 'v', 'r'):
        tokens = [wnl.lemmatize(word, pos) for word in tokens]

    return ' '.join(word for word in tokens if word not in stopw)

def word_freq(arr):
    """Count how often each whitespace-separated token occurs across ``arr``.

    ``arr`` is an iterable of strings. The result is a plain ``dict`` mapping
    each token to its total number of occurrences over all strings.
    """
    counts = {}
    for token in (tok for passage in arr for tok in passage.split()):
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

def less_words(x, ll=1):
    """Count the tokens of ``x`` that are rare in the corpus.

    Reads the module-level ``frequency`` dict (token -> corpus count), which
    must be defined before this function is called.

    Parameters
    ----------
    x : str
        Text whose whitespace-separated tokens are checked.
    ll : int, optional
        Inclusive upper bound on the corpus count for a token to be counted
        as rare. Defaults to 1, the previously hard-coded value.

    Returns
    -------
    int
        Number of tokens (repeats included) whose corpus count is ``<= ll``.
        A token missing from ``frequency`` is treated as having count 0, so
        it is counted as rare instead of raising ``KeyError``.
    """
    return sum(1 for word in x.split() if frequency.get(word, 0) <= ll)

def filter_words(x, ul=250):
    """Keep only the tokens of ``x`` that are known and not too frequent.

    Reads the module-level ``frequency`` dict (token -> corpus count), which
    must be defined before this function is called.

    Parameters
    ----------
    x : str
        Text whose whitespace-separated tokens are filtered.
    ul : int, optional
        Exclusive upper bound on the corpus count; tokens with a count of
        ``ul`` or more are dropped. Defaults to 250, the previously
        hard-coded value.

    Returns
    -------
    str
        The surviving tokens, in their original order, joined by single
        spaces. Tokens absent from ``frequency`` are dropped.
    """
    # Return the filtered tokens; joining the unfiltered token list here
    # would make the whole function a no-op.
    kept = [word for word in x.split()
            if word in frequency and frequency[word] < ul]
    return ' '.join(kept)
        

In [None]:
# Stack train and test so every feature below is computed the same way for
# both; with ignore_index=True the first len(train) rows are the train rows.
data = pd.concat([train, test], ignore_index=True)

# Presence flags: 1 when the field is filled in, 0 when it is missing.
# notna() yields integer columns directly, whereas converting to str and
# comparing against 'nan' leaves the columns with object dtype.
data['url_legal'] = data['url_legal'].notna().astype(int)
data['license'] = data['license'].notna().astype(int)

# Surface statistics taken from the raw text, before any cleaning.
data['num_words'] = data['excerpt'].apply(count_words)
data['num_sentences'] = data['excerpt'].apply(count_sentences)
data['ratio'] = data['num_words'] / data['num_sentences']

# From here on `excerpt` holds the cleaned text rather than the raw text.
data['excerpt'] = data['excerpt'].apply(preprocessing)

# Token count after cleaning.
data['num_words_clr'] = data['excerpt'].apply(count_words)



In [None]:
# Token counts over every cleaned excerpt in `data`. less_words and
# filter_words read this module-level `frequency` table.
passages = data['excerpt'].tolist()
frequency = word_freq(passages)

# Count the rare tokens first, then rewrite the excerpt through filter_words.
data['low_freq_words'] = data.excerpt.apply(less_words)
data['excerpt'] = data['excerpt'].apply(filter_words)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=15000)
# Learn the vocabulary from the *preprocessed* training excerpts, i.e. the
# first len(train) rows of `data`, because that is the text the vectoriser is
# later asked to transform. Fitting on the raw `train.excerpt` would build the
# vocabulary from unlemmatised words and stopwords that no longer occur in
# `data.excerpt`. Test rows are still excluded from the fit.
cv.fit(data.excerpt.iloc[:train.shape[0]])

In [None]:
# Dense bag-of-words counts, one row per excerpt in `data`.
X = pd.DataFrame(cv.transform(data.excerpt).toarray())

# The count columns are labelled 0..N-1 (ints) while the columns added below
# are labelled with strings. scikit-learn (1.2 and later) refuses frames whose
# column labels mix types, so make every label a string up front.
X.columns = X.columns.astype(str)

# Append the hand-made features next to the counts.
cols = ['url_legal','license','num_words','num_sentences','num_words_clr','low_freq_words']
for col in cols:
    X[col] = data[col]
X.shape

In [None]:
# Split the stacked feature matrix back into its train and test rows. Both
# slices are evaluated before `train` is rebound, so they share one row count.
train, test = X.iloc[:train.shape[0], :], X.iloc[train.shape[0]:, :]

# Targets for the training rows only.
y = data['target'].iloc[:train.shape[0]].copy()

In [None]:
# Sanity check: report the shapes of the feature matrices and the target.
for label, obj in (('Train shape', train), ('Test shape', test), ('y shape', y)):
    print(label, obj.shape)

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error as mse

In [None]:
# Candidate regressors to compare, keyed by the name printed next to the score.
dor = {
    'linear_regression': LinearRegression(),
    # Seeded so that repeated runs of the comparison give the same forest.
    'randomforest': RandomForestRegressor(random_state=10),
    'knn': KNeighborsRegressor(),
    'xgb': xgb.XGBRegressor(),
}

In [None]:
def training(dor,X,y):
    """Fit each regressor in ``dor`` on a hold-out split and print its RMSE.

    Parameters
    ----------
    dor : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.
    X : feature matrix
        Passed to ``train_test_split`` together with ``y``.
    y : target values
        One target per row of ``X``.

    Returns
    -------
    None
        The scores are printed, one line per regressor.
    """
    # Fixed 80/20 split so every regressor is scored on the same rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=10)

    for regressor_name, regressor in dor.items():
        regressor.fit(X_train, y_train)
        # `mse` is the mean squared error; take the square root so the number
        # printed actually is the RMSE that the label announces.
        rmse = mse(y_test, regressor.predict(X_test)) ** 0.5
        print(regressor_name, 'rmse', rmse)

In [None]:
# Final model: gradient-boosted trees fitted on all training rows, leaving out
# the two presence-flag columns.
regressor = xgb.XGBRegressor()
regressor.fit(train.drop(columns=['url_legal', 'license']), y)

In [None]:
# Build the submission using the column names of the sample submission file:
# first column the test ids, second column the predictions.
cols = ss.columns
ans = pd.DataFrame({
    cols[0]: test_ids,
    cols[1]: regressor.predict(test.drop(['url_legal','license'],axis=1)),
})

# index=False keeps the DataFrame's row index out of the file; otherwise
# to_csv writes it as an extra unnamed first column.
ans.to_csv('./submission.csv', index=False)