**Libraries:** 

**Exploratory Data Analysis** \
I used the following libraries for exploratory data analysis: \
matplotlib \
seaborn \
collections (Counter) \
plotly.express \
scipy.stats 

**Models Used** \
I decided to run 7 different regression models: \
Random Forest Regression \
Gradient Boosting Regression \
Support Vector Regression \
AdaBoost Regression \
XGBoost Regression \
Ridge Regression \
Linear Regression 


In [None]:
#Obtain Libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
import plotly.express as px

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

import xgboost as xgb

import scipy.stats as st
from statistics import mean

from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag

import re


In [None]:
# Load the labelled training excerpts.
train_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")

# Remove incomplete entries: rows whose target AND standard_error are both 0.
_incomplete = train_df["target"].eq(0) & train_df["standard_error"].eq(0)
train_df = train_df.loc[~_incomplete].reset_index(drop=True)

# Load the unlabelled test excerpts and the sample submission (kept for reference).
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
# Preview the first five rows of the filtered training data.
train_df.head(n=5)

In [None]:
# Show the training row(s) whose id matches one specific excerpt.
train_df[train_df["id"] == "c12129c31"]

**Reading in the Data** \
The training and test data sets are read in. \
The submission file for reference is also read in.

**Training Dataset** \
For the training dataset, I removed any entries whose target and standard error are both 0. \
After the entries are removed, I take a look at the train data.



In [None]:
#====== Preprocessing function ======
def preprocess(data):
    """Clean every text in ``data['excerpt']`` and return the cleaned strings.

    Each excerpt goes through the same steps, in order:
      1. every non-alphabetic character is replaced by a space,
      2. the text is lower-cased,
      3. it is tokenized with ``nltk.word_tokenize``,
      4. English stopwords are dropped,
      5. the remaining tokens are lemmatized with ``WordNetLemmatizer``
         and re-joined with single spaces.

    Args:
        data: Object indexable by ``'excerpt'`` that yields strings
            (the notebook passes a DataFrame).

    Returns:
        list[str]: One cleaned string per excerpt, in input order.
    """
    # Build the stopword set and the lemmatizer once per call. Previously the
    # set was rebuilt for every single token and the lemmatizer for every
    # excerpt, which dominated the runtime without changing the result.
    stop_words = set(stopwords.words("english"))
    lemma = nltk.WordNetLemmatizer()

    excerpt_processed = []
    for e in data['excerpt']:
        # keep letters only, then convert to lower case
        e = re.sub("[^a-zA-Z]", " ", e).lower()

        # tokenize words
        tokens = nltk.word_tokenize(e)

        # remove stopwords and lemmatize what is left
        kept = [lemma.lemmatize(word) for word in tokens if word not in stop_words]

        excerpt_processed.append(" ".join(kept))

    return excerpt_processed

**Preprocessing the Data**

To preprocess the train and test data, we go through each entry in the excerpt variable and replace every non-alphabetic character with a space, convert the text to lowercase, tokenize the words using nltk, remove English stopwords, and lemmatize each remaining word.

The result is stored in a new column, "excerpt_preprocessed", which holds the cleaned excerpt text that we will be working with for our regressions.

In [None]:
# Add the cleaned text as a new column on both the training and the test frame.
for _frame in (train_df, test_df):
    _frame["excerpt_preprocessed"] = preprocess(_frame)

In [None]:
#Excerpt
# Print the first raw excerpt. The column is selected by name instead of by
# position so the lookup does not silently pick another column if the frame's
# column order changes.
print(train_df["excerpt"].iloc[0])

In [None]:
#Excerpt Preprocessed
# Print the first cleaned excerpt. The column is selected by name instead of
# by position so the lookup does not depend on how many columns were added
# before this cell ran.
print(train_df["excerpt_preprocessed"].iloc[0])

**Exploratory Data Analysis** \
We explore the following variables: \
Target \
Standard Error \
Excerpt 

In [None]:
#View target in depth
# Prints summary statistics of the `target` column, then draws a normalised
# histogram with a Gaussian KDE curve on top.
#Title#
print("target Variable")
print("----------")

#Mean#
target_mean = train_df["target"].mean()
print(f"Mean: {target_mean}")

#Median#
target_median = train_df["target"].median()
print(f"Median: {target_median}")

#Standard Deviation
target_std = train_df["target"].std()
print(f"Standard Deviation: {target_std}")

#Minimum Value
target_min = train_df["target"].min()
print(f"Minimum Value: {target_min}")

#25th Percentile
target_25 = np.percentile(train_df["target"],25)
print(f"25th Percentile: {target_25}")

#50th Percentile
target_50 = np.percentile(train_df["target"],50)
print(f"50th Percentile: {target_50}")

#75th Percentile
target_75 = np.percentile(train_df["target"],75)
print(f"75th Percentile: {target_75}")

#Maximum Value
target_max = train_df["target"].max()
print(f"Maximum Value: {target_max}")

#Skew
target_skew = train_df["target"].skew(axis= 0, skipna = True)
print(f"Skew: {target_skew}")

#Plot 'target' variable

plt.hist(train_df['target'],edgecolor = 'black', bins=50, density=True)
# Read the histogram's x-limits and set them explicitly so the KDE curve
# plotted below spans exactly the histogram's range.
mn, mx = plt.xlim()
plt.xlim(mn, mx)
kde_xs = np.linspace(mn, mx, 300)
kde = st.gaussian_kde(train_df['target'])
plt.plot(kde_xs, kde.pdf(kde_xs), label="PDF")
# density=True normalises the histogram, so the y-axis is a density, not a count.
plt.ylabel('Density')
plt.xlabel('Target')
# Show the legend so the "PDF" label on the KDE curve is actually displayed.
plt.legend()
plt.title("Target Distribution");

**Target Variable** \
Of the values mentioned, the most intriguing values are the minimum and maximum values in the target variable, which are -3.676267773 and 1.711389827 respectively.

The mean value of the target variable is -0.9596573929279933, which indicates that on average most text is on the difficult end.

Afterwards, the distribution is plotted, showing a bell curve.

In [None]:
#View standard_error in depth
# Prints summary statistics of the `standard_error` column, then draws a
# normalised histogram with a Gaussian KDE curve on top.
#Title#
print("standard_error Variable")
print("----------")

#Mean#
standard_error_mean = train_df["standard_error"].mean()
print(f"Mean: {standard_error_mean}")

#Median#
standard_error_median = train_df["standard_error"].median()
print(f"Median: {standard_error_median}")

#Standard Deviation
standard_error_std = train_df["standard_error"].std()
print(f"Standard Deviation: {standard_error_std}")

#Minimum Value
standard_error_min = train_df["standard_error"].min()
print(f"Minimum Value: {standard_error_min}")

#25th Percentile
standard_error_25 = np.percentile(train_df["standard_error"],25)
print(f"25th Percentile: {standard_error_25}")

#50th Percentile
standard_error_50 = np.percentile(train_df["standard_error"],50)
print(f"50th Percentile: {standard_error_50}")

#75th Percentile
standard_error_75 = np.percentile(train_df["standard_error"],75)
print(f"75th Percentile: {standard_error_75}")

#Maximum Value
standard_error_max = train_df["standard_error"].max()
print(f"Maximum Value: {standard_error_max}")

#Skew
# Fixed: the skew was previously computed on the `target` column, so this
# section reported the target's skew under the standard_error heading.
standard_error_skew = train_df["standard_error"].skew(axis= 0, skipna = True)
print(f"Skew: {standard_error_skew}")

#Plot 'standard_error' variable

plt.hist(train_df['standard_error'],edgecolor = 'black', bins=50, density=True)
# Read the histogram's x-limits and set them explicitly so the KDE curve
# plotted below spans exactly the histogram's range.
mn, mx = plt.xlim()
plt.xlim(mn, mx)
kde_xs = np.linspace(mn, mx, 300)
kde = st.gaussian_kde(train_df['standard_error'])
plt.plot(kde_xs, kde.pdf(kde_xs), label="PDF")
# density=True normalises the histogram, so the y-axis is a density, not a count.
plt.ylabel('Density')
plt.xlabel('Standard Error')
# Show the legend so the "PDF" label on the KDE curve is actually displayed.
plt.legend()
plt.title("Standard Error Distribution");

In [None]:
def clean_text(text):
    """Split *text* into lower-case alphabetic words with English stopwords removed.

    Every non-alphabetic character is replaced by a space before the text is
    lower-cased and split on whitespace.

    Args:
        text: The string to clean.

    Returns:
        list[str]: The remaining words, in their original order.
    """
    # Build the stopword set once per call; the list was previously re-fetched
    # and scanned linearly for every single word.
    stop_words = set(stopwords.words('english'))
    words = re.sub("[^a-zA-Z]", " ", text).lower().split()
    return [word for word in words if word not in stop_words]

# Tokenise every raw excerpt into its non-stopword words.
train_df['temp'] = train_df["excerpt"].map(clean_text)

# Count how often each word occurs across all excerpts.
top = Counter()
for words in train_df['temp']:
    top.update(words)

# Keep the 20 most frequent words as a two-column table.
wordlist = pd.DataFrame(top.most_common(20), columns=['Word', 'Frequency'])

#Bar Chart For Word Frequency
wordlist.plot.bar(x='Word', y='Frequency')
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.title("Top 20 Words Frequency Distribution - Excerpt");

In [None]:
def clean_text(text):
    """Split *text* into lower-case alphabetic words with English stopwords removed.

    Every non-alphabetic character is replaced by a space before the text is
    lower-cased and split on whitespace.

    Args:
        text: The string to clean.

    Returns:
        list[str]: The remaining words, in their original order.
    """
    # Build the stopword set once per call; the list was previously re-fetched
    # and scanned linearly for every single word.
    stop_words = set(stopwords.words('english'))
    words = re.sub("[^a-zA-Z]", " ", text).lower().split()
    return [word for word in words if word not in stop_words]

# Tokenise every preprocessed excerpt into its non-stopword words.
train_df['temp'] = train_df["excerpt_preprocessed"].map(clean_text)

# Count how often each word occurs across all preprocessed excerpts.
top = Counter()
for words in train_df['temp']:
    top.update(words)

# Keep the 20 most frequent words as a two-column table.
wordlist = pd.DataFrame(top.most_common(20), columns=['Word', 'Frequency'])

#Bar Chart For Word Frequency
wordlist.plot.bar(x='Word', y='Frequency', color='purple')
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.title("Top 20 Words Frequency Distribution - Excerpt Preprocessed");

**Excerpt**

In [None]:
def avg_word_len(text):
    """Return the mean word length of each string in the Series *text*.

    Each string is split on whitespace and the character counts of its words
    are averaged with ``np.mean``.
    """
    def _mean_len(words):
        return np.mean([len(word) for word in words])

    return text.str.split().map(_mean_len)

# Four stacked scatterplots of simple raw-excerpt features against the target.
fig, ax = plt.subplots(4, 1, figsize=(10,15))

# Number of whitespace-separated words per excerpt.
train_df['text_len'] = train_df['excerpt'].str.split().map(len)
sns.scatterplot(x='text_len', y='target', data=train_df, color='blue', ax=ax[0])
ax[0].set_title("Word Count vs Target - Excerpt", fontweight ="bold")

# Mean number of characters per word.
avg_len = avg_word_len(train_df['excerpt'])
train_df['avg_word_len'] = avg_len
sns.scatterplot(x='avg_word_len', y='target', data=train_df, color='blue', ax=ax[1])
ax[1].set_title("Average Word Length vs Target - Excerpt", fontweight ="bold")

# Number of sentences. Fixed: this used len(x.split('\n')), which counts
# newline-separated lines rather than sentences, although the panel is
# titled "Sentence Count".
train_df['no_sents'] = train_df['excerpt'].map(lambda x: len(sent_tokenize(x)))
sns.scatterplot(x='no_sents', y='target', data=train_df, color='blue', ax=ax[2])
ax[2].set_title("Sentence Count vs Target - Excerpt", fontweight ="bold")

# Total number of characters per excerpt.
train_df['chr_len'] = train_df['excerpt'].str.len()
sns.scatterplot(x='chr_len', y='target', data=train_df, color='blue', ax=ax[3])
ax[3].set_title("Character Count vs Target - Excerpt", fontweight ="bold")

plt.subplots_adjust(hspace=0.35)

plt.show()

In [None]:
def min_max_mean_sentence_length(text):
    """Return the longest, shortest and mean sentence length of *text*, in words.

    The text is split into sentences with ``sent_tokenize`` and each sentence
    into whitespace-separated words.

    Changes from the previous implementation:
      * lengths were stored in a dict keyed by the sentence text, so a
        sentence occurring more than once was counted only once; every
        sentence now contributes to the statistics;
      * words are split on any whitespace (as the word-count feature does)
        instead of on single spaces, so newlines and repeated spaces inside
        a sentence no longer distort the count;
      * text without any sentence returns zeros instead of raising from
        ``max()`` on an empty sequence.

    Args:
        text: The excerpt to analyse.

    Returns:
        tuple: ``(max_len, min_len, mean_len)`` -- note that the maximum comes
        first despite the function name. ``mean_len`` is rounded to three
        decimals.
    """
    lengths = [len(sentence.split()) for sentence in sent_tokenize(text)]
    if not lengths:
        return 0, 0, 0.0

    return max(lengths), min(lengths), round(mean(lengths), 3)

# Expand the (max, min, mean) tuple computed per excerpt into three columns.
train_df[['max_len_sent','min_len_sent','avg_len_sent']] = train_df.apply(
    lambda row: min_max_mean_sentence_length(row['excerpt']),
    axis=1,
    result_type='expand',
)

# One scatterplot per sentence-length feature, each against the target.
fig, ax = plt.subplots(3, 1, figsize=(10,15))
_sentence_panels = [
    ('max_len_sent', "Maximum Sentence Length in Excerpt vs Target - Excerpt"),
    ('min_len_sent', "Minimum Sentence Length in Excerpt vs Target - Excerpt"),
    ('avg_len_sent', "Average Sentence Length in Excerpt vs Target - Excerpt"),
]
for _axis, (_column, _title) in zip(ax, _sentence_panels):
    sns.scatterplot(x=_column, y='target', data=train_df, color='blue', ax=_axis)
    _axis.set_title(_title, fontweight ="bold")

**Excerpt Preprocessed**

In [None]:
def avg_word_len(text):
    """Compute, for every string in the Series *text*, its average word length.

    Words are the whitespace-separated pieces of each string; their lengths
    in characters are averaged with ``np.mean``.
    """
    word_lists = text.str.split()
    return word_lists.map(lambda words: np.mean([len(w) for w in words]))

# Four stacked scatterplots of simple preprocessed-excerpt features against the target.
fig, ax = plt.subplots(4, 1, figsize=(10,15))

# --- feature columns -------------------------------------------------------
# Number of whitespace-separated words.
train_df['text_len_pre'] = train_df['excerpt_preprocessed'].str.split().map(len)

# Mean number of characters per word.
avg_len_pre = avg_word_len(train_df['excerpt_preprocessed'])
train_df['avg_word_len_pre'] = avg_len_pre

# Number of newline-separated segments.
# NOTE(review): preprocess() replaces every non-letter with a space and joins
# tokens with single spaces, so this looks like it is always 1 -- confirm.
train_df['no_sents_pre'] = train_df['excerpt_preprocessed'].map(lambda text: len(text.split('\n')))

# Total number of characters.
train_df['chr_len_pre'] = train_df['excerpt_preprocessed'].str.len()

# --- plots -----------------------------------------------------------------
_pre_panels = [
    ('text_len_pre', "Word Count vs Target - Excerpt Preprocessed"),
    ('avg_word_len_pre', "Average Word Length vs Target - Excerpt Preprocessed"),
    ('no_sents_pre', "Sentence Count vs Target - Excerpt Preprocessed"),
    ('chr_len_pre', "Character Count vs Target - Excerpt Preprocessed"),
]
for _axis, (_column, _title) in zip(ax, _pre_panels):
    sns.scatterplot(x=_column, y='target', data=train_df, color='purple', ax=_axis)
    _axis.set_title(_title, fontweight ="bold")

plt.subplots_adjust(hspace=0.35)

plt.show()

**Combined**

In [None]:
def avg_word_len(text):
    """Average number of characters per word for each string in the Series *text*.

    Strings are split on whitespace; the per-word character counts of each
    string are averaged with ``np.mean``.
    """
    lengths_per_text = text.str.split().map(lambda words: [len(word) for word in words])
    return lengths_per_text.map(lambda lengths: np.mean(lengths))

# Overlay the raw-excerpt (blue) and preprocessed-excerpt (purple) features on
# shared axes, one panel per feature, each plotted against the target.
fig, ax = plt.subplots(4, 1, figsize=(10,15))

# Number of whitespace-separated words.
train_df['text_len'] = train_df['excerpt'].str.split().map(len)
train_df['text_len_pre'] = train_df['excerpt_preprocessed'].str.split().map(len)

# Mean number of characters per word.
avg_len = avg_word_len(train_df['excerpt'])
avg_len_pre = avg_word_len(train_df['excerpt_preprocessed'])
train_df['avg_word_len'] = avg_len
train_df['avg_word_len_pre'] = avg_len_pre

# Number of sentences. Fixed for the raw excerpt: it used len(x.split('\n')),
# which counts newline-separated lines rather than sentences.
train_df['no_sents'] = train_df['excerpt'].map(lambda x: len(sent_tokenize(x)))
# NOTE(review): preprocess() strips punctuation and newlines, so the
# preprocessed count looks like it is always 1 -- confirm whether this series
# is worth plotting.
train_df['no_sents_pre'] = train_df['excerpt_preprocessed'].apply(lambda x : len(x.split('\n')))

# Total number of characters.
train_df['chr_len'] = train_df['excerpt'].str.len()
train_df['chr_len_pre'] = train_df['excerpt_preprocessed'].str.len()

# (raw column, preprocessed column, x-axis label, panel title)
_combined_panels = [
    ('text_len', 'text_len_pre', 'Word Count', "Word Count vs Target - Combined"),
    ('avg_word_len', 'avg_word_len_pre', 'Average Word Length', "Average Word Length vs Target - Combined"),
    ('no_sents', 'no_sents_pre', 'Sentence Count', "Sentence Count vs Target - Combined"),
    ('chr_len', 'chr_len_pre', 'Character Count', "Character Count vs Target - Combined"),
]
for _axis, (_raw_col, _pre_col, _xlabel, _title) in zip(ax, _combined_panels):
    # Label both series and draw a legend; the two colours were previously
    # unexplained on the figure.
    sns.scatterplot(x=_raw_col, y='target', data=train_df, color='blue',
                    label='Excerpt', ax=_axis)
    sns.scatterplot(x=_pre_col, y='target', data=train_df, color='purple',
                    label='Excerpt Preprocessed', ax=_axis)
    # Use a neutral axis label since two different columns share the axis.
    _axis.set_xlabel(_xlabel)
    _axis.set_title(_title, fontweight ="bold")
    _axis.legend()

plt.subplots_adjust(hspace=0.35)

plt.show()

**Exploring Excerpt Data** \
Working with the excerpts, we look at the following and graph it: \
Word Count = Total number of words in the excerpt \
Average Word Length = Average character length per word in excerpt \
Sentence Count = Total number of sentences in excerpt \
Character Length = Total number of characters in excerpt \
Maximum Sentence Length = Maximum length of sentence in excerpt \
Minimum Sentence Length = Minimum length of sentence in excerpt \
Average Sentence Length = Average length of sentence in excerpt

Exploring the excerpt, we look at excerpt and the preprocessed version of excerpt.  \
In my exploratory data analysis of excerpt, I graphed both excerpt and the preprocessed version of excerpt for each of the categories for excerpt for comparison purposes. \
We want to know how each of the categories relates to the target variable. We also want to know how preprocessing the data changes the relationship between each category and the target variable in the scatterplots.


In [None]:
def training(model, X_train, y_train, X_test, y_test, model_name):
    """Fit *model* on TF-IDF features, report its hold-out MSE and add it to a bar chart.

    The estimator is wrapped in a pipeline behind a binary, unigram
    ``TfidfVectorizer``, so the X arguments are raw text.

    Args:
        model: Regressor placed after the vectorizer.
        X_train, y_train: Texts and targets the pipeline is fitted on.
        X_test, y_test: Texts and targets used to compute the error.
        model_name: Label used in the printed report and as the bar's x value.

    Returns:
        None. The error is printed and drawn as a bar on the current
        matplotlib axes.
    """
    pipeline = make_pipeline(TfidfVectorizer(binary=True, ngram_range=(1,1)), model)
    pipeline.fit(X_train, y_train)
    error = mse(y_test, pipeline.predict(X_test))

    print("Model:", model_name)
    print("Mean Squared Error:", error)

    # Every call adds one bar to the same current axes.
    plt.bar(model_name, error, align='center', alpha=0.5)
    plt.xticks(rotation='vertical')
    plt.xlabel('Regression Model')
    plt.ylabel('Mean Squared Error')
    plt.title("Regression Model: Mean Squared Error");

# Candidate regressors. All use library defaults except XGBoost.
rfr = RandomForestRegressor()
gbr = GradientBoostingRegressor()
svr = SVR()
abr = AdaBoostRegressor()
xg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=100,
)
ridge = Ridge()
lr = LinearRegression()

# Estimators and their display names, kept in matching order.
m = [rfr, gbr, svr, abr, xg, ridge, lr]
mn = [
    "Random Forest Regression",
    "Gradient Boosting Regression",
    "Support Vector Regression",
    "AdaBoost Regression",
    "XGBoost Regression",
    "Ridge Regression",
    "Linear Regression",
]

# Features are the preprocessed texts; the label is the target score.
X = train_df["excerpt_preprocessed"].values
y = train_df['target'].values

# Hold out 30% of the training data for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit and score each model in turn.
for _estimator, _display_name in zip(m, mn):
    training(model=_estimator, X_train=X_train, y_train=y_train,
             X_test=X_test, y_test=y_test, model_name=_display_name)

**Training Model**

**Mean Squared Error** \
Based on the results, it would appear that Ridge Regression has the best results since it has the lowest mean squared error. \
XGBoost had very similar results regardless of whether n_estimators is 100 or 1000.

In [None]:
def training_all(model, X, y, X_new=None):
    """Fit *model* on all of ``X``/``y`` and return its predictions for new texts.

    The estimator is wrapped in a pipeline behind a binary, unigram
    ``TfidfVectorizer``, so ``X`` and ``X_new`` are raw text.

    Args:
        model: Regressor placed after the vectorizer.
        X: Training texts.
        y: Training targets.
        X_new: Texts to predict on. Defaults to the global
            ``test_df["excerpt_preprocessed"]``, which was previously the
            only (hard-coded) option; existing three-argument calls behave
            exactly as before.

    Returns:
        The pipeline's predictions for ``X_new``.
    """
    if X_new is None:
        # Backward-compatible default: predict on the competition test set.
        X_new = test_df["excerpt_preprocessed"]

    pipeline = make_pipeline(
        TfidfVectorizer(binary=True, ngram_range=(1,1)),
        model,
    )
    pipeline.fit(X, y)

    return pipeline.predict(X_new)

**Results**

In [None]:
#Random Forest Regression Results
# Refit the random forest on all training data and predict the test set.
test_pred = training_all(rfr,X,y)

# Pair each test id with its predicted target.
predictions = pd.DataFrame({'id': test_df['id'], 'target': test_pred})

# NOTE(review): every results cell writes this same submission.csv, so the
# cell that runs last decides which model's predictions are submitted.
predictions.to_csv("/kaggle/working/submission.csv", index=False)
predictions

In [None]:
#Gradient Boosting Regression Results
# Refit gradient boosting on all training data and predict the test set.
test_pred = training_all(gbr,X,y)

# Pair each test id with its predicted target.
predictions = pd.DataFrame({'id': test_df['id'], 'target': test_pred})

# NOTE(review): every results cell writes this same submission.csv, so the
# cell that runs last decides which model's predictions are submitted.
predictions.to_csv("/kaggle/working/submission.csv", index=False)
predictions

In [None]:
#Support Vector Regression Results
# Refit the support vector regressor on all training data and predict the test set.
test_pred = training_all(svr,X,y)

# Pair each test id with its predicted target.
predictions = pd.DataFrame({'id': test_df['id'], 'target': test_pred})

# NOTE(review): every results cell writes this same submission.csv, so the
# cell that runs last decides which model's predictions are submitted.
predictions.to_csv("/kaggle/working/submission.csv", index=False)
predictions

In [None]:
#AdaBoost Regression Results
# Refit AdaBoost on all training data and predict the test set.
test_pred = training_all(abr,X,y)

# Pair each test id with its predicted target.
predictions = pd.DataFrame({'id': test_df['id'], 'target': test_pred})

# NOTE(review): every results cell writes this same submission.csv, so the
# cell that runs last decides which model's predictions are submitted.
predictions.to_csv("/kaggle/working/submission.csv", index=False)
predictions

In [None]:
#XGBoost Regression Results
# Refit XGBoost on all training data and predict the test set.
test_pred = training_all(xg,X,y)

# Pair each test id with its predicted target.
predictions = pd.DataFrame({'id': test_df['id'], 'target': test_pred})

# NOTE(review): every results cell writes this same submission.csv, so the
# cell that runs last decides which model's predictions are submitted.
predictions.to_csv("/kaggle/working/submission.csv", index=False)
predictions

In [None]:
#Ridge Regression Results
# Refit ridge regression on all training data and predict the test set.
test_pred = training_all(ridge,X,y)

# Pair each test id with its predicted target.
predictions = pd.DataFrame({'id': test_df['id'], 'target': test_pred})

# NOTE(review): every results cell writes this same submission.csv, so the
# cell that runs last decides which model's predictions are submitted.
predictions.to_csv("/kaggle/working/submission.csv", index=False)
predictions

In [None]:
#Linear Regression Results
# Refit linear regression on all training data and predict the test set.
test_pred = training_all(lr,X,y)

# Pair each test id with its predicted target.
predictions = pd.DataFrame({'id': test_df['id'], 'target': test_pred})

# NOTE(review): every results cell writes this same submission.csv; as the
# final results cell, this one determines the file when the notebook is run
# top to bottom, although the write-up names Ridge Regression as the best
# model -- confirm which model should be submitted.
predictions.to_csv("/kaggle/working/submission.csv", index=False)
predictions