In [None]:
import pandas as pd
import re
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np

In [None]:
# Load the competition training set. `train` keeps the data as read from disk;
# the following cells work on the copy `df`.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
df = train.copy()
# Column dtypes and non-null counts, to see which columns have gaps.
df.info()

In [None]:
# NOTE: missingno is imported here rather than alongside the other imports.
import missingno as msno
# Draw missingno's matrix view of `df` to show where values are absent.
msno.matrix(df)
plt.show()

In [None]:
# Replace every NaN, in every column, with the literal string 'Missing' so that
# absent values become their own category in the count plots below.
# NOTE(review): presumably only url_legal / license contain NaN — confirm with df.info().
df = df.fillna('Missing')

In [None]:
def _shorten_url(raw):
    """Reduce a URL string to a short site label.

    The fragments are removed in a fixed order (scheme, 'www', 'org', '.io',
    '.edu', then every remaining dot) and finally everything from the first
    '/' onwards is dropped.
    """
    label = str(raw)
    for fragment in ('https://', 'http:', 'www', 'org', '.io', '.edu', '.'):
        label = label.replace(fragment, '')
    # NOTE(review): only 'http:' is stripped for plain-http URLs, so their
    # leading '//' makes this regex return '' — confirm no such URLs exist.
    return re.sub(r'/.*', '', label)


df['url_legal'] = df['url_legal'].apply(_shorten_url)

df.url_legal.unique()

In [None]:
# Re-check dtypes / non-null counts after the fill and the url_legal rewrite.
df.info()

In [None]:
# Number of excerpts per shortened source label.
sns.countplot(data=df, x='url_legal')
plt.xticks(rotation=75)  # tilt the category labels
plt.show()

In [None]:
# Distinct raw license strings (including the 'Missing' placeholder from fillna).
df.license.unique()

In [None]:
# Horizontal bar chart: number of excerpts per raw license string.
sns.countplot(data=df, y='license')
plt.xticks(rotation=75)
plt.show()

In [None]:
def _normalise_license(raw):
    """Collapse a license string to a compact label.

    Removes 'CC', spaces, hyphens and commas, then strips version numbers of
    the form <digit>.<digit> (e.g. '4.0').
    """
    label = str(raw)
    for fragment in ('CC', ' ', '-', ','):
        label = label.replace(fragment, '')
    # The dot is escaped so only a literal '.' between two digits counts as a
    # version number; an unescaped '.' would match any character.
    return re.sub(r'\d\.\d', '', label)


df.license = df.license.apply(_normalise_license)

In [None]:
# Same chart as before, now on the shortened license labels.
sns.countplot(data=df, y='license')
plt.xticks(rotation=75)
plt.show()

In [None]:
def change_scale(old_value, old_min=-3.676267773, old_max=1.7113898269999999,
                 new_min=1, new_max=5):
    """Linearly rescale ``old_value`` from [old_min, old_max] to [new_min, new_max].

    Parameters
    ----------
    old_value : number
        Value on the original scale.
    old_min, old_max : number, optional
        Bounds of the original scale. The defaults are the constants this
        notebook has always used (presumably the observed min / max of the
        training target — confirm against the data).
    new_min, new_max : number, optional
        Bounds of the output scale; defaults map onto 1..5.

    Returns
    -------
    float
        ``old_min`` maps to ``new_min`` and ``old_max`` maps to ``new_max``.

    Raises
    ------
    ZeroDivisionError
        For plain Python numbers when ``old_min == old_max``.
    """
    return ((old_value - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min

# Rescale every target value with change_scale.
# NOTE: this overwrites df.target, so running the cell twice rescales twice.
df['target'] = df['target'].map(change_scale)

In [None]:
# Distribution of the rescaled target, plus its extremes as a sanity check.
sns.histplot(data=df, x='target')
lowest, highest = min(df.target), max(df.target)
print(lowest, highest)

In [None]:
# Bucket the rescaled target by rounding each value down to a whole number.
df['target'] = np.floor(df['target'])

In [None]:
# Distribution of the bucketed target.
sns.histplot(data=df, x='target')

In [None]:
# Excerpts per source label, split by target bucket.
sns.countplot(data=df, x='url_legal', hue='target')
plt.xticks(rotation=75)
plt.show()

In [None]:
# Excerpts per license label, split by target bucket.
sns.countplot(data=df, x='license', hue='target')
plt.xticks(rotation=75)
plt.show()

In [None]:
# Box plot of the bucketed target to look for outlying buckets.
sns.boxplot(data=df, x='target')
plt.show()

In [None]:
# Fold bucket 5 into bucket 4; every other value is left as it is.
# NOTE(review): presumably only the single maximum target lands on exactly 5
# after rescaling and flooring — confirm with the histogram.
df['target'] = df['target'].replace({5: 4})

In [None]:
# Box plot again, after merging bucket 5 into bucket 4.
sns.boxplot(data=df, x='target')
plt.show()

In [None]:
# Side by side: counts per source label and per license label, by target bucket.
plt.figure(figsize=(15, 5))

for position, column in enumerate(('url_legal', 'license'), start=1):
    plt.subplot(1, 2, position)
    sns.countplot(x=df[column], hue=df.target)
    plt.xticks(rotation=75)

plt.show()

In [None]:
# Character length of each excerpt as it stands at this point in the notebook.
df['ex_len'] = df['excerpt'].str.len()

sns.histplot(data=df, x='ex_len')
plt.show()

In [None]:
# Which licenses occur for which source label.
sns.countplot(data=df, x='url_legal', hue='license')
plt.show()

In [None]:
# Excerpt-length distribution, stacked by license label.
plt.figure(figsize=(15, 5))
sns.histplot(data=df, x='ex_len', hue='license', multiple='stack', element='step')
plt.show()

In [None]:
# Excerpt-length distribution, stacked by source label.
plt.figure(figsize=(15, 5))
sns.histplot(data=df, x='ex_len', hue='url_legal', multiple='stack', element='step')
plt.show()

In [None]:
# One count plot per license label. No `ax` is passed, so every call draws on
# the current axes and the bars of successive licenses are overlaid.
for license_label in df.license.unique():
    rows_for_license = df[df.license == license_label]
    sns.countplot(x=rows_for_license.url_legal)
plt.xticks(rotation=75)
plt.show()

In [None]:
# Number of distinct license labels after cleaning.
len(df.license.unique())

In [None]:
# One panel per license label showing which source labels it occurs with.
# The grid is sized from the number of labels, so none is dropped and no
# label is drawn twice.
license_labels = df.license.unique()
cols = 3
rows = max(1, -(-len(license_labels) // cols))  # ceiling division
# 25 inches of height for three rows, scaled with the actual row count.
f, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(15, 25 * rows / 3),
                       squeeze=False)  # squeeze=False: axes is always 2-D

panels = axes.ravel()
for ax, license_label in zip(panels, license_labels):
    sns.countplot(x=df[df.license == license_label].url_legal, ax=ax)
    ax.set_title(license_label)
    plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

# Hide any spare panels in the last row instead of leaving them empty.
for ax in panels[len(license_labels):]:
    ax.set_visible(False)

plt.show()

In [None]:
# Inspect the frame after the url_legal / license / target / ex_len changes.
df

In [None]:
# Excerpt-length distribution, stacked by target bucket.
sns.histplot(data=df, x='ex_len', hue='target', multiple='stack')

In [None]:
# Look at the excerpt with index label 0 before any text cleaning.
df.excerpt[0]

In [None]:
import nltk
from nltk.corpus import stopwords
# Fetch the WordNet data used by WordNetLemmatizer in `cleaner` below.
# NOTE(review): `cleaner` also uses the stopwords corpus and nltk.word_tokenize,
# whose data is not downloaded here — presumably preinstalled in this
# environment; confirm.
nltk.download('wordnet')

def cleaner(excerpt):
    """Normalise an excerpt for word-frequency scoring.

    Steps: replace every non-letter character with a space, lower-case,
    tokenise with NLTK, drop English stop words, lemmatise each remaining
    token, and join the result with single spaces.

    Parameters
    ----------
    excerpt : str
        Raw excerpt text.

    Returns
    -------
    str
        Space-separated lemmatised tokens with stop words removed.
    """
    # Build the stop-word set once per call; constructing it inside the
    # comprehension would re-read the corpus for every token.
    stop_words = set(stopwords.words("english"))
    lem = nltk.WordNetLemmatizer()

    tokens = nltk.word_tokenize(re.sub("[^a-zA-Z]", " ", excerpt).lower())
    return " ".join(lem.lemmatize(word) for word in tokens if word not in stop_words)

# Overwrite each excerpt with its cleaned form. `ex_len` was computed earlier
# and therefore still holds the length of the uncleaned text.
df['excerpt'] = df['excerpt'].map(cleaner)

In [None]:
# Same excerpt as before, now after `cleaner` has been applied.
df.excerpt[0]

In [None]:
# Load the external unigram frequency table; later cells use its `word` and
# `count` columns.
wdf = pd.read_csv('../input/english-word-frequency/unigram_freq.csv')
wdf.head()

In [None]:
# Flag rows whose word is NOT an English stop word. The stop-word set is built
# once and tested with a vectorised membership check instead of being rebuilt
# for every row of the frequency table.
english_stopwords = set(stopwords.words("english"))
wdf['ncol'] = ~wdf.word.isin(english_stopwords)

In [None]:
# Keep only the non-stop-word rows. The explicit copy makes `nwdf` an
# independent frame, so adding columns to it later neither raises
# SettingWithCopyWarning nor risks writing into `wdf`.
nwdf = wdf.loc[wdf.ncol].copy()

In [None]:
# Add the lemmatised form of every word. Values are passed through str()
# first, so non-string entries (e.g. NaN) are lemmatised as their string form.
lem1 = nltk.WordNetLemmatizer()
nwdf['lword'] = [lem1.lemmatize(str(entry)) for entry in nwdf.word]

In [None]:
# Inspect the filtered table with its new `lword` column.
nwdf

In [None]:
# Number of distinct values in `word` (unique() counts NaN as a value too).
len(nwdf.word.unique())

In [None]:
# Order the table from the rarest word to the most frequent one.
nwdf = nwdf.sort_values(by='count', ascending=True)

In [None]:
# After the ascending sort, the displayed head and tail show the smallest and
# largest `count` values.
nwdf

In [None]:
# sns.histplot(nwdf['count'])

In [None]:
def change_scale_word_count(old_value, old_min=12711, old_max=1551258643,
                            new_min=1, new_max=100000):
    """Linearly rescale a raw word count from [old_min, old_max] to [new_min, new_max].

    Parameters
    ----------
    old_value : number
        Raw count.
    old_min, old_max : number, optional
        Bounds of the raw counts. The defaults are the constants this notebook
        has always used (presumably the smallest and largest `count` in the
        filtered frequency table — confirm against the data).
    new_min, new_max : number, optional
        Bounds of the output scale; defaults map onto 1..100000.

    Returns
    -------
    float
        ``old_min`` maps to ``new_min`` and ``old_max`` maps to ``new_max``.

    Raises
    ------
    ZeroDivisionError
        For plain Python numbers when ``old_min == old_max``.
    """
    return ((old_value - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min

In [None]:
# Rescale every raw count with change_scale_word_count.
nwdf['scaled_count'] = nwdf['count'].map(change_scale_word_count)

In [None]:
# Inspect the table with its new `scaled_count` column.
nwdf

In [None]:
# Lookup table: word -> scaled frequency. A key that occurs more than once
# keeps the value from its last row.
# NOTE(review): keys come from `word`, not the lemmatised `lword`, while the
# excerpts scored against this table have been lemmatised — confirm intended.
word_freq = {
    word: scaled
    for word, scaled in zip(nwdf['word'], nwdf['scaled_count'])
}

In [None]:
def get_score(excerpt):
    """Sum the `word_freq` values of the space-separated tokens in `excerpt`.

    Tokens that are not keys of the module-level `word_freq` mapping add
    nothing to the total. Returns 0 when no token is found.
    """
    return sum(word_freq.get(token, 0) for token in excerpt.split(' '))

In [None]:
# Score every cleaned excerpt by the summed scaled frequency of its words.
df['excerpt_score'] = df['excerpt'].map(get_score)

In [None]:
# Inspect the frame with the cleaned excerpts and the new `excerpt_score` column.
df

In [None]:
# Left: excerpt_score by row; right: ex_len by row.
plt.figure(figsize=(15, 5))
df.excerpt_score.plot(ax=plt.subplot(1, 2, 1))

df.ex_len.plot(ax=plt.subplot(1, 2, 2))

In [None]:
# Excerpt length against excerpt score. ex_len was measured before the text
# cleaning step; excerpt_score is computed on the cleaned text.
plt.scatter(x=df.ex_len, y=df.excerpt_score)

In [None]:
# Target bucket against excerpt score.
plt.scatter(x=df.target, y=df.excerpt_score)

In [None]:
# Excerpt-score distribution, stacked in turn by target bucket, license label
# and source label (three panels of a 2x2 grid).
plt.figure(figsize=(15, 15))
for position, hue_column in enumerate(('target', 'license', 'url_legal'), start=1):
    plt.subplot(2, 2, position)
    sns.histplot(x=df.excerpt_score, hue=df[hue_column], multiple='stack')