In [None]:

# -*- coding: utf-8 -*-
"""`tldextract` accurately separates the gTLD or ccTLD (generic or country code
top-level domain) from the registered domain and subdomains of a URL.
    >>> import tldextract
    >>> tldextract.extract('http://forums.news.cnn.com/')
    ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
    >>> tldextract.extract('http://forums.bbc.co.uk/') # United Kingdom
    ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
    >>> tldextract.extract('http://www.worldbank.org.kg/') # Kyrgyzstan
    ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg')
`ExtractResult` is a namedtuple, so it's simple to access the parts you want.
    >>> ext = tldextract.extract('http://forums.bbc.co.uk')
    >>> (ext.subdomain, ext.domain, ext.suffix)
    ('forums', 'bbc', 'co.uk')
    >>> # rejoin subdomain and domain
    >>> '.'.join(ext[:2])
    'forums.bbc'
    >>> # a common alias
    >>> ext.registered_domain
    'bbc.co.uk'
Note subdomain and suffix are _optional_. Not all URL-like inputs have a
subdomain or a valid suffix.
    >>> tldextract.extract('google.com')
    ExtractResult(subdomain='', domain='google', suffix='com')
    >>> tldextract.extract('google.notavalidsuffix')
    ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='')
    >>> tldextract.extract('http://127.0.0.1:8080/deployed/')
    ExtractResult(subdomain='', domain='127.0.0.1', suffix='')
If you want to rejoin the whole namedtuple, regardless of whether a subdomain
or suffix were found:
    >>> ext = tldextract.extract('http://127.0.0.1:8080/deployed/')
    >>> # this has unwanted dots
    >>> '.'.join(ext)
    '.127.0.0.1.'
    >>> # join part only if truthy
    >>> '.'.join(part for part in ext if part)
    '127.0.0.1'
"""

import collections
import logging
import os
from functools import wraps

import idna

"""Helpers """
import errno
import hashlib
import json
import logging
import os
import os.path
import sys
from hashlib import md5

from filelock import FileLock

# Logger used by the cache helpers below; named after the defining module.
LOG = logging.getLogger(__name__)

# Flipped to True after the "unable to cache" warning has been logged, so the
# cache code emits that warning at most once per process.
_DID_LOG_UNABLE_TO_CACHE = False


def get_pkg_unique_identifier():
    """
    Build a string identifying this interpreter/environment/tldextract combination.

    The identifier joins, with a double underscore: the interpreter version,
    the basename of ``sys.prefix``, a six-character MD5 digest of the full
    ``sys.prefix`` and ``"tldextract-<version>"``. Using it as a cache
    sub-directory keeps separate virtualenvs and separate tldextract versions
    from sharing cache files.
    """
    try:
        # pylint: disable=import-outside-toplevel
        from tldextract._version import version
    except ImportError:
        # No installed package metadata available.
        version = "dev"

    prefix = sys.prefix
    # Two environments can share a basename; the digest of the whole path
    # tells them apart.
    prefix_digest = hashlib.md5(prefix.encode("utf-8")).hexdigest()[:6]
    # Everything in version_info except the trailing serial number.
    interpreter_version = ".".join(map(str, sys.version_info[:-1]))

    return "__".join(
        (
            interpreter_version,
            os.path.basename(prefix),
            prefix_digest,
            "tldextract-" + version,
        )
    )


def get_cache_dir():
    """
    Pick the directory used for the on-disk suffix-list cache.

    Resolution order:

    1. ``TLDEXTRACT_CACHE`` environment variable, returned verbatim when set.
    2. ``$XDG_CACHE_HOME`` (or ``$HOME/.cache`` when only ``HOME`` is set),
       extended with ``python-tldextract/<package identifier>``, following the
       XDG base directory spec:
       http://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html
    3. A ``.suffix_cache/`` directory next to this module. When the code runs
       somewhere ``__file__`` is not defined (for example pasted into a
       notebook cell), the current working directory is used instead of
       raising ``NameError``.

    Returns:
        str: the chosen cache directory path (not created here).
    """
    cache_dir = os.environ.get("TLDEXTRACT_CACHE", None)
    if cache_dir is not None:
        return cache_dir

    xdg_cache_home = os.getenv("XDG_CACHE_HOME", None)
    if xdg_cache_home is None:
        user_home = os.getenv("HOME", None)
        if user_home:
            xdg_cache_home = os.path.join(user_home, ".cache")

    if xdg_cache_home is not None:
        return os.path.join(xdg_cache_home, "python-tldextract", get_pkg_unique_identifier())

    # Last resort: a directory beside the source file. `__file__` does not
    # exist in interactive/notebook namespaces, so fall back to the cwd there.
    try:
        package_dir = os.path.dirname(__file__)
    except NameError:
        package_dir = os.getcwd()
    return os.path.join(package_dir, ".suffix_cache/")


class DiskCache:
    """Disk cache that only works for JSON-serializable values.

    Each entry lives in ``<cache_dir>/<namespace>/<md5 of repr(key)><file_ext>``.
    A falsy ``cache_dir`` disables the cache: reads raise ``KeyError``, writes
    are skipped and ``run_and_cache`` simply calls the function.
    """

    def __init__(self, cache_dir, lock_timeout=20):
        """
        Args:
            cache_dir: directory for cache files; any falsy value disables caching.
            lock_timeout: timeout handed to ``FileLock`` in ``run_and_cache``.
        """
        self.enabled = bool(cache_dir)
        # Apply the falsy fallback *before* str(): str(None) and str(False)
        # are the non-empty strings "None"/"False", which would otherwise be
        # treated as real relative directory names.
        self.cache_dir = os.path.expanduser(str(cache_dir or ""))
        self.lock_timeout = lock_timeout
        # using a unique extension provides some safety that an incorrectly set cache_dir
        # combined with a call to `.clear()` wont wipe someones hard drive
        self.file_ext = ".tldextract.json"

    def get(self, namespace, key):
        """Retrieve a value from the disk cache.

        Raises:
            KeyError: if the cache is disabled, the entry does not exist, or
                the cache file cannot be read/parsed.
        """
        if not self.enabled:
            raise KeyError("Cache is disabled")
        cache_filepath = self._key_to_cachefile_path(namespace, key)

        if not os.path.isfile(cache_filepath):
            raise KeyError("namespace: " + namespace + " key: " + repr(key))
        try:
            with open(cache_filepath) as cache_file:
                return json.load(cache_file)
        except (OSError, ValueError) as exc:
            LOG.error("error reading TLD cache file %s: %s", cache_filepath, exc)
            raise KeyError(
                "namespace: " + namespace + " key: " + repr(key)
            ) from exc

    def set(self, namespace, key, value):
        """Store a value in the disk cache.

        Returns ``False`` when the cache is disabled and ``None`` otherwise.
        A failure to write is logged (once per process) rather than raised.
        """
        if not self.enabled:
            return False
        cache_filepath = self._key_to_cachefile_path(namespace, key)

        try:
            _make_dir(cache_filepath)
            with open(cache_filepath, "w") as cache_file:
                json.dump(value, cache_file)
        except OSError as ioe:
            self._warn_unable_to_cache(namespace, key, cache_filepath, ioe)

        return None

    def clear(self):
        """Delete every cache file and lock file found under ``cache_dir``."""
        if not self.enabled:
            return
        removable_suffixes = (self.file_ext, self.file_ext + ".lock")
        for root, _, files in os.walk(self.cache_dir):
            for filename in files:
                if filename.endswith(removable_suffixes):
                    try:
                        os.unlink(os.path.join(root, filename))
                    except FileNotFoundError:
                        # Already removed (e.g. by another process); any other
                        # OSError still propagates.
                        pass

    def _key_to_cachefile_path(self, namespace, key):
        """Map a (namespace, key) pair to the path of its cache file."""
        namespace_path = os.path.join(self.cache_dir, namespace)
        hashed_key = _make_cache_key(key)
        return os.path.join(namespace_path, hashed_key + self.file_ext)

    @staticmethod
    def _warn_unable_to_cache(namespace, key, cache_filepath, exc):
        """Log the "unable to cache" warning the first time it is needed."""
        global _DID_LOG_UNABLE_TO_CACHE  # pylint: disable=global-statement
        if _DID_LOG_UNABLE_TO_CACHE:
            return
        LOG.warning(
            (
                "unable to cache %s.%s in %s. This could refresh the "
                "Public Suffix List over HTTP every app startup. "
                "Construct your `TLDExtract` with a writable `cache_dir` or "
                "set `cache_dir=False` to silence this warning. %s"
            ),
            namespace,
            key,
            cache_filepath,
            exc,
        )
        _DID_LOG_UNABLE_TO_CACHE = True

    def run_and_cache(self, func, namespace, kwargs, hashed_argnames):
        """Return ``func(**kwargs)``, reusing a cached result when one exists.

        Args:
            func: callable producing a JSON-serializable result.
            namespace: cache sub-directory for this kind of result.
            kwargs: keyword arguments passed to ``func``.
            hashed_argnames: names of the ``kwargs`` entries that make up the
                cache key; other arguments do not affect which entry is used.
        """
        if not self.enabled:
            return func(**kwargs)

        key_args = {k: v for k, v in kwargs.items() if k in hashed_argnames}
        cache_filepath = self._key_to_cachefile_path(namespace, key_args)
        lock_path = cache_filepath + ".lock"
        try:
            _make_dir(cache_filepath)
        except OSError as ioe:
            self._warn_unable_to_cache(namespace, key_args, cache_filepath, ioe)
            return func(**kwargs)

        with FileLock(lock_path, timeout=self.lock_timeout):
            try:
                result = self.get(namespace=namespace, key=key_args)
            except KeyError:
                result = func(**kwargs)
                # Store under the same namespace that `get` reads from;
                # writing elsewhere means the entry is never found again.
                self.set(namespace=namespace, key=key_args, value=result)

            return result

    def cached_fetch_url(self, session, url, timeout):
        """Fetch ``url`` through ``session``, caching the response text by URL."""
        return self.run_and_cache(
            func=_fetch_url,
            namespace="urls",
            kwargs={"session": session, "url": url, "timeout": timeout},
            hashed_argnames=["url"],
        )


def _fetch_url(session, url, timeout):

    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    text = response.text

    if not isinstance(text, str):
        text = str(text, "utf-8")

    return text


def _make_cache_key(inputs):
    key = repr(inputs)
    try:
        key = md5(key).hexdigest()
    except TypeError:
        key = md5(key.encode("utf8")).hexdigest()
    return key


def _make_dir(filename):
    """Make a directory if it doesn't already exist"""
    if not os.path.exists(os.path.dirname(filename)):
        try:
            os.makedirs(os.path.dirname(filename))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
'tldextract helpers for testing and fetching remote resources.'

import re
import socket

from urllib.parse import scheme_chars


# Matches a whole string that is a dotted-quad IPv4 literal: four groups
# separated by dots, each group 0-255 without leading zeros.
IP_RE = re.compile(
    r'^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$')  # pylint: disable=line-too-long

# Matches an optional "<scheme>:" followed by "//" at the start of a string;
# used to strip the scheme prefix from a URL.
SCHEME_RE = re.compile(r'^([' + scheme_chars + ']+:)?//')


def looks_like_ip(maybe_ip):
    """Does the given str look like an IP address?

    Args:
        maybe_ip: candidate host string.

    Returns:
        bool: ``True`` when ``socket.inet_aton`` accepts the string (or, if
        that call cannot handle the argument type/encoding, when it matches
        ``IP_RE``); ``False`` otherwise, including for empty input.
    """
    # Empty input has no first character to inspect; it is not an IP.
    if not maybe_ip or not maybe_ip[0].isdigit():
        return False

    try:
        socket.inet_aton(maybe_ip)
        return True
    except (AttributeError, UnicodeError):
        # inet_aton could not process the argument itself; fall back to the
        # regex and always hand back a real bool.
        return bool(IP_RE.match(maybe_ip))
    except socket.error:
        return False
"tldextract helpers for testing and fetching remote resources."

import logging
import pkgutil
import re

import requests
from requests_file import FileAdapter

# Logger registered under the fixed name "tldextract".
LOG = logging.getLogger("tldextract")

# Per line (MULTILINE), captures a token made of optional leading '.', '*' or
# '!' characters, then a word character, then everything up to the next
# whitespace. Lines starting with anything else (such as "//") do not match.
PUBLIC_SUFFIX_RE = re.compile(r"^(?P<suffix>[.*!]*\w[\S]*)", re.UNICODE | re.MULTILINE)
# Marker line that separates the public section of the list from the private one.
PUBLIC_PRIVATE_SUFFIX_SEPARATOR = "// ===BEGIN PRIVATE DOMAINS==="


class SuffixListNotFound(LookupError):
    """Raised when no Public Suffix List could be obtained from the given URLs.

    Treated as recoverable: callers can specify backup sources, or use this
    library's bundled snapshot instead.
    """


def find_first_response(cache, urls, cache_fetch_timeout=None):
    """Return the text of the first URL in ``urls`` that can be fetched.

    Each URL is tried in order through ``cache.cached_fetch_url``. A
    ``requests`` error for one URL is logged and the next URL is tried.

    Raises:
        SuffixListNotFound: if no URL yields a response (including when
            ``urls`` is empty).
    """
    with requests.Session() as http:
        # Lets file:// URLs go through the same session as http(s) ones.
        http.mount("file://", FileAdapter())

        for candidate in urls:
            try:
                text = cache.cached_fetch_url(
                    session=http, url=candidate, timeout=cache_fetch_timeout
                )
            except requests.exceptions.RequestException:
                LOG.exception("Exception reading Public Suffix List url %s", candidate)
            else:
                return text

    message = (
        "No Public Suffix List found. Consider using a mirror or constructing "
        "your TLDExtract with `suffix_list_urls=None`."
    )
    raise SuffixListNotFound(message)


def extract_tlds_from_suffix_list(suffix_list_text):
    """Split raw suffix-list text into its public and private suffixes.

    Text before the private-domains marker is the public section; text after
    it is the private section. Returns ``(public_tlds, private_tlds)``, two
    lists of suffix strings in file order.
    """
    public_text, _, private_text = suffix_list_text.partition(
        PUBLIC_PRIVATE_SUFFIX_SEPARATOR
    )

    def _suffixes(section):
        # One entry per line the suffix regex matches.
        return [match.group("suffix") for match in PUBLIC_SUFFIX_RE.finditer(section)]

    return _suffixes(public_text), _suffixes(private_text)


def get_suffix_lists(cache, urls, cache_fetch_timeout, fallback_to_snapshot):
    """Return the parsed suffix lists, going through ``cache.run_and_cache``.

    The cache key is built from ``urls`` and ``fallback_to_snapshot`` only;
    the cache object and the timeout are passed along but not hashed.
    """
    fetch_kwargs = {
        "cache": cache,
        "urls": urls,
        "cache_fetch_timeout": cache_fetch_timeout,
        "fallback_to_snapshot": fallback_to_snapshot,
    }
    return cache.run_and_cache(
        func=_get_suffix_lists,
        namespace="publicsuffix.org-tlds",
        kwargs=fetch_kwargs,
        hashed_argnames=["urls", "fallback_to_snapshot"],
    )


def _get_suffix_lists(cache, urls, cache_fetch_timeout, fallback_to_snapshot):
    """Fetch, parse, and cache the suffix lists"""

    try:
        text = find_first_response(cache, urls, cache_fetch_timeout=cache_fetch_timeout)
    except SuffixListNotFound as exc:
        if fallback_to_snapshot:
            text = pkgutil.get_data("tldextract", ".tld_set_snapshot")
            if not isinstance(text, str):
                text = str(text, "utf-8")
        else:
            raise exc

    public_tlds, private_tlds = extract_tlds_from_suffix_list(text)

    return public_tlds, private_tlds

# Logger registered under the fixed name "tldextract".
LOG = logging.getLogger("tldextract")


# Raw value of the TLDEXTRACT_CACHE_TIMEOUT environment variable: a string
# when set, None otherwise. Used as the default `cache_fetch_timeout`.
CACHE_TIMEOUT = os.environ.get("TLDEXTRACT_CACHE_TIMEOUT")

# Default sources for the Public Suffix List, used as the default
# `suffix_list_urls` of TLDExtract.
PUBLIC_SUFFIX_LIST_URLS = (
    "https://publicsuffix.org/list/public_suffix_list.dat",
    "https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat",
)


class ExtractResult(collections.namedtuple("ExtractResult", "subdomain domain suffix")):
    """Named tuple ``(subdomain, domain, suffix)`` describing a parsed host."""

    # No instance attributes beyond the three tuple fields.
    __slots__ = ()

    @property
    def registered_domain(self):
        """
        The domain and suffix joined by a dot, or '' unless both are set.
        >>> extract('http://forums.bbc.co.uk').registered_domain
        'bbc.co.uk'
        >>> extract('http://localhost:8080').registered_domain
        ''
        """
        if not (self.domain and self.suffix):
            return ""
        return ".".join((self.domain, self.suffix))

    @property
    def fqdn(self):
        """
        The fully qualified domain name, or '' without a proper domain/suffix.
        >>> extract('http://forums.bbc.co.uk/path/to/file').fqdn
        'forums.bbc.co.uk'
        >>> extract('http://localhost:8080').fqdn
        ''
        """
        if not (self.domain and self.suffix):
            return ""
        # Iterating the tuple yields subdomain, domain, suffix; empty parts
        # (a missing subdomain) are skipped.
        return ".".join(part for part in self if part)

    @property
    def ipv4(self):
        """
        The IPv4 address if that is all the presented host consists of, else ''.
        >>> extract('http://127.0.0.1/path/to/file').ipv4
        '127.0.0.1'
        >>> extract('http://127.0.0.1.1/path/to/file').ipv4
        ''
        >>> extract('http://256.1.1.1').ipv4
        ''
        """
        has_other_parts = self.suffix or self.subdomain
        if has_other_parts or not IP_RE.match(self.domain):
            return ""
        return self.domain


class TLDExtract:
    """Callable that splits a URL into subdomain, domain and suffix parts."""

    # TODO: Agreed with Pylint: too-many-arguments
    def __init__(  # pylint: disable=too-many-arguments
        self,
        cache_dir=get_cache_dir(),
        suffix_list_urls=PUBLIC_SUFFIX_LIST_URLS,
        fallback_to_snapshot=True,
        include_psl_private_domains=False,
        extra_suffixes=(),
        cache_fetch_timeout=CACHE_TIMEOUT,
    ):
        """
        Build an extractor.

        `cache_dir` is where parsed suffix data is cached as JSON. Its default
        is whatever `get_cache_dir()` returned when this class was defined.
        Pass False to disable caching.

        `suffix_list_urls` are requested in order, when nothing usable is
        cached, until one returns public suffix list data. The defaults point
        at the Public Suffix List and a mirror; local files can be given with
        the `file://` scheme. Pass something falsy to disable HTTP requests.

        `fallback_to_snapshot` controls what happens when neither the cache
        nor any URL provides data: if True the bundled snapshot is used, if
        False an exception is raised instead.

        `include_psl_private_domains` decides whether the list's "private
        domains" (such as blogspot.com) count as suffixes. They are excluded
        by default.

        `extra_suffixes` are additional suffixes recognised without changing
        the list URLs.

        `cache_fetch_timeout` is passed to the underlying request; see
        http://docs.python-requests.org/en/master/user/advanced/#timeouts
        It defaults to the TLDEXTRACT_CACHE_TIMEOUT environment variable,
        e.g. TLDEXTRACT_CACHE_TIMEOUT="1.2". A string value is converted to a
        single float, which then applies to both connect and read timeouts.

        Raises ValueError when the arguments leave no way to obtain suffix
        data (no URLs, no cache directory and no snapshot fallback).
        """
        stripped_urls = (url.strip() for url in (suffix_list_urls or ()))
        self.suffix_list_urls = tuple(url for url in stripped_urls if url)

        self.fallback_to_snapshot = fallback_to_snapshot
        has_data_source = (
            self.suffix_list_urls or cache_dir or self.fallback_to_snapshot
        )
        if not has_data_source:
            raise ValueError(
                "The arguments you have provided disable all ways for tldextract "
                "to obtain data. Please provide a suffix list data, a cache_dir, "
                "or set `fallback_to_snapshot` to `True`."
            )

        self.include_psl_private_domains = include_psl_private_domains
        self.extra_suffixes = extra_suffixes
        self._extractor = None

        self._cache = DiskCache(cache_dir)
        # The environment variable arrives as text; requests wants a number.
        if isinstance(cache_fetch_timeout, str):
            cache_fetch_timeout = float(cache_fetch_timeout)
        self.cache_fetch_timeout = cache_fetch_timeout

    def __call__(self, url, include_psl_private_domains=None):
        """
        Takes a string URL and splits it into its subdomain, domain, and
        suffix (effective TLD, gTLD, ccTLD, etc.) component.
        >>> extract = TLDExtract()
        >>> extract('http://forums.news.cnn.com/')
        ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
        >>> extract('http://forums.bbc.co.uk/')
        ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
        """
        # Peel the URL down to its host: drop the scheme, then path, query
        # and fragment, then any userinfo, then the port.
        without_scheme = SCHEME_RE.sub("", url)
        authority = (
            without_scheme.partition("/")[0].partition("?")[0].partition("#")[0]
        )
        host_and_port = authority.split("@")[-1]
        netloc = host_and_port.partition(":")[0].strip().rstrip(".")

        labels = netloc.split(".")

        # Suffix lookup works on lower-cased, punycode-decoded labels; the
        # result is sliced from the labels as they appeared in the input.
        decoded_labels = [_decode_punycode(label) for label in labels]
        suffix_index = self._get_tld_extractor().suffix_index(
            decoded_labels, include_psl_private_domains=include_psl_private_domains
        )

        suffix = ".".join(labels[suffix_index:])
        if not suffix and netloc and looks_like_ip(netloc):
            return ExtractResult("", netloc, "")

        if not suffix_index:
            # The whole host is a suffix; nothing is left for domain/subdomain.
            return ExtractResult("", "", suffix)

        domain_position = suffix_index - 1
        return ExtractResult(
            ".".join(labels[:domain_position]), labels[domain_position], suffix
        )

    def update(self, fetch_now=False):
        """Force fetch the latest suffix list definitions."""
        # Drop both the in-memory extractor and the on-disk cache; the data is
        # re-obtained on next use, or immediately when fetch_now is set.
        self._extractor = None
        self._cache.clear()
        if fetch_now:
            self._get_tld_extractor()

    @property
    def tlds(self):
        """
        Returns the list of tld's used by default
        This will vary based on `include_psl_private_domains` and `extra_suffixes`
        """
        extractor = self._get_tld_extractor()
        return list(extractor.tlds())

    def _get_tld_extractor(self):
        """Get or compute this object's TLDExtractor. Looks up the TLDExtractor
        in roughly the following order, based on the settings passed to
        __init__:
        1. Memoized on `self`
        2. Local system _cache file
        3. Remote PSL, over HTTP
        4. Bundled PSL snapshot file"""
        if self._extractor:
            return self._extractor

        public_tlds, private_tlds = get_suffix_lists(
            cache=self._cache,
            urls=self.suffix_list_urls,
            cache_fetch_timeout=self.cache_fetch_timeout,
            fallback_to_snapshot=self.fallback_to_snapshot,
        )

        if not (public_tlds or private_tlds or self.extra_suffixes):
            raise ValueError("No tlds set. Cannot proceed without tlds.")

        self._extractor = _PublicSuffixListTLDExtractor(
            public_tlds=public_tlds,
            private_tlds=private_tlds,
            extra_tlds=list(self.extra_suffixes),
            include_psl_private_domains=self.include_psl_private_domains,
        )
        return self._extractor


# Shared extractor built with every default setting; the module-level
# `extract` and `update` functions below delegate to it.
TLD_EXTRACTOR = TLDExtract()


# `wraps` copies the metadata (including the docstring) of the wrapped method
# onto this function.
@wraps(TLD_EXTRACTOR.__call__)
def extract(
    url, include_psl_private_domains=False
):  # pylint: disable=missing-function-docstring
    # Forwards to the shared extractor instance.
    return TLD_EXTRACTOR(url, include_psl_private_domains=include_psl_private_domains)


@wraps(TLD_EXTRACTOR.update)
def update(*args, **kwargs):  # pylint: disable=missing-function-docstring
    # Forwards every argument to the shared extractor's `update`.
    return TLD_EXTRACTOR.update(*args, **kwargs)


class _PublicSuffixListTLDExtractor:
    """Wrapper around this project's main algo for PSL
    lookups.
    """

    def __init__(
        self, public_tlds, private_tlds, extra_tlds, include_psl_private_domains=False
    ):
        # set the default value
        self.include_psl_private_domains = include_psl_private_domains
        self.public_tlds = public_tlds
        self.private_tlds = private_tlds
        self.tlds_incl_private = frozenset(public_tlds + private_tlds + extra_tlds)
        self.tlds_excl_private = frozenset(public_tlds + extra_tlds)

    def tlds(self, include_psl_private_domains=None):
        """Get the currently filtered list of suffixes."""
        if include_psl_private_domains is None:
            include_psl_private_domains = self.include_psl_private_domains

        return (
            self.tlds_incl_private
            if include_psl_private_domains
            else self.tlds_excl_private
        )

    def suffix_index(self, lower_spl, include_psl_private_domains=None):
        """Returns the index of the first suffix label.
        Returns len(spl) if no suffix is found
        """
        tlds = self.tlds(include_psl_private_domains)
        length = len(lower_spl)
        for i in range(length):
            maybe_tld = ".".join(lower_spl[i:])
            exception_tld = "!" + maybe_tld
            if exception_tld in tlds:
                return i + 1

            if maybe_tld in tlds:
                return i

            wildcard_tld = "*." + ".".join(lower_spl[i + 1 :])
            if wildcard_tld in tlds:
                return i

        return length


def _decode_punycode(label):
    lowered = label.lower()
    looks_like_puny = lowered.startswith("xn--")
    if looks_like_puny:
        try:
            return idna.decode(label.encode("ascii")).lower()
        except (UnicodeError, IndexError):
            pass
    return lowered

In [None]:
# Importing Packages
import pandas as pd
import numpy as np
import spacy
import sys
sys.path = [
    '../input/readability-package',
] + sys.path
import readability
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag, pos_tag_sents
from urllib.parse import urlparse
import re
from tldextract import extract

from sklearn import metrics, preprocessing, model_selection
import lightgbm as lgb

In [None]:
# Reading Data
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

In [None]:
train.head()

In [None]:
train.shape

In [None]:
test.head()

In [None]:
test.shape

In [None]:
train.describe(include='all')

# Feature Engineering

## Readability feature

In [None]:
# Taken from https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline
def readability_measurements(passage: str):
    """
    Compute readability-library features for one passage.

    Calls ``readability.getmeasures(passage, lang='en')`` and returns 24 of its
    values as a flat list, in this order: three text statistics from
    'sentence info', nine scores from 'readability grades', six counts from
    'word usage' and six counts from 'sentence beginnings'.
    """
    results = readability.getmeasures(passage, lang='en')

    # (section, keys) pairs; the flattened order is the order of the output.
    wanted = (
        ('sentence info',
         ('characters_per_word', 'syll_per_word', 'words_per_sentence')),
        ('readability grades',
         ('Kincaid', 'ARI', 'Coleman-Liau', 'FleschReadingEase', 'GunningFogIndex',
          'LIX', 'SMOGIndex', 'RIX', 'DaleChallIndex')),
        ('word usage',
         ('tobeverb', 'auxverb', 'conjunction', 'pronoun', 'preposition',
          'nominalization')),
        ('sentence beginnings',
         ('pronoun', 'interrogative', 'article', 'subordination', 'conjunction',
          'preposition')),
    )

    return [results[section][key] for section, keys in wanted for key in keys]

## spacy feature

In [None]:
# Taken from https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline
def spacy_features(df: pd.DataFrame):
    """
    Build one spaCy document vector per excerpt using en_core_web_lg.

    The model is loaded on every call. Returns a numpy array with one row per
    entry of ``df.excerpt``.

    Resources this approach was learned from:
    https://www.kaggle.com/konradb/linear-baseline-with-cv
    https://www.kaggle.com/anaverageengineer/comlrp-baseline-for-complete-beginners
    """
    nlp = spacy.load('en_core_web_lg')
    with nlp.disable_pipes():
        doc_vectors = [nlp(excerpt).vector for excerpt in df.excerpt]
        vectors = np.array(doc_vectors)

    return vectors

def get_spacy_col_names():
    """Return the 300 column names ``spacy_0`` ... ``spacy_299``."""
    return [f"spacy_{i}" for i in range(300)]

## pos tag features

In [None]:
# Taken from https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline
def pos_tag_features(passage: str):
    """
    Count how often each tracked part-of-speech tag occurs in an excerpt.

    The passage is tokenized and tagged; the result is a list of 31 counts,
    one per tag in ``tracked_tags`` and in that order. Tags outside that list
    are not counted.
    """
    tracked_tags = ("CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
                    "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH",
                    "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB")

    tagged_tokens = pos_tag(word_tokenize(passage))
    # Only the tag (second element of each pair) matters for counting.
    observed_tags = [pair[1] for pair in tagged_tokens]

    return [observed_tags.count(tag) for tag in tracked_tags]

## Others

In [None]:
# Taken from https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline
def generate_other_features(passage: str):
    """
    This function is where I test miscellaneous features
    This is experimental
    """
    # punctuation count
    periods = passage.count(".")
    commas = passage.count(",")
    semis = passage.count(";")
    exclaims = passage.count("!")
    questions = passage.count("?")
    
    # Some other stats
    num_char = len(passage)
    num_words = len(passage.split(" "))
    unique_words = len(set(passage.split(" ") ))
    word_diversity = unique_words/num_words
    
    word_len = [len(w) for w in passage.split(" ")]
    longest_word = np.max(word_len)
    avg_len_word = np.mean(word_len)
    
    return [periods, commas, semis, exclaims, questions,
            num_char, num_words, unique_words, word_diversity,
            longest_word, avg_len_word]

## Combine All

In [None]:
def extract_features(df):
    """
    Append every text feature group to ``df`` and return the widened frame.

    For each row's ``excerpt`` this adds, in order: the readability
    measurements, the 300 spaCy vector components, the part-of-speech tag
    counts and the miscellaneous surface statistics.

    Each feature frame is built on ``df.index`` before being joined on index,
    so rows stay aligned even when ``df`` does not carry a default 0..n-1
    index (for example after filtering or a train/validation split).

    Args:
        df: frame with an ``excerpt`` column.

    Returns:
        A new DataFrame: the input columns followed by the feature columns.
    """
    row_index = df.index
    excerpts = df["excerpt"]

    scores_df = pd.DataFrame(
        excerpts.apply(readability_measurements).tolist(),
        columns=["chars_per_word", "syll_per_word", "words_per_sent",
                 "kincaid", "ari", "coleman_liau", "flesch", "gunning_fog", "lix", "smog", "rix", "dale_chall",
                 "tobeverb", "auxverb", "conjunction", "pronoun", "preposition", "nominalization",
                 "pronoun_b", "interrogative", "article", "subordination", "conjunction_b", "preposition_b"],
        index=row_index,
    )
    df = pd.merge(df, scores_df, left_index=True, right_index=True)

    spacy_df = pd.DataFrame(spacy_features(df), columns=get_spacy_col_names(), index=row_index)
    df = pd.merge(df, spacy_df, left_index=True, right_index=True)

    pos_df = pd.DataFrame(
        excerpts.apply(pos_tag_features).tolist(),
        columns=["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
                 "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH",
                 "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB"],
        index=row_index,
    )
    df = pd.merge(df, pos_df, left_index=True, right_index=True)

    other_df = pd.DataFrame(
        excerpts.apply(generate_other_features).tolist(),
        columns=["periods", "commas", "semis", "exclaims", "questions",
                 "num_char", "num_words", "unique_words", "word_diversity",
                 "longest_word", "avg_len_word"],
        index=row_index,
    )
    df = pd.merge(df, other_df, left_index=True, right_index=True)

    return df

# Runs the whole feature pipeline on `test`. The returned frame is not
# assigned to a name here, so this call does not change `test` itself.
extract_features(test)

## URL Features

In [None]:
from urllib.parse import urlparse
import re
from tldextract import extract

In [None]:
def extract_url_license_feat(df):
    """Derive URL and license features and append them as new columns of ``df``.

    From ``url_legal``:
        * ``article_year`` - first ``2ddd`` run found in the URL path, ``''`` if none
        * ``subdomain`` / ``domain`` / ``suffix`` - items 0, 1 and 2 of ``extract(url)``
        * ``is_pdf`` - 1 if ``'.pdf'`` occurs in the URL, else 0

    From ``license``:
        * ``is_cc``, ``is_by``, ``is_sa``, ``is_nc``, ``is_nd``, ``is_gnu`` - 0/1
          case-sensitive substring flags
        * ``license_version`` - first ``d.d`` pattern, ``''`` if none

    Missing inputs are detected with ``pd.isna`` (so ``np.nan``, any other float
    NaN and ``None`` are all handled); they stay missing in the string-valued
    features and give 0 for every flag.

    Returns a new DataFrame: ``df`` with the feature columns concatenated on the right.
    """
    # Raw strings keep the regex escapes from being read as string escapes.
    year_pattern = re.compile(r'(2\d{3})|$')
    version_pattern = re.compile(r'([0-9][.][0-9])|$')

    def _url_fields(url):
        # A missing URL propagates unchanged into all four URL-derived features.
        if pd.isna(url):
            return url, url, url, url
        parts = extract(url)  # one lookup per URL instead of one per output column
        year = year_pattern.search(urlparse(url).path).group()
        return year, parts[0], parts[1], parts[2]

    def _license_version(text):
        if pd.isna(text):
            return text
        return version_pattern.search(urlparse(text).path).group()

    temp = pd.DataFrame(
        [_url_fields(url) for url in df['url_legal']],
        columns=['article_year', 'subdomain', 'domain', 'suffix'],
        index=df.index,
    )
    # str() turns a missing value into text that contains none of the tokens,
    # so missing rows get 0 here and in the license flags below.
    temp['is_pdf'] = df['url_legal'].apply(lambda x: 1 if '.pdf' in str(x) else 0)

    license_flags = [
        ('is_cc', 'CC'),
        ('is_by', 'BY'),
        ('is_sa', 'SA'),
        ('is_nc', 'NC'),
        ('is_nd', 'ND'),
        ('is_gnu', 'GNU'),
    ]
    for column, token in license_flags:
        temp[column] = df['license'].apply(lambda x, token=token: 1 if token in str(x) else 0)
    temp['license_version'] = df['license'].apply(_license_version)

    return pd.concat([df, temp], axis=1)

# Extract Features

In [None]:
# Build the training feature table: text features first, then URL/license features.
train_feat = extract_url_license_feat(extract_features(train))
train_feat.head()

In [None]:
# Build the test feature table with the same two-stage pipeline as the training data.
test_feat = extract_url_license_feat(extract_features(test))
test_feat.head()

# Rebind both feature tables to the copies stored as CSV under ../input/features.
train_feat = pd.read_csv(filepath_or_buffer='../input/features/train_feat.csv')
test_feat = pd.read_csv(filepath_or_buffer='../input/features/test_feat.csv')

# Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Columns left out of label encoding and out of the model's feature matrices.
ignore_cols = [
    'id',
    'url_legal',
    'license',
    'excerpt',
    'standard_error',
    'target',
]

In [None]:
# Label-encode every text column that is not in `ignore_cols` so it becomes an
# integer code. The encoder is fitted on the union of the train and test values:
# fitting on train alone makes `transform` raise ValueError for any category
# that occurs only in the test frame.
for col in train_feat.select_dtypes('object').columns.tolist():
    if col in ignore_cols:
        continue
    # Missing values become their own category (the empty string).
    train_feat[col] = train_feat[col].replace(np.nan, '')
    test_feat[col] = test_feat[col].replace(np.nan, '')
    lbl = LabelEncoder()
    lbl.fit(pd.concat([train_feat[col], test_feat[col]], ignore_index=True))
    train_feat[col] = lbl.transform(train_feat[col])
    test_feat[col] = lbl.transform(test_feat[col])

In [None]:
# Preview the first rows of the training frame after label encoding.
train_feat.head()

In [None]:
# Select the model inputs. The feature list is derived once from the training
# frame and reused for the test frame, so both matrices have the same columns
# in the same order; a feature missing from the test frame fails loudly with a
# KeyError here instead of silently producing a misaligned matrix.
feature_cols = [col for col in train_feat.columns if col not in ignore_cols]
X_train = train_feat[feature_cols]
y_train = train_feat['target']
test_X = test_feat[feature_cols]

In [None]:
# Columns that exist in the training frame but not in the test frame.
[name for name in train_feat.columns if name not in test_feat.columns]

In [None]:
# Print the shapes of the train and test feature matrices, one per line.
print(X_train.shape, test_X.shape, sep="\n")

# Standard Error Weighting

In [None]:
# Inverse of the per-row standard error; np.inf results are reset to 0.
std_error = (1 / train_feat['standard_error']).replace(np.inf, 0)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale the inverse-error values; the scaler is fitted on them as a
# single-column 2-D array.
scaler = MinMaxScaler()
std_error = scaler.fit_transform(std_error.to_numpy().reshape(-1, 1))

In [None]:
# Flatten the scaled column back to one dimension and display it.
std_error = std_error.ravel()
std_error

# Model Build

### LightGBM Regression

In [None]:
from sklearn import metrics, preprocessing, model_selection
import lightgbm as lgb

In [None]:
def runLGB_reg(train_X, train_y, test_X, sample_weight, test_y=None, test_X2=None, dep=8, seed=0, data_leaf=50, rounds=20000):
    """Train a LightGBM regressor and predict on up to two feature matrices.

    Args:
        train_X, train_y: training features and targets.
        test_X: features to predict; also used as the validation set when
            ``test_y`` is given.
        sample_weight: per-row training weights, attached to the training
            ``lgb.Dataset`` through its ``weight`` argument. ``None`` trains
            unweighted.
        test_y: optional targets for ``test_X``. When given, training uses
            ``test_X`` as validation data with ``early_stopping_rounds=200``.
        test_X2: optional second feature matrix to predict.
        dep: value for ``max_depth``.
        seed: value for ``feature_fraction_seed`` and ``bagging_seed``.
        data_leaf: value for ``min_data_in_leaf``.
        rounds: number of boosting rounds passed to ``lgb.train``.

    Returns:
        ``(model, loss, pred_test_y, pred_test_y2)``. ``loss`` is the RMSE of
        ``pred_test_y`` against ``test_y`` (0 when ``test_y`` is None);
        ``pred_test_y2`` is None when ``test_X2`` is None.
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "max_depth": dep,
        "num_leaves": 30,
        "min_data_in_leaf": data_leaf,
        "learning_rate": 0.01,
        "bagging_fraction": 0.8,
        "feature_fraction": 0.2,
        "feature_fraction_seed": seed,
        "bagging_freq": 1,
        "bagging_seed": seed,
        "lambda_l2": 3,
        "lambda_l1": 3,
        "verbosity": -1,
    }

    # The weights were previously accepted but never used; they belong on the
    # training Dataset rather than in `params`.
    lgtrain = lgb.Dataset(train_X, label=train_y, weight=sample_weight)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, rounds, valid_sets=[lgtest], early_stopping_rounds=200, verbose_eval=500)
    else:
        # No validation targets: train for the full number of rounds.
        # (The old `lgb.DMatrix` call here was an xgboost name and raised AttributeError.)
        model = lgb.train(params, lgtrain, rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    # The second matrix is optional (default None), so only predict when it was supplied.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    loss = 0
    if test_y is not None:
        loss = np.sqrt(metrics.mean_squared_error(test_y, pred_test_y))
        print(loss)
    return model, loss, pred_test_y, pred_test_y2

In [None]:
# 5-fold cross-validation of a small LightGBM ensemble.
# Per fold: train every configuration in `lgb_configs`, average their
# validation and test predictions, store the out-of-fold predictions in
# `pred_train`, and add 1/n_splits of the fold's test prediction to
# `pred_test_full`.
print("Building model..")
cv_scores = []
pred_test_full = 0
pred_train = np.zeros(X_train.shape[0])
n_splits = 5
kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=7988)
model_name = "lgb"

# One entry per ensemble member (keyword arguments for runLGB_reg).
lgb_configs = [
    {"dep": 6, "data_leaf": 200, "seed": 2019},
    {"dep": 7, "data_leaf": 180, "seed": 9873},
]

for dev_index, val_index in kf.split(X_train, y_train):
    dev_X, val_X = X_train.iloc[dev_index, :], X_train.iloc[val_index, :]
    # KFold yields positions, so the targets must be indexed with .iloc too;
    # plain y_train[...] is a label lookup and only matches for a default index.
    dev_y, val_y = y_train.iloc[dev_index], y_train.iloc[val_index]
    std_error_x = std_error[dev_index]

    pred_val = 0
    pred_test = 0
    for config in lgb_configs:
        model, loss, pred_v, pred_t = runLGB_reg(dev_X, dev_y, val_X, std_error_x, val_y, test_X, **config)
        pred_val += pred_v
        pred_test += pred_t

    n_models = len(lgb_configs)
    pred_val /= n_models
    pred_test /= n_models

    # Fold score of the averaged ensemble (not of an individual model).
    loss = np.sqrt(metrics.mean_squared_error(val_y, pred_val))

    pred_train[val_index] = pred_val
    pred_test_full += pred_test / n_splits
    cv_scores.append(loss)
    print(cv_scores)
print(np.mean(cv_scores))

In [None]:
# NOTE: `pred_test` is reset at the start of every CV fold, so what is shown
# here is the model-averaged test prediction of the last fold only; the
# average over all folds is accumulated in `pred_test_full`.
pred_test

In [None]:
# Same display as the previous cell: the last fold's test prediction
# (the cross-fold average is `pred_test_full`).
pred_test

In [None]:
# Submit the test prediction averaged over all CV folds (`pred_test_full`).
# `pred_test` is overwritten in every fold and therefore only holds the last
# fold's output; submitting it would discard the other folds' models.
submission_df = pd.DataFrame({'id': test.id, 'target': pred_test_full})
submission_df

In [None]:
# Write the submission to submission.csv without the DataFrame index column.
submission_df.to_csv('submission.csv', index = False)