## L2 Language Learner Classification 

Building a classifier to distinguish English text written by Lang-8 users whose native language (L1) is another European language (French and Spanish) from those written by L1 speakers of East Asian languages (Japanese, Korean, and Mandarin Chinese).

### Importing Libraries

In [1]:
from zipfile import ZipFile
from bs4 import BeautifulSoup

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk

nltk.download('words')
nltk.download("cmudict")

from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.corpus import cmudict

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

import string

[nltk_data] Downloading package words to
[nltk_data]     /Users/snehajhaveri/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/snehajhaveri/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


### Text Extraction

In [2]:
def read_file_from_zip(path="data/lang-8.zip"):
    """
    A generator function which reads html documents
    as raw text from the zip file

    Parameters
    ----------
    path : string
        path to the zip file

    Returns
    -------
    A dictionary of filename and raw text extracted
    from the file
    """
    archive = ZipFile(path, "r")

    for file in archive.namelist()[1:]:
        yield {
            "filename": file,#.removeprefix("lang-8/"),
            "data": archive.read(file)
        }

In [3]:
def extract_data_from_file(path="data/lang-8.zip"):
    """
    A generator function which reads html files from zip
    and extracts text and native language from the raw
    text

    Parameters
    ----------
    path : string
        path to the zip file

    Returns
    -------
    A dictionary of extracted content and native language
    of the author
    """
    for data_dict in read_file_from_zip(path):
        soup = BeautifulSoup(data_dict["data"])

#         author = soup.find_all("p", attrs={"class": "spaced"})[1].get_text().strip()
        native_lang = soup.find("li", attrs={"data-title": "Native language"}).get_text().strip()
        filename = data_dict["filename"]
        text = soup.find("div", attrs={"id": "body_show_ori"}).get_text().strip()

        preprocessed_data = {
            "text": text,
#             "author": author,
            "native_lang": native_lang,
            "filename": filename
        }

        yield preprocessed_data

### Feature Extraction

#### Text Length

In [4]:
def get_text_length(text):
    """
    Returns the number of words in a text without punctuations. 
    Counts clitics as separate words.

    Parameters
    ----------
    text : str
        A text from which we find the number of words

    Returns
    -------
    An int which represents the number of words in the text
    """
    non_punc = []
    for word in word_tokenize(text):
        if word not in string.punctuation:
            non_punc.append(word)
    return len(non_punc)

#### Lexical Density

In [5]:
def get_lexical_density(text):
    """
    Returns the lexical density of a text. That is the ratio of open class words.
    in the text

    Parameters
    ----------
    text : str
        A text from which we find the lexical density

    Returns
    -------
    A float which represents the lexical density
    """
    open_class_prefix = {"N", "V", "J", "R"}
    open_class_total = 0
    word_count = 0
    if len(text) == 0:
        return float(0)
    for word, pos in pos_tag(word_tokenize(text)):
        if word not in string.punctuation:
            word_count += 1
            if pos[0] in open_class_prefix:
                open_class_total += 1
    return open_class_total/word_count

#### Average Sentence Length

In [6]:
def get_average_sentence_length(text):
    """
    Returns the average sentence length of a text. Does not count punctuations and counts clitics.

    Parameters
    ----------
    text : str
        A text from which we find the average sentence length

    Returns
    -------
    A float which represents the average sentence length
    """
    if len(text) == 0:
        return float(0)
    sent_lengths = 0
    for sentence in sent_tokenize(text):
        word_count = 0
        for word in word_tokenize(sentence):
            if word not in string.punctuation:
                word_count += 1
        sent_lengths += word_count
    return sent_lengths/len(sent_tokenize(text))

#### Average Word Length

In [7]:
def get_average_word_length(text):
    """
    Returns the average sentence length of a text. Does not count punctuations 
    and counts clitics.

    Parameters
    ----------
    text : str
        A text from which we find the average sentence length

    Returns
    -------
    A float which represents the average sentence length
    """
    if len(text) == 0:
        return float(0)
    word_count = 0
    lengths_sum = 0
    for word in word_tokenize(text):
        if word not in string.punctuation:
            lengths_sum += len(word)
            word_count += 1
    return lengths_sum/word_count

In [8]:
s0 = ""
s1 = """I went to the park today. 
I love going there because I always have so much fun. 
I invited some friends but they didn't come. 
That's fine because I met a new person there. 
He had a dog.
""" #40, 
s2 = "I have so much work to do today. I am stressed" #11

# get_text_length
assert type(get_text_length(s0)) == int, "Must be an interger"
assert get_text_length(s0) == 0, "Empty string must return 0"
assert get_text_length(s1) == 40, "s1 has 40 words"
assert get_text_length(s2) == 11, "s2 has 11 words"
print("get_text_length tests pass")

assert type(get_lexical_density(s0))== float, "Must be a float"
assert get_lexical_density(s0) == 0, "Empty string must return 0"
assert get_lexical_density(s1) == 24/40, "24 open class words out of 40"
assert get_lexical_density(s2) == 8/11, "8 open class words out of 40"
print("get_lexical_density tests pass")

assert type(get_average_sentence_length(s0)) == float, "Must be a float"
assert get_average_sentence_length(s0) == 0, "Empty string must return 0"
assert get_average_sentence_length(s1) == 40/5, "40 words over the span of 5 sentences"
assert get_average_sentence_length(s2) == 11/2, "11 words over the span 2 sentences"
print("get_average_sentence_length tests pass")

assert type(get_average_word_length(s0)) == float, "Must be a float"
assert get_average_word_length(s0) == 0, "Empty string must return 0"
assert get_average_word_length(s1) == 142/40, "142 total characters spread across 40 words"
assert get_average_word_length(s2) == 35/11, "35 character spread across 11 words"
print("get_average_word_length tests pass")

get_text_length tests pass
get_lexical_density tests pass
get_average_sentence_length tests pass
get_average_word_length tests pass


#### Part of Speech (POS) Count

In [9]:
def get_pos_count(text):
    """
    Counts the number of nouns, verbs and adjectives in a text.

    Parameters
    ----------
    text : str
        A text for which we find the number of nouns, verbs
        and adjectives

    Returns
    -------
    A tuple of (noun_count: int, verb_count: int, adj_count: int)
    which represents the number of nouns, verbs adjectives in the text
    respectively
    """
    noun_count = 0
    verb_count = 0
    adj_count = 0

    if len(text) == 0:
        return 0, 0, 0

    for word, pos in pos_tag(word_tokenize(text)):
        if(pos[0] == 'N'):
            noun_count += 1
        if(pos[0] == 'V'):
            verb_count += 1
        if(pos == 'JJ'):
            adj_count += 1
    return noun_count, verb_count, adj_count

In [10]:
s1 = """I went to the park today. 
I love going there because I always have so much fun. 
I invited some friends but they didn't come. 
That's fine because I met a new person there. 
He had a dog."""

s2 = """Chelsea English School is offering a Summer School Program in Iwaki, Fukushima, a holiday learning experience combining enjoyment of the area's natural beauty and practical lifestyle immersion in the agricultural traditions of this part of Japan.
We will be hosted by "Namakiba" farm, an agricultural concern run by an Iwaki City cooperative, and activities include handson experience of organic farming,barbecues, local nature sightseeing including swimming in the river and the sea,the local fish market, guesthouses with onsens (hot spas) . The program promises new and fresh experiences in both nature and culture, and time will also be made available for gift shopping. Non-Japanese speakers are also warmly invited, as simultaneous translation into English will be available throughout the e trip."""

assert get_pos_count(s1) == (6, 10, 3)
assert get_pos_count(s2) == (47, 17, 16)

print("get_pos_count tests pass")

get_pos_count tests pass


#### Out of Vocabulary Words

In [11]:
def get_num_ovv_words(text):
    """
    Gets the number of out-of-vocabulary words in a text.

    Parameters
    ----------
    text : str
        A text for which we find the number of out-of-vocabulary
        words is to be found

    Returns
    -------
    The number of oov words in the text
    """
    text_vocab = set(w.lower() for w in text.split() if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    ovv_words = text_vocab - english_vocab

    return len(ovv_words)

In [12]:
s0 = ""
s1 = """ I haddd to leaasve earliae since yesterday was so tired.
And then I met you.
""" 
s2 = "I have so much work to do today. I am stressseed"
assert type(get_num_ovv_words(s0)) == int, "Must be an interger"
assert get_num_ovv_words(s0) == 0, "Empty string must return 0"
assert get_num_ovv_words(s1) == 3, "s1 has 3 words out of vaocab"
assert get_num_ovv_words(s2) == 1, "s2 has 1 words out of vocab"
print("get_num_ovv_words tests pass")

get_num_ovv_words tests pass


#### Reading Ease

In [13]:

# Code adapted from lab

vowels = {"a","e","i","o","u","y"}
p_dict = cmudict.dict()

def get_reading_ease(text):
    """Returns the reading ease for a text.

    Parameters
    ----------
    text : str
       A text for which we find the reading ease.

    Returns
    -------
    reading_ease : float
        The reading ease for the text
    """
    syllable_count = 0
    word_count = 0

    for word in word_tokenize(text):
        if word not in string.punctuation:
            word_count += 1
            if word in p_dict:
                for pron in p_dict[word][0]:
                    if pron[-1] in ['0','1','2']:
                        syllable_count +=1
            else:
                for j in range(0,len(word)):
                    if word[j].lower() in vowels:
                         syllable_count= syllable_count+1

    reading_ease = (206.835 - (1.015*(word_count/len(sent_tokenize(text))))- (84.6*(syllable_count/word_count)))
    return reading_ease

In [14]:
assert 100 < get_reading_ease("I am done, man") < 140
assert -60 < get_reading_ease("Felicitations for achieving a thoroughly excellent resolution to an altogether indombidable conundrum of humongous proportions.") <-20
print("get_reading_ease tests pass")

get_reading_ease tests pass


#### Punctuation Counts

In [15]:
def get_punctuations_count(text):
    """
    Returns the number of punctuations in a text.

    Parameters
    ----------
    text : str
        A text for which we find the number of punctuations present

    Returns
    -------
    punct_count: int
                 An integer which represents the number of punctuations in the text
    """
    punct_count = 0
    if len(text) == 0:
        return 0
    for word in word_tokenize(text):
        if word in string.punctuation:
            punct_count += 1
    return punct_count

In [16]:
s1 = """I went to the park today. 
I love going there because I always have so much fun. 
I invited some friends but they didn't come. 
That's fine because I met a new person there. 
He had a dog."""

s2 = """Chelsea English School is offering a Summer School Program in Iwaki, Fukushima, a holiday learning experience combining enjoyment of the area's natural beauty and practical lifestyle immersion in the agricultural traditions of this part of Japan.
We will be hosted by "Namakiba" farm, an agricultural concern run by an Iwaki City cooperative, and activities include handson experience of organic farming,barbecues, local nature sightseeing including swimming in the river and the sea,the local fish market, guesthouses with onsens (hot spas) . The program promises new and fresh experiences in both nature and culture, and time will also be made available for gift shopping. Non-Japanese speakers are also warmly invited, as simultaneous translation into English will be available throughout the e trip."""


assert get_punctuations_count(s1) == 5
assert get_punctuations_count(s2) == 16

print("get_punctuations_count tests pass")

get_punctuations_count tests pass


#### Type Token Ratio

In [19]:
def get_type_token_ratio(text):
    """
    Calculate type-token ratio from the text using the first
    num_words tokens

    Parameters
    ----------
    text : str
        A text for which we find the type-token ratio

    Returns
    -------
    type_token_ratio: int
                    An integer which represents the type token ratio for a given text
    """
    words = text.split()
    num_words = 100
    type_set = set(word.lower() for word in words[:num_words])
    return len(type_set) / num_words

In [20]:
s1 = """Chelsea English School is offering a Summer School Program in Iwaki, Fukushima, a holiday learning experience combining enjoyment of the area's natural beauty and practical lifestyle immersion in the agricultural traditions of this part of Japan.
We will be hosted by "Namakiba" farm, an agricultural concern run by an Iwaki City cooperative, and activities include handson experience of organic farming,barbecues, local nature sightseeing including swimming in the river and the sea,the local fish market, guesthouses with onsens (hot spas) . The program promises new and fresh experiences in both nature and culture, and time will also be made available for gift shopping. Non-Japanese speakers are also warmly invited, as simultaneous translation into English will be available throughout the e trip."""

s2 = """I'd like to acquire this skill, however it doesn't really fit into my schedule right now. I'm still practicing a little bit every day.Well, I'll attend my exam within 8 weeks and a few days, but I've still many things to learn. That's what we call a challenge, so I find it quite interesting."""
assert get_type_token_ratio(s1) == 0.74
assert get_type_token_ratio(s2) == 0.48

print("get_type_token_ratio tests pass")

get_type_token_ratio tests pass


#### Asian Context

In [17]:
def get_asian_context_feature(text):
    """
    Return a binary value based on whether asian journal's context based words are present in the text

    Parameters
    ----------
    text : str
        A text for which we find the presence of the words

    Returns
    -------
    value : boolean
            0 represents that the no word in the list is present in the text and 1 represents vice versa.
    """

    lemmatizer = WordNetLemmatizer()
    with open("data/asian_words.txt", "r") as file:
        asian_journals_context_words = file.read().split("\n")

    for word in word_tokenize(text):
        if lemmatizer.lemmatize(word) in asian_journals_context_words:
            return 1
    return 0

In [18]:
asian_journal_test = """Frank moved to Guangzhou after long time consideration from another city. Finally he settled down in his new apartment and we had a welcome dinner together last night in a Indonesian restaurant which was absolutely new for me. 
I have not tried Indonesian food before but similar ones such as Singaporean and Malaysian dishes when I traveled there back to 2007. Southeast Asia food is full of spicy, curry tasting in common. I like it since the influence I got from my previous company, a Singapore based firm in Shanghai. I had good memory both about my pre-boss and good trip in Singapore and Malaysia. I was treated with lots of fun, adventure, foods and for sure can not be forgotten, humid and hot weather. 
Ok, back to the Indonesian restaurant. It is a tradtional one decorating with local stuff, french window, cane chair with the gentle local background music. I can smell the Indonesian in air. The dishes were really good and mostly important the price is fair too."""

asian_journal_test_2 = """Today I had TV conference with Malaysian in English.I know we Japanese have strong accent ourselves, but I think for me their English is very difficult to understand as well. I would like to get used to their accent."""

european_journal_test = """I am missing the friends, and I will miss my life in US. It hasn't been easy for me to stay here, mainly because of my English limitation, but now I wish I could stay longer here. Because I like California."""

european_journal_test_2 = """A few days ago, I've discovered something pretty awesome: it is called penmanship.
It could be decribed as the art of writing. After much practice, the results are really great."""

assert get_asian_context_feature(asian_journal_test) == 1
assert get_asian_context_feature(european_journal_test) == 0
assert get_asian_context_feature(asian_journal_test_2) == 1
assert get_asian_context_feature(european_journal_test_2) == 0

print("get_asian_context_feature tests pass")

get_asian_context_feature tests pass


#### Word Importance (TF - IDF)

In [20]:
def get_word_importance(text):
    """This is a helper function to generate TF IDF scores for words present in the input text"""
    count = CountVectorizer(stop_words='english', analyzer='word')
    word_count = count.fit_transform(text)

    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count)
    tf_idf_vector = tfidf_transformer.transform(word_count)

    feature_names = count.get_feature_names_out()
    first_document_vector = tf_idf_vector[0]

    df_tfifd = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
    df_tfifd = df_tfifd.sort_values(by=["tfidf"], ascending=False)

    return df_tfifd

In [21]:
def get_top_important_words():
    """This is a helper function to fetch words with top TF IDF scores for both asian and european journals"""

    no_of_top_words = 30

    asian_lang_text = []
    non_asian_lang_text = []

    lemmatizer = WordNetLemmatizer()
    asian_lang = ["Korean", "Japanese", "Mandarin Chinese"]

    for extracted_data in extract_data_from_file():
        if extracted_data['native_lang'] in asian_lang:
            asian_lang_text.append(extracted_data['text'] + ", ")
        else:
            non_asian_lang_text.append(extracted_data['text'] + ", ")

    asian_imp_words_df = get_word_importance(asian_lang_text)
    non_asian_imp_words_df = get_word_importance(non_asian_lang_text)

    asian_imp_words = asian_imp_words_df.index[:no_of_top_words]
    asian_imp_words_lem = [lemmatizer.lemmatize(x) for x in asian_imp_words]

    non_asian_imp_words = non_asian_imp_words_df.index[:no_of_top_words]
    non_asian_imp_words_lem = [lemmatizer.lemmatize(x) for x in non_asian_imp_words]

    return asian_imp_words_lem, non_asian_imp_words_lem

In [22]:
asian_imp_words_index, non_asian_imp_words_index = get_top_important_words()

#### Asian Important Words

In [24]:
def get_imp_words_asian_feature(text):
    """
    Return a binary value based on whether asian journal's important words (based on TF-IDF scores) are present in the text

    Parameters
    ----------
    text : str
        A text for which we find the presence of the words

    Returns
    -------
    value : boolean
            0 represents that the no word in the list is present in the text and 1 represents vice versa.
    """
    lemmatizer = WordNetLemmatizer()

    for word in word_tokenize(text):
        if lemmatizer.lemmatize(word) in asian_imp_words_index:
            return 1

    return 0

#### Non Asian Important Words

In [25]:
def get_imp_words_non_asian_feature(text):
    """
    Return a binary value based on whether european journal's important words (based on TF-IDF scores) are present in the text

    Parameters
    ----------
    text : str
        A text for which we find the presence of the words

    Returns
    -------
    value : boolean
            0 represents that the no word in the list is present in the text and 1 represents vice versa.
    """
    lemmatizer = WordNetLemmatizer()

    for word in word_tokenize(text):
        if lemmatizer.lemmatize(word) in non_asian_imp_words_index:
            return 1
    return 0

In [26]:
test_text_asian_feature = """Hi, my name is Sebastián. This is the first time I write here at Lang-8, and to be honest it’s hard for me to decide what to talk about, but if you don’t make up your mind and practice you’ll never improve, so here I go…
I’m studying English and Japanese. I use the computer and internet a lot for studying, which has given me access to several and great tools that every language learner should have access to, like blogs, forums, podcasts, sowftware and others..
As for English, its pronunciation is hard, but reading isn’t that much hard. Besides, on internet there are amazing amounts of information in English about any subject. For example, I like stuff related to the use of computers and internet for both leisure and learning, and read sites like Lifehacker and Wikipedia and several others everyday. Actually, I’ve learned a lot of English by reading and writing at sites about learning Japanese.
On the other side, reading Japanese is something that takes lots of time and effort. Even something as “simple” as reading the newspaper isn’t that “simple” at all. Fortunately, I study kanji using a book called “Remembering the Kanji” by James Heisig, and also spaced repetition softwares for kanji and Japanese in general, which has helped me a lot not just to actually learn, but also to feel much more motivated to keep on learning.
As I said above, the use of the computer and internet has helped me a lot. I think this is the best moment to learn languages, as practically anything you need can be found on internet, and even for free. For example, Lang-8 is a great tool for every language learner, so I hope it keeps on having great success.
What do you think?"""

test_text_non_asian_feature = """A few days ago, I've discovered something pretty awesome: it is called penmanship.
It could be decribed as the art of writing. After much practice, the results are really great."""


assert get_imp_words_asian_feature(test_text_asian_feature) == 1
assert get_imp_words_non_asian_feature(test_text_asian_feature) == 0
assert get_imp_words_non_asian_feature(test_text_non_asian_feature) == 1
assert get_imp_words_asian_feature(test_text_non_asian_feature) == 0

print("get_imp_words_asian_feature tests pass")
print("get_imp_words_non_asian_feature tests pass")

get_imp_words_asian_feature tests pass
get_imp_words_non_asian_feature tests pass


In [27]:
def get_document_list(txt_path):
    """
    Extracts the list of documents stores in a text file

    Parameters
    ----------
    text_path : str
        The string defining path of the text document

    Returns
    -------
    A list of filenames extracted from the file
    """
    doc_list = []

    with open(txt_path, "r") as f:
        for filename in f.readlines():
            doc_list.append(filename.strip())

    return doc_list

In [28]:
def extract_all_features(
    txt_path, csv_path, zip_path="data/lang-8.zip", verbose=False
):
    """
    Reads the zip file from path, extracts features from
    preprocessed text and combines them together to save
    them to a csv file

    Parameters
    ----------
    csv_path : str
            path at which the generated csv file is to be saved
    zip_path : str
            path to the zip file
    verbose : boolean
            specify whether to print the processed filename or not
    """

    # Return if the file already exists
    if os.path.isfile(csv_path):
        return

    # Lists of relevant features
    asian_lang = ["Korean", "Japanese", "Mandarin Chinese"]
    names = []
    text_lens = []
    lexical_densities = []
    avg_sent_lens = []
    avg_word_lens = []
    oov_word_counts = []

    reading_eases = []
    punctuations_counts = []
    type_token_ratios = []
    asian_context_features = []
    imp_words_asian_features = []
    imp_words_non_asian_features = []

    noun_counts, verb_counts, adj_counts = [], [], []
    targets = []

    # Lists of training, validation and test files
    doc_list = get_document_list(txt_path)

    for extracted_data in extract_data_from_file(zip_path):

        if extracted_data["filename"].removeprefix("lang-8/") not in doc_list:
            continue

        if extracted_data["native_lang"] == "Russian":
            continue

        if extracted_data['native_lang'] in asian_lang:
            target = 1
        else:
            target = 0

        targets.append(target)
        names.append(extracted_data['filename'][7:-5])
        text_lens.append(get_text_length(extracted_data['text']))
        lexical_densities.append(get_lexical_density(extracted_data['text']))
        avg_sent_lens.append(get_average_sentence_length(extracted_data['text']))
        avg_word_lens.append(get_average_word_length(extracted_data['text']))
        oov_word_counts.append(get_num_ovv_words(extracted_data['text']))

        reading_eases.append(get_reading_ease(extracted_data['text']))
        punctuations_counts.append(get_punctuations_count(extracted_data['text']))
        type_token_ratios.append(get_type_token_ratio(extracted_data['text']))
        asian_context_features.append(get_asian_context_feature(extracted_data['text']))
        imp_words_asian_features.append(get_imp_words_asian_feature(extracted_data['text']))
        imp_words_non_asian_features.append(get_imp_words_non_asian_feature(extracted_data['text']))

        noun_count, verb_count, adj_count = get_pos_count(extracted_data['text'])
        noun_counts.append(noun_count)
        verb_counts.append(verb_count)
        adj_counts.append(adj_count)

        if verbose:
            print(len(targets), extracted_data["filename"])

    feature_df = pd.DataFrame(
        np.array([
            names,
            text_lens,
            lexical_densities,
            avg_sent_lens,
            avg_word_lens,
            oov_word_counts,

            reading_eases,
            punctuations_counts,
            type_token_ratios,
            asian_context_features,
            imp_words_asian_features,
            imp_words_non_asian_features,

            noun_counts,
            verb_counts,
            adj_counts,
            targets
        ]).T,
        columns=[
            "filename",
            "text_length",
            "lexical_density",
            "average_sentence_length",
            "average_word_length",
            "oov_word_counts",

            "reading_ease",
            "punctuation_count",
            "type_token_ratio",
            "asian_context_feature",
            "asian_imp_word",
            "non_asian_imp_word",

            "noun_counts",
            "verb_counts",
            "adjective_counts",
            "target_region"
        ])

    feature_df.to_csv(csv_path)

In [29]:
def create_train_dev_test_csvs(paths={
                                    "data/train.txt": "data/train.csv",
                                    "data/dev.txt": "data/dev.csv",
                                    "data/test.txt": "data/test.csv"
                                },
                               zip_path="data/lang-8.zip"):
    """
    Takes in paths of text documents containing filenames from which
    information is to be extracted, extracts informtion from them and
    store them as csvs for train, validation and test

    Parameters
    ----------
    paths : dict
        a dictionary with keys as paths for text documents to read filenames
        and values as paths for the csv documents to save extracted features
    zip_path : str
            path to the zip file
    """

    for txt_path, csv_path in paths.items():
        extract_all_features(txt_path, csv_path, zip_path)

In [31]:
def read_csvs(train, val, test):
    """
    Reads train, validation and test sets from disk

    Parameters
    ----------
    train : str
        The path of the training csv file
    train : str
        The path of the training csv file
    train : str
        The path of the training csv file

    Returns
    -------
    A tuple of train, validation and test dataframes
    """
    train_csv = None
    val_csv = None
    test_csv = None

    try:
        train_csv = pd.read_csv(train)
    except:
        pass
    try:
        val_csv = pd.read_csv(val)
    except:
        pass
    try:
        test_csv = pd.read_csv(test)
    except:
        pass

    return train_csv, val_csv, test_csv

In [32]:
# Reading the data

train_csv_path = r"data/train.csv"
val_csv_path = r"data/dev.csv"
test_csv_path = r"data/test.csv"

train_txt_path = r"data/train.txt"
val_txt_path = r"data/dev.txt"
test_txt_path = r"data/test.txt"

paths = {
    train_txt_path: train_csv_path,
    val_txt_path: val_csv_path,
    test_txt_path: test_csv_path
}

zip_path = r"data/lang-8.zip"

if not (os.path.isfile(
    train_csv_path
) and os.path.isfile(
    val_csv_path
) and os.path.isfile(
    test_csv_path
)):
    create_train_dev_test_csvs(paths)

train_data, val_data, test_data = read_csvs(train_csv_path, val_csv_path, test_csv_path)

### Exploratory Data Analysis (EDA)

In [33]:
train_data = train_data.drop(columns=["Unnamed: 0", "filename"])
train_data

Unnamed: 0,text_length,lexical_density,average_sentence_length,average_word_length,oov_word_counts,reading_ease,punctuation_count,type_token_ratio,asian_context_feature,asian_imp_word,non_asian_imp_word,noun_counts,verb_counts,adjective_counts,target_region
0,281,0.480427,70.250000,4.259786,21,18.717015,18,0.61,0,0,1,79,41,11,0
1,29,0.655172,29.000000,4.448276,1,46.124138,1,0.22,0,0,0,5,7,5,0
2,307,0.592834,38.375000,4.566775,17,37.815320,20,0.77,0,1,0,95,52,14,1
3,67,0.611940,22.333333,4.104478,5,57.898010,13,0.50,0,1,0,18,15,6,1
4,25,0.520000,25.000000,3.520000,0,79.940000,2,0.18,0,0,1,4,7,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738,107,0.532710,53.500000,4.130841,6,29.981098,5,0.74,1,0,0,19,26,4,1
739,64,0.671875,32.000000,4.296875,0,48.776875,13,0.49,1,0,0,20,13,7,1
740,63,0.523810,21.000000,3.952381,3,70.034286,6,0.49,0,0,1,13,14,1,1
741,37,0.702703,37.000000,4.567568,2,29.804324,2,0.30,0,0,1,10,12,3,1


In [34]:
val_data = val_data.drop(columns=["Unnamed: 0", "filename"])
val_data

Unnamed: 0,text_length,lexical_density,average_sentence_length,average_word_length,oov_word_counts,reading_ease,punctuation_count,type_token_ratio,asian_context_feature,asian_imp_word,non_asian_imp_word,noun_counts,verb_counts,adjective_counts,target_region
0,699,0.595136,14.265306,4.214592,40,69.147131,87,0.70,0,1,1,158,157,51,1
1,618,0.600324,22.888889,4.872168,34,44.108603,61,0.70,0,1,1,173,88,53,0
2,83,0.590361,20.750000,4.421687,5,53.267726,13,0.67,1,1,0,20,17,9,1
3,32,0.625000,32.000000,4.406250,1,50.098750,3,0.27,1,0,0,7,9,2,0
4,70,0.585714,23.333333,4.828571,4,50.208810,14,0.51,1,1,0,25,13,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,73,0.602740,12.166667,3.849315,1,80.913231,7,0.53,0,0,1,12,16,8,1
242,88,0.579545,12.571429,3.829545,2,82.556818,13,0.63,0,1,0,19,19,10,1
243,37,0.648649,18.500000,4.810811,0,41.722365,9,0.33,0,0,0,13,8,3,1
244,322,0.562112,21.466667,4.760870,14,57.095402,59,0.81,0,1,1,79,53,39,1


In [None]:
test_data = test_data.drop(columns=["Unnamed: 0", "filename"])
test_data