## creating and cleaning dataset - `definitions.csv`
***

1. [creating dataset - definitions.csv](#creating-definitions.csv)
2. [drop words without definitions and definitions without words](#drop-words-without-definitions-and-definitions-without-words)
3. [extract words without definition from dataset and save to missing-words.csv](#extract-words-from-definitions-which-has-no-definition-in-dataset-and-save-to-missing-words.csv)

### creating definitions.csv

In [1]:
import pandas as pd
from lib.connect_db import get_engine
from lib.print_error import print_error
from nltk.corpus import stopwords

In [None]:
engine, meta = get_engine()

df = pd.read_sql("definitions", engine)
df.to_csv("db/definitions.csv", sep=":", columns = ["word", "definition"], header = False, index=False, mode = "w")

### drop words without definitions and definitions without words

In [None]:
df = pd.read_csv("db/definitions.csv", names=["word","definition"], sep=":", index_col=None, keep_default_na=False, na_values=[""])
words = pd.DataFrame(columns=["word"], index=None)

ww = pd.isnull(df['word'])

null_word_indices = []
for id,w in enumerate(ww):
    if w is True:
        null_word_indices.append(id)

dd = pd.isnull(df['definition'])
null_def_indices = []
for id,d in enumerate(dd):
    if d is True:
        null_def_indices.append(id)

for el in null_word_indices:
    if el in null_def_indices:
        null_def_indices.remove(el)

print("df info before deletion")
print(df.info())

df = df.drop(null_word_indices, inplace=False)
df = df.drop(null_def_indices, inplace=False)
df = df.reset_index(drop=True)

print("df info after deletion")
print("*"*10)
print(df.info())

df.to_csv("db/definitions.csv", sep=":", columns=["word","definition"], header=False, index=False, mode="w")


### extract words from definitions which has no definition in dataset and save to missing-words.csv

In [2]:
df = pd.read_csv("db/definitions.csv", names=["word","definition"], sep=":", index_col=None, keep_default_na=False, na_values=[""])

print(df.info())

missing = []

def missing_words(definition):
    missing.append(str(d) for d in str(definition).split() if not any(df["word"] == str(d)))


df["definition"].apply(missing_words)

print(missing[:200])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117631 entries, 0 to 117630
Data columns (total 2 columns):
word          117631 non-null object
definition    117631 non-null object
dtypes: object(2)
memory usage: 1.8+ MB
None
[<generator object missing_words.<locals>.<genexpr> at 0x000001EC00D97FC0>, <generator object missing_words.<locals>.<genexpr> at 0x000001EC0C2F30F8>, <generator object missing_words.<locals>.<genexpr> at 0x000001EC0C2F30A0>, <generator object missing_words.<locals>.<genexpr> at 0x000001EC0C2F31A8>, <generator object missing_words.<locals>.<genexpr> at 0x000001EC0C2F3200>, <generator object missing_words.<locals>.<genexpr> at 0x000001EC0C2F3258>, <generator object missing_words.<locals>.<genexpr> at 0x000001EC0C2F32B0>, <generator object missing_words.<locals>.<genexpr> at 0x000001EC0C2F3308>, <generator object missing_words.<locals>.<genexpr> at 0x000001EC0C2F3360>, <generator object missing_words.<locals>.<genexpr> at 0x000001EC0C2F33B8>, <generator object mi