# __creating and cleaning dataset - definitions.csv__
1. [creating dataset - definitions.csv](#creating-definitions.csv)
2. [drop words without definitions and definitions without words](#drop-words-without-definitions-and-definitions-without-words)
3. [remove special characters](#remove-special-characters)
    * [retrieve special characters](#retrieve-special-characters)
    * [delete special characters](#delete-special-characters)
4. [extract words without definition from dataset and save to missing-words.csv](#extract-words-from-definitions-which-have-no-definition-in-dataset-and-save-to-missing-words.csv)

### creating definitions.csv
---

In [2]:
import pandas as pd
from lib.connect_db import get_engine
from lib.print_error import print_error
from nltk.corpus import stopwords

In [None]:
engine, meta = get_engine()

df = pd.read_sql("definitions", engine)
df.to_csv("db/definitions.csv", sep=":", columns = ["word", "definition"], header = False, index=False, mode = "w")

### drop words without definitions and definitions without words
---

In [None]:
df = pd.read_csv("db/definitions.csv", names=["word","definition"], sep=":", index_col=None, keep_default_na=False, na_values=[""])
words = pd.DataFrame(columns=["word"], index=None)

ww = pd.isnull(df['word'])

null_word_indices = []
for id,w in enumerate(ww):
    if w is True:
        null_word_indices.append(id)

dd = pd.isnull(df['definition'])
null_def_indices = []
for id,d in enumerate(dd):
    if d is True:
        null_def_indices.append(id)

for el in null_word_indices:
    if el in null_def_indices:
        null_def_indices.remove(el)

print("df info before deletion")
print(df.info())

df = df.drop(null_word_indices, inplace=False)
df = df.drop(null_def_indices, inplace=False)
df = df.reset_index(drop=True)

print("df info after deletion")
print("*"*10)
print(df.info())

df.to_csv("db/definitions.csv", sep=":", columns=["word","definition"], header=False, index=False, mode="w")


### remove special characters
---
this section is divided into two parts -
* retrieve special characters
* delete special characters

#### retrieve special characters

In [None]:
df = pd.read_csv("db//definitions.csv", names=['word','definition'], sep=":", index_col=None, keep_default_na=False, na_values=[""])

special_chars = []


def get_special_chars(defi):
    for d in list(defi):
        if (ord(d) < 48 and ord(d) != 32) or (ord(d) > 57 and ord(d)<65) or (ord(d) >90 and ord(d) < 97) or ord(d) > 122:
            if d not in special_chars:
                special_chars.append(d)
    return

df["definition"].apply(get_special_chars)

special_chars.remove(",")
special_chars.remove(";")

print(special_chars)


#### delete special characters

In [None]:
df = pd.read_csv("db/definitions.csv", names=['word','definition'], sep=":", index_col=None, keep_default_na=False, na_values=[""])

def remove_special_chars(defi):
    for s in special_chars:
        defi = " ".join(defi.split(s))
    index = 0
    defi = list(defi)
    while index < len(defi)-1:
        if defi[index] == " " and defi[index+1] == " ":
            del defi[index]
        else:
            index += 1
    return "".join(defi)


df["definition"] = df["definition"].apply(remove_special_chars)
df.to_csv("db/definitions.csv", sep=":", columns=["word","definition"], header=False, index=False, mode="w")

###  extract words from column definition which have no definition in dataset; save to missing-words.csv
---

In [29]:
df = pd.read_csv("db/definitions.csv", names=["word","definition"], sep=":", index_col=None, keep_default_na=False, na_values=[""])

print(df.info())
print("*"*20)
df = df[:10]

missing_data = []
count = 0

def missing_words(definition):
    missing_data.append(list(d for d in str(definition).split() if not any(df["word"] == d) and d not in missing_data))
    
df["definition"].apply(missing_words)

final_data= []
for missing in missing_data:
    for m in missing:
        final_data.append(m)

print("total words obtained:", len(final_data))

defi_df = pd.DataFrame({"word":final_data}, columns = ["word"], index=None)
defi_df.to_csv("db/missing-defs.csv", sep=":", columns=["word"], header=False, index=False, mode="a")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117631 entries, 0 to 117630
Data columns (total 2 columns):
word          117631 non-null object
definition    117631 non-null object
dtypes: object(2)
memory usage: 1.8+ MB
None
********************
total words obtained: 60
