In [348]:
import os 
dataset_name = "keywords_dataset.csv"
path_working_dir = "/content/drive/MyDrive/NLP/innoscripta"
path_data_dir = os.path.join(path_working_dir, "data")
path_dataset = os.path.join(path_data_dir, dataset_name)
print("Working directory:\n{}".format(os.listdir(path_working_dir)))

Working directory:
['data', 'models', 'Shebotnov_task_solution.rar']


# Looking at the data

In [349]:
import pandas as pd
df = pd.read_csv(path_dataset)
df.shape

(157137, 3)

In [350]:
df.head(8)

Unnamed: 0,Sentence #,Word,Tag;
0,112,Gebäudevermessung,B-KEY;
1,112,von,O;
2,112,Mehrfamilienhäusern,O;
3,"112,"","",O;",,
4,112,Einkaufszentren,O;
5,"112,"","",O;",,
6,112,Außenanlagen,O;
7,"112,"","",O;",,


In [351]:
len_nan = len(df[df.isna().any(axis=1)])
print("Number of lines in the dataset: {}".format(len(df)))
print("Number of lines with NaN values: {}".format(len_nan))

Number of lines in the dataset: 157137
Number of lines with NaN values: 4507


In [352]:
df[df.isna().any(axis=1)][0:5]

Unnamed: 0,Sentence #,Word,Tag;
3,"112,"","",O;",,
5,"112,"","",O;",,
7,"112,"","",O;",,
285,"2403,"","",O;",,
288,"2403,"","",O;",,


In [353]:
duplicates = df.pivot_table(index = ['Word'], aggfunc ='size') 
print(duplicates[0:5])

Word
!     586
#      11
%       6
&    1749
'       6
dtype: int64


In [354]:
print("Found {} duplicate entries of {} unique items".format(duplicates.sum(), len(duplicates)))

Found 152658 duplicate entries of 16911 unique items


# Fixing the CSV import

In [355]:
import re
data_list = []
with open(path_dataset, encoding="utf8") as fp:
    for cnt, line in enumerate(fp):
        line = re.sub('^"', '', line, 1)
        line = re.sub('";\n', ';\n', line, 1)
        line = re.sub(';\n', '\n', line, 1)
        line = re.sub('""', '"', line)
        data_list.append(line)

print("Processed {} lines of {}".format(cnt, path_dataset.split("/")[-1]))

Processed 157137 lines of keywords_dataset.csv


In [356]:
import os
dataset_cleaned_name = 'cleaned.csv'
path_dataset_cleaned = os.path.join(path_data_dir, dataset_cleaned_name)

with open(path_dataset_cleaned, 'w', encoding="utf8") as f:
    for cnt, item in enumerate(data_list):
        f.write(item)

print("Written {} lines to {}".format(cnt, path_dataset_cleaned))

Written 157137 lines to /content/drive/MyDrive/NLP/innoscripta/data/cleaned.csv


#Cleaning and fixing the CSV data

### Load CSV and rename

In [357]:
df_clean = pd.read_csv(path_dataset_cleaned, sep=',', engine='python', encoding="utf8")
df_clean.rename(columns={"Sentence #":"sentence_id", "Word":"words", "Tag":"labels"}, inplace = True)
df_clean

Unnamed: 0,sentence_id,words,labels
0,112,Gebäudevermessung,B-KEY
1,112,von,O
2,112,Mehrfamilienhäusern,O
3,112,",",O
4,112,Einkaufszentren,O
...,...,...,...
157132,3056,und,O
157133,3056,bereiten,O
157134,3056,ihn,O
157135,3056,auf,O


### Remove NaN values

In [358]:
len_nan_cleaned = len(df_clean[df_clean.isna().any(axis=1)])
print("Number of lines in the cleaned dataset: {}".format(len(df_clean)))
print("Number of lines with NaN values: {}".format(len_nan_cleaned))

Number of lines in the cleaned dataset: 157137
Number of lines with NaN values: 29


In [359]:
df_clean[df_clean.isna().any(axis=1)][0:10]

Unnamed: 0,sentence_id,words,labels
27992,99134,,O
34022,140618;,O,
34106,140624;,O,
34267,140632;,O,
34416,140638;,O,
34543,140644;,O,
34611,140656;,O,
37393,163643;,O,
104900,240;,O,
106846,351;,O,


Only 29 incorrectly parsed lines out of 157137. These lines are either white spaces or in the incorrect csv format.

In [360]:
df_clean.dropna(axis=0, inplace=True)
print("Number of lines with NaN values: {}".format(len(df_clean[df_clean.isna().any(axis=1)])))
print("Number of lines in the cleaned dataset: {}".format(len(df_clean)))
df_clean.shape

Number of lines with NaN values: 0
Number of lines in the cleaned dataset: 157108


(157108, 3)

In [361]:
df_clean.head(5)

Unnamed: 0,sentence_id,words,labels
0,112,Gebäudevermessung,B-KEY
1,112,von,O
2,112,Mehrfamilienhäusern,O
3,112,",",O
4,112,Einkaufszentren,O


### Look at duplicates

In [362]:
duplicates = df_clean[df_clean.duplicated(['sentence_id','words'], keep=False)]
duplicates.head(10)

Unnamed: 0,sentence_id,words,labels
0,112,Gebäudevermessung,B-KEY
3,112,",",O
5,112,",",O
7,112,",",O
10,112,Gebäudevermessung,B-KEY
285,2403,",",O
288,2403,",",O
331,2779,und,I-KEY
334,2779,für,O
336,2779,",",O


In [363]:
duplicates[duplicates.sentence_id == "2779"]

Unnamed: 0,sentence_id,words,labels
331,2779,und,I-KEY
334,2779,für,O
336,2779,",",O
338,2779,",",O
341,2779,und,O
347,2779,Personal,B-KEY
349,2779,für,O
354,2779,Personal,B-KEY
355,2779,",",O


In [364]:
duplicates[duplicates.sentence_id == "2779"].query('words == "und"')

Unnamed: 0,sentence_id,words,labels
331,2779,und,I-KEY
341,2779,und,O


Can't delete any duplicates, since even in the same sentence same words can have different tags

In [365]:
dups = df_clean.pivot_table(index=["sentence_id"], aggfunc='size')
print("Number of sentences in the data set: %d" % len(dups))

Number of sentences in the data set: 7170


In [366]:
def check_dupes(df,  col, count=5):
    dups = df.pivot_table(index=[col], aggfunc='size')
    len_dups = len(dups)

    print ("Number of all entries:\t{0}".format(len(df)).expandtabs(45))
    print ("Number of unique entries in '{0}':\t{1}".format(col, len_dups).expandtabs(45))
    print ("Number of duplicated entries in '{0}':\t{1}".format(col, dups.sum() - len_dups).expandtabs(45))

    dups_dict = dups.to_dict()
    sorted_words = sorted(dups_dict.items(), key=lambda x: int(x[1]), reverse=True)
    print("\nMost common duplicates in '%s':" % col)
    for i in range(count):
        print("{}\t{}".format(sorted_words[i][0],sorted_words[i][1]))

In [367]:
check_dupes(df_clean, "words", 5)
#df_fixed = df_clean.drop_duplicates(subset=['Word'])

Number of all entries:                       157108
Number of unique entries in 'words':         16928
Number of duplicated entries in 'words':     140180

Most common duplicates in 'words':
.	5524
,	4286
und	4218
&	1749
:	1593


### Type checking of "Sentence #" column

In [368]:
df_clean.dtypes

sentence_id    object
words          object
labels         object
dtype: object

In [369]:
df_clean.sentence_id = pd.to_numeric(df_clean.sentence_id, downcast='integer')
df_clean.dtypes

sentence_id     int32
words          object
labels         object
dtype: object

In [370]:
import numpy as np
num_of_int_sentence_id = df_clean.applymap(np.isreal)["sentence_id"].sum()
print(f"Number of integer entries in 'Sentence #': {num_of_int_sentence_id}")
print(f"Number of non integer entries in 'Sentence #': {len(df_clean) - num_of_int_sentence_id}")

Number of integer entries in 'Sentence #': 157108
Number of non integer entries in 'Sentence #': 0


In [371]:
print(df_clean.applymap(type))

          sentence_id          words         labels
0       <class 'int'>  <class 'str'>  <class 'str'>
1       <class 'int'>  <class 'str'>  <class 'str'>
2       <class 'int'>  <class 'str'>  <class 'str'>
3       <class 'int'>  <class 'str'>  <class 'str'>
4       <class 'int'>  <class 'str'>  <class 'str'>
...               ...            ...            ...
157132  <class 'int'>  <class 'str'>  <class 'str'>
157133  <class 'int'>  <class 'str'>  <class 'str'>
157134  <class 'int'>  <class 'str'>  <class 'str'>
157135  <class 'int'>  <class 'str'>  <class 'str'>
157136  <class 'int'>  <class 'str'>  <class 'str'>

[157108 rows x 3 columns]


### Drop rows consisting of numbers and special characters only

In [372]:
df_sorted = df_clean.copy()
df_sorted.index = df_sorted['words'].str.len()
print("Smallest and biggest words:")
df_sorted.sort_index(ascending=True).reset_index(drop=True)

Smallest and biggest words:


Unnamed: 0,sentence_id,words,labels
0,3056,.,O
1,2642,.,O
2,332122,&,O
3,93305,",",O
4,93305,",",O
...,...,...,...
157103,253020,unsSystemeOverallCouverQuattroElastixPremiumMo...,O
157104,253020,unsSystemeOverallCouverQuattroElastixPremiumMo...,O
157105,215004,PDFDruckenPDFAT099ATUMBSTSCGPTConvolexCRNDCPTD...,O
157106,2595,LebensmittelbereichenFassadenreinigungPolsterr...,O


Can't remove single character entries, some of them are useful labels

In [373]:
df_sorted[df_sorted.words=="&"][0:5]

Unnamed: 0_level_0,sentence_id,words,labels
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2995,&,I-KEY
1,3008,&,I-KEY
1,3014,&,O
1,3052,&,I-KEY
1,3093,&,I-KEY


In [374]:
print("Largest words:")
df_sorted.sort_index(ascending=False).reset_index(drop=True)[0:5]

Largest words:


Unnamed: 0,sentence_id,words,labels
0,2571,LebensmittelbereichenFassadenreinigungPolsterr...,O
1,2595,LebensmittelbereichenFassadenreinigungPolsterr...,O
2,215004,PDFDruckenPDFAT099ATUMBSTSCGPTConvolexCRNDCPTD...,O
3,253020,unsSystemeOverallCouverQuattroElastixPremiumMo...,O
4,253066,unsSystemeOverallCouverQuattroElastixPremiumMo...,O


# Saving pandas data frame

In [375]:
print(df_clean.dtypes)
print("\nShape: ", df_clean.shape, "\n")
df_clean.head(5)

sentence_id     int32
words          object
labels         object
dtype: object

Shape:  (157108, 3) 



Unnamed: 0,sentence_id,words,labels
0,112,Gebäudevermessung,B-KEY
1,112,von,O
2,112,Mehrfamilienhäusern,O
3,112,",",O
4,112,Einkaufszentren,O


In [376]:
df_name = "df_fixed.pkl"
path_df = os.path.join(path_data_dir, df_name)

In [377]:
df_clean.to_pickle(path_df)
print("Saved to: ", path_df)

Saved to:  /content/drive/MyDrive/NLP/innoscripta/data/df_fixed.pkl


Split and save train / test data frames

In [378]:
from sklearn.model_selection import train_test_split
df_clean = pd.read_pickle(path_df)

In [379]:
train_df, test_df = train_test_split(df_clean, test_size=0.20)
print('train shape: ', train_df.shape)
print('test shape: ', test_df.shape)

train shape:  (125686, 3)
test shape:  (31422, 3)


In [380]:
train_df_name = "train_df.pkl"
path_train_df = os.path.join(path_data_dir, train_df_name)
train_df.to_pickle(path_train_df)
print("Saved to: ", path_train_df)

Saved to:  /content/drive/MyDrive/NLP/innoscripta/data/train_df.pkl


In [381]:
test_df_name = "test_df.pkl"
path_test_df = os.path.join(path_data_dir, test_df_name)
test_df.to_pickle(path_test_df)
print("Saved to: ", path_test_df)

Saved to:  /content/drive/MyDrive/NLP/innoscripta/data/test_df.pkl
