# Dataset Cleaning
Here some approaches to cleaning a dataset will be applied and the result written to file for inspection.

In [1]:
import pandas as pd
import numpy as np

In [2]:
raw_data = "./dataset/interview_signup.csv"
#Note: A converter is necessary when reading in the postcode data in order to ensure 
#      that any leading zeroes are preserved.
df = pd.read_csv(raw_data, converters={'postcode': lambda x: str(x)})

In [3]:
df['postcode'] = df['postcode'].astype(str).replace('\.0', '', regex=True)
df

Unnamed: 0,original_product_name,postcode,bundesland,total_bonus,order_date
0,E.ON STROM,53229,Nordrhein-Westfalen,146.0,2018-01-01
1,E.ON STROM ÖKO,74235,Baden-Württemberg,67.0,2018-01-01
2,E.ON STROM ÖKO 24,45257,Nordrhein-Westfalen,123.0,2018-01-01
3,E.ON STROM,64395,Hessen,159.0,2018-01-01
4,E.ON STROM 24,36039,Hessen,116.0,2018-01-01
...,...,...,...,...,...
318340,E.ON STROM 24,49610,Niedersachsen,147.0,2018-12-31
318341,E.ON STROM,4916,,142.0,2018-12-31
318342,E.ON STROM,77972,Baden-Württemberg,48.0,2018-12-31
318343,E.ON STROM ÖKO 24,91466,Bayern,238.0,2018-12-31


Make use of the zfill() helper method to left-pad any string, integer or float with zeros

In [4]:
df['postcode']=[x.zfill(5) for x in df['postcode']]
df

Unnamed: 0,original_product_name,postcode,bundesland,total_bonus,order_date
0,E.ON STROM,53229,Nordrhein-Westfalen,146.0,2018-01-01
1,E.ON STROM ÖKO,74235,Baden-Württemberg,67.0,2018-01-01
2,E.ON STROM ÖKO 24,45257,Nordrhein-Westfalen,123.0,2018-01-01
3,E.ON STROM,64395,Hessen,159.0,2018-01-01
4,E.ON STROM 24,36039,Hessen,116.0,2018-01-01
...,...,...,...,...,...
318340,E.ON STROM 24,49610,Niedersachsen,147.0,2018-12-31
318341,E.ON STROM,04916,,142.0,2018-12-31
318342,E.ON STROM,77972,Baden-Württemberg,48.0,2018-12-31
318343,E.ON STROM ÖKO 24,91466,Bayern,238.0,2018-12-31


In [5]:
# import a helper for working with PLZ data
import utility.plz_helper as plz_helper

#Create a mask by applying list comprehension to check the validity of each PLZ in the postcode column
mask = [plz_helper.check_invalid_plz_format(str(x)) for x in df['postcode']]
result_df = df[mask]
result_df

Unnamed: 0,original_product_name,postcode,bundesland,total_bonus,order_date
266922,E.ON STROM,92696JAVAS,,97.0,2018-11-03


In [6]:
import re

df['postcode']=[re.sub("[^0-9]", "", x) for x in df['postcode']]

In [7]:
#Create a mask by applying list comprehension to check the validity of each PLZ in the postcode column
mask = [plz_helper.check_invalid_plz_format(str(x)) for x in df['postcode']]
result_df = df[mask]
result_df

Unnamed: 0,original_product_name,postcode,bundesland,total_bonus,order_date


Hurrah!! Empty DataFrame, hence nothing more to do in terms of ensuring that the postcode data contains 5 digits only.

## Missing values
In order to tackle the missing values for the **bundesland** feature the quickest fix would be to map the PLZ in the postcode column to the required value, i.e. Bundesland.

For that we would require a list of PLZ data with Bundesland. Luckily you can find this on the internet!

Here we will use a csv file obtained from this gist on github: https://gist.github.com/jbspeakr/4565964

It's been forked and some data had to be corrected (Schleswig-Holstein had been mispelled). 

The forked and corrected version used to clean the data can be found here: https://gist.github.com/remuant/7c8f759ae4581e0bb24c6f83808d29fb. It has also been included in this respository in the **resources** folder.

In [60]:
plz_csv_path = './resources/German-Zip-Codes.csv'
plz_data = pd.read_csv(plz_csv_path, sep=';', converters={'Plz': lambda x: str(x)})

In [61]:
plz_data

Unnamed: 0,Ort,Zusatz,Plz,Vorwahl,Bundesland
0,Aach,b Trier,54298,651.0,Rheinland-Pfalz
1,Aach,", Hegau",78267,7774.0,Baden-Württemberg
2,Aachen,,52062,241.0,Nordrhein-Westfalen
3,Aachen,,52064,241.0,Nordrhein-Westfalen
4,Aachen,,52066,241.0,Nordrhein-Westfalen
...,...,...,...,...,...
19672,Zwingenberg,", Baden",69439,6263.0,Baden-Württemberg
19673,Zwischendeich,,19322,3877.0,Brandenburg
19674,Zwochau,b Delitzsch,04509,34207.0,Sachsen
19675,Zwönitz,,08297,37754.0,Sachsen


In [62]:
plz_data['Bundesland'].unique()

array(['Rheinland-Pfalz', 'Baden-Württemberg', 'Nordrhein-Westfalen',
       'Hessen', 'Schleswig-Holstein', 'Sachsen-Anhalt', 'Brandenburg',
       'Bayern', 'Sachsen', 'Thüringen', 'Niedersachsen',
       'Mecklenburg-Vorpommern', 'Saarland', 'Berlin', 'Bremen',
       'Hamburg'], dtype=object)

We are going to create a dictionary using the Plz and Bundesland columns.

The **PLZ data will form the key** and the **Bundesland will be the value**.

The dictionary can then be used to map PLZ data to a Bundesland

In [63]:
my_plz_dict = dict(zip(plz_data.Plz, plz_data.Bundesland))
my_plz_dict

{'54298': 'Rheinland-Pfalz',
 '78267': 'Baden-Württemberg',
 '52062': 'Nordrhein-Westfalen',
 '52064': 'Nordrhein-Westfalen',
 '52066': 'Nordrhein-Westfalen',
 '52068': 'Nordrhein-Westfalen',
 '52070': 'Nordrhein-Westfalen',
 '52072': 'Nordrhein-Westfalen',
 '52074': 'Nordrhein-Westfalen',
 '52076': 'Nordrhein-Westfalen',
 '52078': 'Nordrhein-Westfalen',
 '52080': 'Nordrhein-Westfalen',
 '73430': 'Baden-Württemberg',
 '73431': 'Baden-Württemberg',
 '73432': 'Baden-Württemberg',
 '73433': 'Baden-Württemberg',
 '73434': 'Baden-Württemberg',
 '65326': 'Rheinland-Pfalz',
 '25560': 'Schleswig-Holstein',
 '29416': 'Sachsen-Anhalt',
 '19322': 'Brandenburg',
 '38871': 'Sachsen-Anhalt',
 '06543': 'Sachsen-Anhalt',
 '91183': 'Bayern',
 '01623': 'Sachsen',
 '93326': 'Bayern',
 '55767': 'Rheinland-Pfalz',
 '94036': 'Bayern',
 '91720': 'Bayern',
 '74232': 'Baden-Württemberg',
 '06628': 'Sachsen-Anhalt',
 '99713': 'Thüringen',
 '06888': 'Sachsen-Anhalt',
 '73453': 'Baden-Württemberg',
 '69518': 'Hes

In [64]:
df['bundesland_full']=[my_plz_dict[y] if pd.isnull(x) else x for x, y in zip(df['bundesland'], df['postcode'])]
df

Unnamed: 0,original_product_name,postcode,bundesland,total_bonus,order_date,bundesland_full
0,E.ON STROM,53229,Nordrhein-Westfalen,146.0,2018-01-01,Nordrhein-Westfalen
1,E.ON STROM ÖKO,74235,Baden-Württemberg,67.0,2018-01-01,Baden-Württemberg
2,E.ON STROM ÖKO 24,45257,Nordrhein-Westfalen,123.0,2018-01-01,Nordrhein-Westfalen
3,E.ON STROM,64395,Hessen,159.0,2018-01-01,Hessen
4,E.ON STROM 24,36039,Hessen,116.0,2018-01-01,Hessen
...,...,...,...,...,...,...
318340,E.ON STROM 24,49610,Niedersachsen,147.0,2018-12-31,Niedersachsen
318341,E.ON STROM,04916,,142.0,2018-12-31,Brandenburg
318342,E.ON STROM,77972,Baden-Württemberg,48.0,2018-12-31,Baden-Württemberg
318343,E.ON STROM ÖKO 24,91466,Bayern,238.0,2018-12-31,Bayern


In [65]:
df.isnull().sum()

original_product_name        0
postcode                     0
bundesland               29532
total_bonus                  0
order_date                   0
bundesland_full              0
dtype: int64

In [66]:
df['bundesland_full'].unique()

array(['Nordrhein-Westfalen', 'Baden-Württemberg', 'Hessen', 'Berlin',
       'Schleswig-Holstein', 'Niedersachsen', 'Sachsen-Anhalt', 'Bayern',
       'Rheinland-Pfalz', 'Sachsen', 'Thüringen', 'Bremen', 'Brandenburg',
       'Saarland', 'Hamburg', 'Mecklenburg-Vorpommern'], dtype=object)

In [12]:
#If you'd like all elements in A not in B:
list(set(df['postcode']) - set(plz_data['Plz']))

['98694',
 '99707',
 '39217',
 '24222',
 '39628',
 '24976',
 '99095',
 '60312',
 '04861',
 '99820',
 '06711',
 '06889',
 '99090',
 '64760',
 '06772',
 '06861',
 '99331',
 '83416',
 '73278',
 '98630',
 '15713',
 '06868',
 '15712',
 '08344',
 '04862']

In [67]:
tarif_dict = dict()
tarif_dict['E.ON STROM']=('E.ON STROM')
tarif_dict['E.ON STROM ÖKO']=('E.ON STROM ÖKO','E.ON STROM Ã–KO','E.ON STROM ÖO')
tarif_dict['E.ON STROM ÖKO 24']=('E.ON STROM ÖKO 24','Blackberry','Blueberry')
tarif_dict['E.ON STROM 24']=('E.ON STROM 24','E.ON STROM 24 24 24','E.ON STROM 24 24','E.ON STROM 24 24 24 24 24 24 24')
tarif_dict['E.ON STROM PUR']=('E.ON STROM PUR')
tarif_dict

{'E.ON STROM': 'E.ON STROM',
 'E.ON STROM ÖKO': ('E.ON STROM ÖKO', 'E.ON STROM Ã–KO', 'E.ON STROM ÖO'),
 'E.ON STROM ÖKO 24': ('E.ON STROM ÖKO 24', 'Blackberry', 'Blueberry'),
 'E.ON STROM 24': ('E.ON STROM 24',
  'E.ON STROM 24 24 24',
  'E.ON STROM 24 24',
  'E.ON STROM 24 24 24 24 24 24 24'),
 'E.ON STROM PUR': 'E.ON STROM PUR'}

In [73]:
#print ([tarif_dict.values().index('E.ON STROM Ã–KO')])
#print(list(tarif_dict.keys())[list(tarif_dict.values()).index('E.ON STROM Ã–KO')])
key = [k for k, v in tarif_dict.items() if v == 'E.ON STROM Ã–KO']
print(key)

[]


In [74]:
tarif_dict.items()

dict_items([('E.ON STROM', 'E.ON STROM'), ('E.ON STROM ÖKO', ('E.ON STROM ÖKO', 'E.ON STROM Ã–KO', 'E.ON STROM ÖO')), ('E.ON STROM ÖKO 24', ('E.ON STROM ÖKO 24', 'Blackberry', 'Blueberry')), ('E.ON STROM 24', ('E.ON STROM 24', 'E.ON STROM 24 24 24', 'E.ON STROM 24 24', 'E.ON STROM 24 24 24 24 24 24 24')), ('E.ON STROM PUR', 'E.ON STROM PUR')])

In [75]:
value = 'E.ON STROM Ã–KO'
list_of_keys = [key
                for key, list_of_values in tarif_dict.items()
                if value in list_of_values]

if list_of_keys:
    print(list_of_keys)
else:
    print('Value does not exist in the dictionary')

['E.ON STROM ÖKO']


In [76]:
list_of_keys[0]

'E.ON STROM ÖKO'

In [80]:
def get_key_from_value(value):
    list_of_keys = [key
                for key, list_of_values in tarif_dict.items()
                if value in list_of_values]
    if list_of_keys:
        return list_of_keys[0]
    else:
        return ''

In [81]:
df['original_product_name_corrected'] = [get_key_from_value(x) for x in df['original_product_name']]

In [82]:
df

Unnamed: 0,original_product_name,postcode,bundesland,total_bonus,order_date,bundesland_full,original_product_name_corrected
0,E.ON STROM,53229,Nordrhein-Westfalen,146.0,2018-01-01,Nordrhein-Westfalen,E.ON STROM
1,E.ON STROM ÖKO,74235,Baden-Württemberg,67.0,2018-01-01,Baden-Württemberg,E.ON STROM ÖKO
2,E.ON STROM ÖKO 24,45257,Nordrhein-Westfalen,123.0,2018-01-01,Nordrhein-Westfalen,E.ON STROM ÖKO 24
3,E.ON STROM,64395,Hessen,159.0,2018-01-01,Hessen,E.ON STROM
4,E.ON STROM 24,36039,Hessen,116.0,2018-01-01,Hessen,E.ON STROM 24
...,...,...,...,...,...,...,...
318340,E.ON STROM 24,49610,Niedersachsen,147.0,2018-12-31,Niedersachsen,E.ON STROM 24
318341,E.ON STROM,04916,,142.0,2018-12-31,Brandenburg,E.ON STROM
318342,E.ON STROM,77972,Baden-Württemberg,48.0,2018-12-31,Baden-Württemberg,E.ON STROM
318343,E.ON STROM ÖKO 24,91466,Bayern,238.0,2018-12-31,Bayern,E.ON STROM ÖKO 24


In [83]:
df['original_product_name_corrected'].unique()

array(['E.ON STROM', 'E.ON STROM ÖKO', 'E.ON STROM ÖKO 24',
       'E.ON STROM 24', 'E.ON STROM PUR'], dtype=object)