# Importing and preparing BMW used-car listings data

## Libraries and settings

In [1]:
# Libraries
import os
import re
import time
import fnmatch
import numpy as np
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Get current working directory
print(os.getcwd())

/workspaces/DA_Project/Project/Minimum/1


## Importing data

In [2]:
# List every .csv file found in the current directory
for csv_name in fnmatch.filter(os.listdir('.'), '*.csv'):
    print(csv_name)

# Load the scraped listings into a pandas data frame
df = pd.read_csv(
    'comparis_bmw.csv',
    sep=',',
    encoding='utf-8',
)

# Preview the first records
df.head()

comparis_bmw.csv


Unnamed: 0,web-scraper-order,web-scraper-start-url,text_raw,price_raw,model_raw,type_year_km_transmission_fuel
0,1704481912-1,https://www.comparis.ch/carfinder/marktplatz/b...,BMW 220 Gran Tourer 220d xDrive Gran Tourer St...,CHF 21'000,220d xDrive Gran Tourer Steptronic,Occasion10.2017107.000 kmAutomatDiesel1299 (VD)
1,1704481912-2,https://www.comparis.ch/carfinder/marktplatz/b...,BMW X5M X5M SteptronicCHF 38'500ab CHF 616 / M...,CHF 38'500,X5M Steptronic,Occasion08.2016171.000 kmAutomatBenzin8730 (SG)
2,1704481912-3,https://www.comparis.ch/carfinder/marktplatz/b...,BMW X3 X3 M40d Individual SteptronicCHF 44'990...,CHF 44'990,X3 M40d Individual Steptronic,Occasion12.201988.900 kmAutomatDiesel3178 (FR)
3,1704481912-4,https://www.comparis.ch/carfinder/marktplatz/b...,BMW X5 X5 xDrive 30d SteptronicCHF 26'500ab CH...,CHF 26'500,X5 xDrive 30d Steptronic,Occasion11.2016164.900 kmAutomatDiesel4461 (BL)
4,1704481912-5,https://www.comparis.ch/carfinder/marktplatz/b...,BMW 320 320i Cabriolet SteptronicCHF 16'750ab ...,CHF 16'750,320i Cabriolet Steptronic,Occasion06.201079.600 kmAutomatBenzin4461 (BL)


## Count number of rows and columns in the data frame

In [3]:
# Unpack the shape once: first entry is the row count, second the column count
n_rows, n_cols = df.shape

# Dimension (rows, columns)
print('Dimension:', df.shape)

# Number of rows
print('Number of rows:', n_rows)

# Number of columns
print('Number of columns:', n_cols)

Dimension: (50, 6)
Number of rows: 50
Number of columns: 6


## Get data types (raw-format from web scraping)

In [4]:
# Show the dtype of every column. pandas reports string columns as 'object';
# all scraped columns are still raw strings at this stage.
df.dtypes

web-scraper-order                 object
web-scraper-start-url             object
text_raw                          object
price_raw                         object
model_raw                         object
type_year_km_transmission_fuel    object
dtype: object

## Extract and save relevant information from raw data using regular expressions (regex)

### Output type_year_km_transmission_fuel

In [5]:
# Preview the combined raw column that the following cells take apart with regex
raw_details = df['type_year_km_transmission_fuel']
print(raw_details.head(5), '\n')

0    Occasion10.2017107.000 kmAutomatDiesel1299 (VD)
1    Occasion08.2016171.000 kmAutomatBenzin8730 (SG)
2     Occasion12.201988.900 kmAutomatDiesel3178 (FR)
3    Occasion11.2016164.900 kmAutomatDiesel4461 (BL)
4     Occasion06.201079.600 kmAutomatBenzin4461 (BL)
Name: type_year_km_transmission_fuel, dtype: object 



## Extract and save relevant information from raw data using regular expressions (regex)

### Extract Location

In [6]:
# Extract the location code, i.e. the text inside the first pair of parentheses
# of the raw string (e.g. '...1299 (VD)' -> 'VD').
#
# The vectorized .str.extract() replaces the former loop + bare `except:`:
#   * rows without a match (or with a missing raw value) become NaN instead of
#     relying on a catch-all exception handler,
#   * the result keeps the index of `df`, so the assignment does not depend on
#     `df` having a default 0..n-1 index.
df['Location'] = df['type_year_km_transmission_fuel'].str.extract(r'\((.*?)\)', expand=False)

# Show the first 5 values
print(df['type_year_km_transmission_fuel'].head(5), '\n')
print(df['Location'].head(5))

0    Occasion10.2017107.000 kmAutomatDiesel1299 (VD)
1    Occasion08.2016171.000 kmAutomatBenzin8730 (SG)
2     Occasion12.201988.900 kmAutomatDiesel3178 (FR)
3    Occasion11.2016164.900 kmAutomatDiesel4461 (BL)
4     Occasion06.201079.600 kmAutomatBenzin4461 (BL)
Name: type_year_km_transmission_fuel, dtype: object 

0    VD
1    SG
2    FR
3    BL
4    BL
Name: Location, dtype: object


### Extract PLZ

In [7]:
# Extract the postal code (PLZ): the four digits directly in front of the
# parenthesised location, e.g. '...Diesel1299 (VD)' -> 1299.
#
# .str.extract() returns one column per capture group; column 0 is the PLZ.
# The result keeps the index of `df`, so no positional list has to be
# re-aligned, and rows without a match become NaN.
plz_text = df['type_year_km_transmission_fuel'].str.extract(r'(\d{4}) \(([^)]+)\)')[0]

# Convert the digit strings explicitly; 'Int64' is the nullable integer dtype,
# so unmatched rows end up as <NA> instead of forcing a float column.
df['PLZ'] = pd.to_numeric(plz_text).astype('Int64')

# Show the first 5 values
print(df['type_year_km_transmission_fuel'].head(5), '\n')
print(df['PLZ'].head(5))

0    Occasion10.2017107.000 kmAutomatDiesel1299 (VD)
1    Occasion08.2016171.000 kmAutomatBenzin8730 (SG)
2     Occasion12.201988.900 kmAutomatDiesel3178 (FR)
3    Occasion11.2016164.900 kmAutomatDiesel4461 (BL)
4     Occasion06.201079.600 kmAutomatBenzin4461 (BL)
Name: type_year_km_transmission_fuel, dtype: object 

0    1299
1    8730
2    3178
3    4461
4    4461
Name: PLZ, dtype: Int64


### Extract km

In [8]:
# Extract the mileage as text, e.g. 'Occasion10.2017107.000 km...' -> '107.000'.
#
# The pattern is anchored on both sides:
#   * '\d{2}\.\d{4}' consumes the registration date (MM.YYYY, literal dot), so
#     the year digits are never mistaken for part of the mileage,
#   * ' km' must follow directly, so only the mileage field can match,
#   * the '.'-separated thousands groups are optional, so a mileage written
#     without a separator (e.g. '500 km') is captured as well instead of being
#     dropped.
# Rows without a match become NaN.
km_pattern = r'\d{2}\.\d{4}(\d+(?:\.\d+)*) km'

# Add km_raw as a new column (kept as text; it is converted to an integer later)
df['km_raw'] = df['type_year_km_transmission_fuel'].str.extract(km_pattern, expand=False)

# Show the first 5 values
print(df['type_year_km_transmission_fuel'].head(5), '\n')
print(df['km_raw'].head(5))

0    Occasion10.2017107.000 kmAutomatDiesel1299 (VD)
1    Occasion08.2016171.000 kmAutomatBenzin8730 (SG)
2     Occasion12.201988.900 kmAutomatDiesel3178 (FR)
3    Occasion11.2016164.900 kmAutomatDiesel4461 (BL)
4     Occasion06.201079.600 kmAutomatBenzin4461 (BL)
Name: type_year_km_transmission_fuel, dtype: object 

0    107.000
1    171.000
2     88.900
3    164.900
4     79.600
Name: km_raw, dtype: object


### Extract year

In [9]:
# Extract the registration year from the 'MM.YYYY' date in the raw string,
# e.g. 'Occasion10.2017107.000 km...' -> 2017.
#
# The pattern requires the two-digit month and a literal dot in front of the
# year. The former pattern '.(\d{4})' accepted ANY character followed by four
# digits, so a row without a date would have returned the next four-digit run
# (e.g. the postal code) as the year.
year_text = df['type_year_km_transmission_fuel'].str.extract(r'\d{2}\.(\d{4})', expand=False)

# Add Year as a new column; 'Int64' keeps unmatched rows as <NA>
df['Year'] = pd.to_numeric(year_text).astype('Int64')

# Show the first 5 values
print(df['type_year_km_transmission_fuel'].head(5), '\n')
print(df['Year'].head(5))

0    Occasion10.2017107.000 kmAutomatDiesel1299 (VD)
1    Occasion08.2016171.000 kmAutomatBenzin8730 (SG)
2     Occasion12.201988.900 kmAutomatDiesel3178 (FR)
3    Occasion11.2016164.900 kmAutomatDiesel4461 (BL)
4     Occasion06.201079.600 kmAutomatBenzin4461 (BL)
Name: type_year_km_transmission_fuel, dtype: object 

0    2017
1    2016
2    2019
3    2016
4    2010
Name: Year, dtype: Int64


### Extraction of all Information

In [10]:
# Extract type, registration date, mileage, transmission, fuel, postal code and
# location from the raw string in a single pass.
# The pattern is assembled from one fragment per capture group; the dot in the
# date is escaped so that only a literal '.' separates month and year.
all_fields_pattern = (
    r'(\w+)'            # Type, e.g. 'Occasion'
    r'(\d{2}\.\d{4})'   # registration date MM.YYYY
    r'(\d+\.\d+ km)'    # mileage including the unit
    r'([A-Z]\w+)'       # transmission, e.g. 'Automat'
    r'([A-Z]\w+)'       # fuel, e.g. 'Diesel'
    r'(\d{4})'          # postal code
    r' \(([^)]+)\)'     # location inside the parentheses
)
extracted = df['type_year_km_transmission_fuel'].str.extract(all_fields_pattern)
extracted.columns = ['Type', 'Year', 'Kilometers', 'Transmission', 'Fuel', 'PLZ', 'Location']

# Print the first 5 extracted records
print(extracted.head())

       Type     Year  Kilometers Transmission    Fuel   PLZ Location
0  Occasion  10.2017  107.000 km      Automat  Diesel  1299       VD
1  Occasion  08.2016  171.000 km      Automat  Benzin  8730       SG
2  Occasion  12.2019   88.900 km      Automat  Diesel  3178       FR
3  Occasion  11.2016  164.900 km      Automat  Diesel  4461       BL
4  Occasion  06.2010   79.600 km      Automat  Benzin  4461       BL


### Output price_raw

In [11]:
# Preview the raw price strings before parsing them
raw_prices = df['price_raw']
print(raw_prices.head(5), '\n')

0    CHF 21'000
1    CHF 38'500
2    CHF 44'990
3    CHF 26'500
4    CHF 16'750
Name: price_raw, dtype: object 



In [12]:
# Extract the numeric part of 'price_raw', e.g. "CHF 21'000" -> "21'000".
# The pattern matches digits optionally followed by apostrophe-separated digit
# groups; the former character class ['\d+] also accepted a literal '+'.
price_text = df['price_raw'].str.extract(r"(\d+(?:'\d+)*)", expand=False)

# Remove the apostrophes as a literal string (regex=False makes this
# independent of the pandas version's default) and convert to nullable integer.
df['price'] = pd.to_numeric(price_text.str.replace("'", "", regex=False)).astype('Int64')

# Print the first 5 values of the 'price_raw' and 'price' columns
print(df['price_raw'].head(5), '\n')
print(df['price'].head(5))

0    CHF 21'000
1    CHF 38'500
2    CHF 44'990
3    CHF 26'500
4    CHF 16'750
Name: price_raw, dtype: object 

0    21000
1    38500
2    44990
3    26500
4    16750
Name: price, dtype: Int64


### Output km_raw

In [13]:
# Preview the raw mileage strings before converting them to integers
raw_km = df['km_raw']
print(raw_km.head(5), '\n')

0    107.000
1    171.000
2     88.900
3    164.900
4     79.600
Name: km_raw, dtype: object 



In [14]:
# Convert 'km_raw' (e.g. '107.000') to an integer number of kilometres.
# The dot is a thousands separator and is removed as a LITERAL character:
# regex=False is passed explicitly because '.' is a regex wildcard and the
# default of the `regex` parameter differs between pandas versions.
# (The former additional .str.extract() line was dead code: its result was
# overwritten by the next statement.)
km_digits = df['km_raw'].str.replace(".", "", regex=False)
df['km'] = pd.to_numeric(km_digits).astype('Int64')

# Print the first 5 values of the 'km_raw' and 'km' columns
print(df['km_raw'].head(5), '\n')
print(df['km'].head(5))

0    107.000
1    171.000
2     88.900
3    164.900
4     79.600
Name: km_raw, dtype: object 

0    107000
1    171000
2     88900
3    164900
4     79600
Name: km, dtype: Int64


### Get data types of all variables including the new ones

In [15]:
# List the dtype of every column, including the newly derived ones
df.dtypes

web-scraper-order                 object
web-scraper-start-url             object
text_raw                          object
price_raw                         object
model_raw                         object
type_year_km_transmission_fuel    object
Location                          object
PLZ                                Int64
km_raw                            object
Year                               Int64
price                              Int64
km                                 Int64
dtype: object

## Create additional variables

### Change string 'text_raw' to uppercase 

In [16]:
# Upper-case the free text so that later keyword searches are case-insensitive
upper_text = df['text_raw'].str.upper()
df['text_raw'] = upper_text
print(upper_text.head(10))

0    BMW 220 GRAN TOURER 220D XDRIVE GRAN TOURER ST...
1    BMW X5M X5M STEPTRONICCHF 38'500AB CHF 616 / M...
2    BMW X3 X3 M40D INDIVIDUAL STEPTRONICCHF 44'990...
3    BMW X5 X5 XDRIVE 30D STEPTRONICCHF 26'500AB CH...
4    BMW 320 320I CABRIOLET STEPTRONICCHF 16'750AB ...
5    BMW 525 525D TOURING XDRIVE LUXURY LINE STEPTR...
6    BMW X1 X1 XDRIVE 25D XLINE STEPTRONICCHF 24'90...
7    BMW X5 X5 XDRIVE 30D (3.0D) STEPTRONICCHF 8'90...
8    BMW M4 M4 CABRIOLET DRIVELOGIC M COMPETITIONCH...
9    BMW M135 M135I XDRIVE STEPTRONICCHF 36'990AB C...
Name: text_raw, dtype: object


In [17]:
# Print the complete 'text_raw' string of the row labelled 0
first_text = df.loc[0, 'text_raw']
print(first_text)

BMW 220 GRAN TOURER 220D XDRIVE GRAN TOURER STEPTRONICCHF 21'000AB CHF 336 / MONAT4.5 / 6OCCASION10.2017107.000 KMAUTOMATDIESEL1299 (VD)PREISCHF 21'000FINANZIERUNGAB CHF 336 / MONAT4.5 / 6ANFRAGEN MERKENDETAILS


### Create new binary variables (0/1) luxury, competition, xDrive, automat

In [18]:
# Keywords searched in 'text_raw' (which was upper-cased in an earlier cell);
# key = name of the new dummy column, value = keyword to look for.
# 'text_raw' is used because it contains more information than the other columns.
feature_keywords = {
    'luxury': 'LUXURY',
    'competition': 'COMPETITION',
    'xDrive': 'XDRIVE',
    'automat': 'AUTOMAT',
}

# Create one binary (0/1) dummy variable per keyword.
#   regex=False -> plain substring search; the former '(KEYWORD)' patterns
#                  contained capture groups, for which str.contains() emits a
#                  warning about unused match groups
#   na=False    -> a missing text counts as "keyword absent" instead of
#                  producing NaN, which would make .astype(int) fail
for column, keyword in feature_keywords.items():
    df[column] = df['text_raw'].str.contains(keyword, regex=False, na=False).astype(int)

# Number of listings per feature
for column in feature_keywords:
    print(df[column].sum())

# Show values
df[['text_raw', 'luxury', 'competition', 'xDrive', 'automat']]

5
5
30
50


Unnamed: 0,text_raw,luxury,competition,xDrive,automat
0,BMW 220 GRAN TOURER 220D XDRIVE GRAN TOURER ST...,0,0,1,1
1,BMW X5M X5M STEPTRONICCHF 38'500AB CHF 616 / M...,0,0,0,1
2,BMW X3 X3 M40D INDIVIDUAL STEPTRONICCHF 44'990...,0,0,0,1
3,BMW X5 X5 XDRIVE 30D STEPTRONICCHF 26'500AB CH...,0,0,1,1
4,BMW 320 320I CABRIOLET STEPTRONICCHF 16'750AB ...,0,0,0,1
5,BMW 525 525D TOURING XDRIVE LUXURY LINE STEPTR...,1,0,1,1
6,BMW X1 X1 XDRIVE 25D XLINE STEPTRONICCHF 24'90...,0,0,1,1
7,BMW X5 X5 XDRIVE 30D (3.0D) STEPTRONICCHF 8'90...,0,0,1,1
8,BMW M4 M4 CABRIOLET DRIVELOGIC M COMPETITIONCH...,0,1,0,1
9,BMW M135 M135I XDRIVE STEPTRONICCHF 36'990AB C...,0,0,1,1


### Create new categorical variable based on km

In [19]:
# Define classes (labels) and their boundaries in kilometres.
# The last edge is open-ended so that very high mileages still fall into
# '120k+' instead of becoming NaN.
km_labels = ['0-50k', '50-120k', '120k+']
km_bins = [0, 50000, 120000, np.inf]

# Divide the numeric values into classes. Intervals are right-inclusive;
# include_lowest=True additionally puts exactly 0 km into the first class
# (without it a value equal to the lowest edge is left unclassified).
df["km_cat"] = pd.cut(df['km'], bins=km_bins, labels=km_labels, include_lowest=True)

# Show original data and classes
df[['km', 'km_cat']].head(10)

Unnamed: 0,km,km_cat
0,107000,50-120k
1,171000,120k+
2,88900,50-120k
3,164900,120k+
4,79600,50-120k
5,110700,50-120k
6,90100,50-120k
7,250000,120k+
8,59900,50-120k
9,33600,0-50k


### Create new numeric variable 'km_per_year'

In [20]:
# Year used as "today" when computing the vehicle age. It is kept as a fixed
# constant (rather than the current date) so that re-running the notebook
# reproduces the same values.
REFERENCE_YEAR = 2024

# Vehicle age in years; ages <= 0 (registered in the reference year or later)
# are set to missing so the division below cannot divide by zero.
vehicle_age = REFERENCE_YEAR - df['Year']
vehicle_age = vehicle_age.where(vehicle_age > 0)

# Create the new variable: average kilometres driven per year
df['km_per_year'] = (df['km'] / vehicle_age).round(2)

# Show values
df[['Year','km','price','km_per_year']].head(10)

Unnamed: 0,Year,km,price,km_per_year
0,2017,107000,21000,15285.71
1,2016,171000,38500,21375.0
2,2019,88900,44990,17780.0
3,2016,164900,26500,20612.5
4,2010,79600,16750,5685.71
5,2014,110700,21500,11070.0
6,2017,90100,24900,12871.43
7,2007,250000,8900,14705.88
8,2019,59900,63890,11980.0
9,2020,33600,36990,8400.0


### Create new categorical variable based on km_per_year

In [21]:
# Define classes (labels) and their boundaries in kilometres per year.
# The last edge is open-ended so that values above 50000 km/year are still
# classified as 'high' instead of becoming NaN.
usage_labels = ['low', 'medium', 'high']
usage_bins = [0, 10000, 15000, np.inf]

# Divide the numeric values into classes. Intervals are right-inclusive;
# include_lowest=True additionally puts exactly 0 km/year into 'low'.
df["km_per_year_cat"] = pd.cut(
    df['km_per_year'],
    bins=usage_bins,
    labels=usage_labels,
    include_lowest=True,
)

# Show original data and classes
df[['km_per_year','km_cat','km_per_year_cat']].head(10)

Unnamed: 0,km_per_year,km_cat,km_per_year_cat
0,15285.71,50-120k,high
1,21375.0,120k+,high
2,17780.0,50-120k,high
3,20612.5,120k+,high
4,5685.71,50-120k,low
5,11070.0,50-120k,medium
6,12871.43,50-120k,medium
7,14705.88,120k+,medium
8,11980.0,50-120k,medium
9,8400.0,0-50k,low


## Count and identify missing values (if any)

It seems like all the rows in the DataFrame df have complete data without any missing values

In [22]:
# Boolean frame marking every missing cell
missing_mask = df.isna()

# Number of missing values per column
print(missing_mask.sum())

# Rows that contain at least one missing value
rows_with_missing = missing_mask.any(axis=1)
df[rows_with_missing].head()

web-scraper-order                 0
web-scraper-start-url             0
text_raw                          0
price_raw                         0
model_raw                         0
type_year_km_transmission_fuel    0
Location                          0
PLZ                               0
km_raw                            0
Year                              0
price                             0
km                                0
luxury                            0
competition                       0
xDrive                            0
automat                           0
km_cat                            0
km_per_year                       0
km_per_year_cat                   0
dtype: int64


Unnamed: 0,web-scraper-order,web-scraper-start-url,text_raw,price_raw,model_raw,type_year_km_transmission_fuel,Location,PLZ,km_raw,Year,price,km,luxury,competition,xDrive,automat,km_cat,km_per_year,km_per_year_cat


## Count and identify duplicated values (if any)

It seems like there are no duplicates in the DataFrame

In [23]:
# Number of rows that are exact copies of an earlier row
n_duplicates = df.duplicated().sum()
print(n_duplicates)

# Rows that repeat an earlier row with respect to the identifying columns, e.g.:
key_columns = ['web-scraper-order', 'text_raw', 'model_raw']
df[df.duplicated(subset=key_columns)]

0


Unnamed: 0,web-scraper-order,web-scraper-start-url,text_raw,price_raw,model_raw,type_year_km_transmission_fuel,Location,PLZ,km_raw,Year,price,km,luxury,competition,xDrive,automat,km_cat,km_per_year,km_per_year_cat


### Save data to file

In [24]:
# Write the prepared data frame to a CSV file in the working directory
# (comma-separated, UTF-8, without the index column)
csv_options = dict(sep=",", encoding='utf-8', index=False)
df.to_csv('bmw_data_prepared.csv', **csv_options)

### SQLite3

In [25]:
import sqlite3
from contextlib import closing

# Load the prepared data. The relative path points to the file written by the
# CSV export cell, so the notebook also runs outside the original
# /workspaces/... checkout.
bereinigte_daten = pd.read_csv('bmw_data_prepared.csv')

# Open the database and make sure the connection is closed again even if
# writing the table raises an error.
with closing(sqlite3.connect('bmw_datenbank.db')) as conn:
    # Insert the data; an existing table with the same name is replaced
    bereinigte_daten.to_sql('bmw_tabelle', conn, index=False, if_exists='replace')

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [26]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: record the operating system, timestamp and Python version of this run
separator = '-----------------------------------'
run_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(separator)
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', run_timestamp)
print('Python Version:', python_version())
print(separator)

-----------------------------------
POSIX
Linux | 6.2.0-1018-azure
Datetime: 2024-01-12 16:29:59
Python Version: 3.10.13
-----------------------------------
