# Importing and preparing rental apartments data

## Libraries and settings

In [1]:
# Libraries
import os
import re
import time
import fnmatch
import numpy as np
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Get current working directory
print(os.getcwd())

/workspaces/DA_Project/Project/Minimum/1_old


## Importing data

In [2]:
# Show .csv - files in the directory
flist = fnmatch.filter(os.listdir('.'), '*.csv')
for i in flist:
    print(i)

# Read the data to a pandas data frame
df = pd.read_csv('comparis_bmw.csv', sep=',', encoding='utf-8')

# Show first records of data frame
df.head()

comparis_bmw.csv
bmw_data_prepared.csv


Unnamed: 0,web-scraper-order,web-scraper-start-url,text_raw,price_raw,model_raw,type_year_km_transmission_fuel
0,1704481912-1,https://www.comparis.ch/carfinder/marktplatz/b...,BMW 220 Gran Tourer 220d xDrive Gran Tourer St...,CHF 21'000,220d xDrive Gran Tourer Steptronic,Occasion10.2017107.000 kmAutomatDiesel1299 (VD)
1,1704481912-2,https://www.comparis.ch/carfinder/marktplatz/b...,BMW X5M X5M SteptronicCHF 38'500ab CHF 616 / M...,CHF 38'500,X5M Steptronic,Occasion08.2016171.000 kmAutomatBenzin8730 (SG)
2,1704481912-3,https://www.comparis.ch/carfinder/marktplatz/b...,BMW X3 X3 M40d Individual SteptronicCHF 44'990...,CHF 44'990,X3 M40d Individual Steptronic,Occasion12.201988.900 kmAutomatDiesel3178 (FR)
3,1704481912-4,https://www.comparis.ch/carfinder/marktplatz/b...,BMW X5 X5 xDrive 30d SteptronicCHF 26'500ab CH...,CHF 26'500,X5 xDrive 30d Steptronic,Occasion11.2016164.900 kmAutomatDiesel4461 (BL)
4,1704481912-5,https://www.comparis.ch/carfinder/marktplatz/b...,BMW 320 320i Cabriolet SteptronicCHF 16'750ab ...,CHF 16'750,320i Cabriolet Steptronic,Occasion06.201079.600 kmAutomatBenzin4461 (BL)


## Count number of rows and columns in the data frame

In [3]:
# Dimension (rows, columns)
print('Dimension:', df.shape)

# Number of rows
print('Number of rows:', df.shape[0])

# Number of columns
print('Number of columns:', df.shape[1])

Dimension: (50, 6)
Number of rows: 50
Number of columns: 6


## Get data types (raw-format from web scraping)

In [4]:
# Get data types (note that in pandas, a string is referred to as 'object')
df.dtypes

web-scraper-order                 object
web-scraper-start-url             object
text_raw                          object
price_raw                         object
model_raw                         object
type_year_km_transmission_fuel    object
dtype: object

## Extract and save relevant information from raw data using regular expressions (regex)

### Output type_year_km_transmission_fuel

In [5]:
print(df['type_year_km_transmission_fuel'].head(5), '\n')

0    Occasion10.2017107.000 kmAutomatDiesel1299 (VD)
1    Occasion08.2016171.000 kmAutomatBenzin8730 (SG)
2     Occasion12.201988.900 kmAutomatDiesel3178 (FR)
3    Occasion11.2016164.900 kmAutomatDiesel4461 (BL)
4     Occasion06.201079.600 kmAutomatBenzin4461 (BL)
Name: type_year_km_transmission_fuel, dtype: object 



## Extract and save relevant information from raw data using regular expressions (regex)

### Extract Location

In [6]:
# Extrahiere Location
Location = []
for item in df['type_year_km_transmission_fuel']:
    location = re.findall(r'\((.*?)\)', item)
    try:
        Location.append(location[0])
    except:
        Location.append(None)

# Füge Location als neue Spalte hinzu
df['Location'] = pd.Series(Location, dtype='str')

# Zeige die ersten 5 Werte 
print(df['type_year_km_transmission_fuel'].head(5), '\n')
print(df['Location'].head(5))

0    Occasion10.2017107.000 kmAutomatDiesel1299 (VD)
1    Occasion08.2016171.000 kmAutomatBenzin8730 (SG)
2     Occasion12.201988.900 kmAutomatDiesel3178 (FR)
3    Occasion11.2016164.900 kmAutomatDiesel4461 (BL)
4     Occasion06.201079.600 kmAutomatBenzin4461 (BL)
Name: type_year_km_transmission_fuel, dtype: object 

0    VD
1    SG
2    FR
3    BL
4    BL
Name: Location, dtype: object


### Extract PLZ

In [7]:
# Extrahiere PLZ
PLZ = []
for item in df['type_year_km_transmission_fuel']:
    plz = re.search(r'(\d{4}) \(([^)]+)\)', item)
    if plz:
        PLZ.append(plz.group(1))
    else:
        PLZ.append(None)

# Füge PLZ als neue Spalte hinzu
df['PLZ'] = pd.Series(PLZ, dtype="Int64")

# Zeige die ersten 5 Werte
print(df['type_year_km_transmission_fuel'].head(5), '\n')
print(df['PLZ'].head(5))

0    Occasion10.2017107.000 kmAutomatDiesel1299 (VD)
1    Occasion08.2016171.000 kmAutomatBenzin8730 (SG)
2     Occasion12.201988.900 kmAutomatDiesel3178 (FR)
3    Occasion11.2016164.900 kmAutomatDiesel4461 (BL)
4     Occasion06.201079.600 kmAutomatBenzin4461 (BL)
Name: type_year_km_transmission_fuel, dtype: object 

0    1299
1    8730
2    3178
3    4461
4    4461
Name: PLZ, dtype: Int64


### Extraction of all Information

In [8]:
# Extrahieren von Typ, Jahr, Kilometerstand, Getriebeart, Treibstofftyp, Postleitzahl und Location
extracted = df['type_year_km_transmission_fuel'].str.extract(r'(\w+)(\d{2}.\d{4})(\d+\.\d+ km)([A-Z]\w+)([A-Z]\w+)(\d{4}) \(([^)]+)\)')
extracted.columns = ['Type', 'Year', 'Kilometers', 'Transmission', 'Fuel', 'PLZ', 'Location']

# Ausgabe der ersten 5 extrahierten Datensätze
print(extracted.head())

       Type     Year  Kilometers Transmission    Fuel   PLZ Location
0  Occasion  10.2017  107.000 km      Automat  Diesel  1299       VD
1  Occasion  08.2016  171.000 km      Automat  Benzin  8730       SG
2  Occasion  12.2019   88.900 km      Automat  Diesel  3178       FR
3  Occasion  11.2016  164.900 km      Automat  Diesel  4461       BL
4  Occasion  06.2010   79.600 km      Automat  Benzin  4461       BL


### Output price_raw

In [9]:
print(df['price_raw'].head(5), '\n')

0    CHF 21'000
1    CHF 38'500
2    CHF 44'990
3    CHF 26'500
4    CHF 16'750
Name: price_raw, dtype: object 



In [10]:
# Verwende str.extract(), um Zahlen aus der 'price_raw'-Spalte zu extrahieren
df['price'] = df['price_raw'].str.extract(r"(\d+['\d+]*)", expand=False)
df['price'] = df['price'].str.replace("'", "").astype('Int64')  # Entferne Apostrophe und ändere den Datentyp in 'Int64'

# Gib die ersten 5 Werte der 'price_raw'- und 'price'-Spalten aus
print(df['price_raw'].head(5), '\n')
print(df['price'].head(5))

0    CHF 21'000
1    CHF 38'500
2    CHF 44'990
3    CHF 26'500
4    CHF 16'750
Name: price_raw, dtype: object 

0    21000
1    38500
2    44990
3    26500
4    16750
Name: price, dtype: Int64


### Get data types of all variables including the new ones

In [11]:
df.dtypes

web-scraper-order                 object
web-scraper-start-url             object
text_raw                          object
price_raw                         object
model_raw                         object
type_year_km_transmission_fuel    object
Location                          object
PLZ                                Int64
price                              Int64
dtype: object

## Count and identify missing values (if any)

It seems like all the rows in the DataFrame df have complete data without any missing values

In [12]:
# Count missing values
print(pd.isna(df).sum())

# Identify rows with missing values
df[df.isna().any(axis=1)].head()

web-scraper-order                 0
web-scraper-start-url             0
text_raw                          0
price_raw                         0
model_raw                         0
type_year_km_transmission_fuel    0
Location                          0
PLZ                               0
price                             0
dtype: int64


Unnamed: 0,web-scraper-order,web-scraper-start-url,text_raw,price_raw,model_raw,type_year_km_transmission_fuel,Location,PLZ,price


## Count and identify duplicated values (if any)

It seems like there are no duplicates in the DataFrame

In [13]:
# Count duplicated values
print(df.duplicated().sum())

# Identify rows with duplicated values, e.g.:
df[df[['web-scraper-order', 'text_raw', 'model_raw']].duplicated()]

0


Unnamed: 0,web-scraper-order,web-scraper-start-url,text_raw,price_raw,model_raw,type_year_km_transmission_fuel,Location,PLZ,price


### Save data to file

In [14]:
df.to_csv('bmw_data_prepared.csv', 
          sep=",", 
          encoding='utf-8',
          index=False)

### SQLlite

In [15]:
import sqlite3
 
# Daten laden, indem Sie den vollständigen Pfad zur Datei angeben
bereinigte_daten = pd.read_csv('/workspaces/DA_Project/Project/Minimum/1/bmw_data_prepared.csv')
 
# Verbindung zur Datenbank herstellen
conn = sqlite3.connect('bmw_datenbank.db')
 
# Daten in die Datenbank einfügen
bereinigte_daten.to_sql('bmw_tabelle', conn, index=False, if_exists='replace')
 
# Verbindung schließen
conn.close()

FileNotFoundError: [Errno 2] No such file or directory: '/workspaces/DA_Project/Project/Minimum/1/bmw_data_prepared.csv'

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 6.2.0-1018-azure
Datetime: 2024-01-12 16:03:47
Python Version: 3.10.13
-----------------------------------
