<p><font size="6" color='grey'> <b>
Machine Learning
</b></font> </br></p>
<p><font size="5" color='grey'> <b>
Pandas Basic - Daten zu Restaurantbesuche
</b></font> </br></p>

---

# Import Bibliotheken

In [1]:
import pandas as pd
import plotly.express as px
import numpy as np

# Daten vorbereiten

In [2]:
tips = px.data.tips()

data = tips.drop(["tip"], axis=1)
target = tips.tip

data.head(4)

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2


# Information zu Daten abfragen


In [3]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.785943,8.902412,3.07,13.3475,17.795,24.1275,50.81
size,244.0,2.569672,0.9511,1.0,2.0,2.0,3.0,6.0


In [4]:
data.columns

Index(['total_bill', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [5]:
data.dtypes

Unnamed: 0,0
total_bill,float64
sex,object
smoker,object
day,object
time,object
size,int64


In [6]:
data.shape

(244, 6)

In [7]:
duplicates = data["time"].duplicated(keep="first")

In [8]:
data.ndim

2

In [9]:
data.size

1464

In [10]:
data["total_bill"].mean()

np.float64(19.78594262295082)

In [11]:
data.corr(numeric_only=True, method="pearson")
data.corr(numeric_only=True, method="spearman")

Unnamed: 0,total_bill,size
total_bill,1.0,0.604791
size,0.604791,1.0


In [12]:
data.corrwith(
    data["total_bill"], numeric_only=True
)  # method{‘pearson’, ‘kendall’, ‘spearman’} or callable

Unnamed: 0,0
total_bill,1.0
size,0.598315


In [13]:
data.count()

Unnamed: 0,0
total_bill,244
sex,244
smoker,244
day,244
time,244
size,244


In [14]:
data.cov(numeric_only=True)

Unnamed: 0,total_bill,size
total_bill,79.252939,5.065983
size,5.065983,0.904591


In [15]:
data.std(numeric_only=True)

Unnamed: 0,0
total_bill,8.902412
size,0.9511


In [16]:
data.isna().sum()

Unnamed: 0,0
total_bill,0
sex,0
smoker,0
day,0
time,0
size,0


# EDA - Exploratory Data Analysis

In [17]:
!uv pip install -q https://github.com/ydataai/ydata-profiling/archive/master.zip[notebook]

In [18]:
# Import
import numpy as np
import ydata_profiling
from ydata_profiling.utils.cache import cache_file

In [19]:
# Bericht erstellen und als html speichern
profile_report = tips.profile_report(explorative=True, html={"style": {"full_width": True}})
profile_report.to_file("eda_report.html")
profile_report

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/7 [00:00<?, ?it/s][A
100%|██████████| 7/7 [00:00<00:00, 57.81it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]



# DataFrame - Indizierung

In [20]:
# Spalte/Series
data["time"]

Unnamed: 0,time
0,Dinner
1,Dinner
2,Dinner
3,Dinner
4,Dinner
...,...
239,Dinner
240,Dinner
241,Dinner
242,Dinner


In [21]:
# Zeile, Zeile von/bis
data.iloc[10]
data.iloc[10:15]

Unnamed: 0,total_bill,sex,smoker,day,time,size
10,10.27,Male,No,Sun,Dinner,2
11,35.26,Female,No,Sun,Dinner,4
12,15.42,Male,No,Sun,Dinner,2
13,18.43,Male,No,Sun,Dinner,4
14,14.83,Female,No,Sun,Dinner,2


In [22]:
# Teilbereiche: Zeile von/bis und Spalte von/bis
data.iloc[10:15, 2:4]

Unnamed: 0,smoker,day
10,No,Sun
11,No,Sun
12,No,Sun
13,No,Sun
14,No,Sun


In [23]:
# Teilbereiche alle Zeile, 1-2 Spalte
data.loc[:, "time"]
data.loc[:, ["time", "day"]]
data.loc[:, "time":"day"]

0
1
2
3
4
...
239
240
241
242
243


In [24]:
# eine Zelle mit loc/iloc
data.iloc[10, 0]
data.loc[10, "day"]

'Sun'

# DataFrame aufbreiten

In [25]:
data.isna().sum()  # Detect missing values

Unnamed: 0,0
total_bill,0
sex,0
smoker,0
day,0
time,0
size,0


In [26]:
data.fillna(0)  # Fill NA/NaN values using the specified method. Hier fehlende Werte werden durch 0 ersetzt

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,29.03,Male,No,Sat,Dinner,3
240,27.18,Female,Yes,Sat,Dinner,2
241,22.67,Male,Yes,Sat,Dinner,2
242,17.82,Male,No,Sat,Dinner,2


In [27]:
data["total_bill"] = data["total_bill"].replace(10.27, 10.99)

# DataFrames copy, insert, delete, drop

In [28]:
data_neu = data.copy()
data_neu.shape

(244, 6)

In [29]:
data.insert(4, "target", data.sex)

In [30]:
data.pop("target")  # Return item and delete from frame.

Unnamed: 0,target
0,Female
1,Male
2,Male
3,Male
4,Female
...,...
239,Male
240,Female
241,Male
242,Male


In [31]:
data.drop(["size"], axis=1)  # return frame without specified column  - not delete

Unnamed: 0,total_bill,sex,smoker,day,time
0,16.99,Female,No,Sun,Dinner
1,10.34,Male,No,Sun,Dinner
2,21.01,Male,No,Sun,Dinner
3,23.68,Male,No,Sun,Dinner
4,24.59,Female,No,Sun,Dinner
...,...,...,...,...,...
239,29.03,Male,No,Sat,Dinner
240,27.18,Female,Yes,Sat,Dinner
241,22.67,Male,Yes,Sat,Dinner
242,17.82,Male,No,Sat,Dinner


In [32]:
data.dropna()  # Remove missing values. axis{0 or ‘index’, 1 or ‘columns’}, default 0

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,29.03,Male,No,Sat,Dinner,3
240,27.18,Female,Yes,Sat,Dinner,2
241,22.67,Male,Yes,Sat,Dinner,2
242,17.82,Male,No,Sat,Dinner,2


# DataFrames iterieren

In [33]:
# Iterate over (column name, Series) pairs.
for label, content in data.items():
    print(f"label: {label}")
    print(f"content: {content}", sep="\n")

label: total_bill
content: 0      16.99
1      10.34
2      21.01
3      23.68
4      24.59
       ...  
239    29.03
240    27.18
241    22.67
242    17.82
243    18.78
Name: total_bill, Length: 244, dtype: float64
label: sex
content: 0      Female
1        Male
2        Male
3        Male
4      Female
        ...  
239      Male
240    Female
241      Male
242      Male
243    Female
Name: sex, Length: 244, dtype: object
label: smoker
content: 0       No
1       No
2       No
3       No
4       No
      ... 
239     No
240    Yes
241    Yes
242     No
243     No
Name: smoker, Length: 244, dtype: object
label: day
content: 0       Sun
1       Sun
2       Sun
3       Sun
4       Sun
       ... 
239     Sat
240     Sat
241     Sat
242     Sat
243    Thur
Name: day, Length: 244, dtype: object
label: time
content: 0      Dinner
1      Dinner
2      Dinner
3      Dinner
4      Dinner
        ...  
239    Dinner
240    Dinner
241    Dinner
242    Dinner
243    Dinner
Name: time, Length: 24

In [34]:
for row in data.itertuples():
    print(row)

Pandas(Index=0, total_bill=16.99, sex='Female', smoker='No', day='Sun', time='Dinner', size=2)
Pandas(Index=1, total_bill=10.34, sex='Male', smoker='No', day='Sun', time='Dinner', size=3)
Pandas(Index=2, total_bill=21.01, sex='Male', smoker='No', day='Sun', time='Dinner', size=3)
Pandas(Index=3, total_bill=23.68, sex='Male', smoker='No', day='Sun', time='Dinner', size=2)
Pandas(Index=4, total_bill=24.59, sex='Female', smoker='No', day='Sun', time='Dinner', size=4)
Pandas(Index=5, total_bill=25.29, sex='Male', smoker='No', day='Sun', time='Dinner', size=4)
Pandas(Index=6, total_bill=8.77, sex='Male', smoker='No', day='Sun', time='Dinner', size=2)
Pandas(Index=7, total_bill=26.88, sex='Male', smoker='No', day='Sun', time='Dinner', size=4)
Pandas(Index=8, total_bill=15.04, sex='Male', smoker='No', day='Sun', time='Dinner', size=2)
Pandas(Index=9, total_bill=14.78, sex='Male', smoker='No', day='Sun', time='Dinner', size=2)
Pandas(Index=10, total_bill=10.99, sex='Male', smoker='No', day='Su

# DataFrames logische Operatoren

In [35]:
data.isin([6.2])  # Whether each element in the DataFrame is contained in values, return true/false

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
239,False,False,False,False,False,False
240,False,False,False,False,False,False
241,False,False,False,False,False,False
242,False,False,False,False,False,False


In [36]:
data.total_bill == 6.2

Unnamed: 0,total_bill
0,False
1,False
2,False
3,False
4,False
...,...
239,False
240,False
241,False
242,False


In [37]:
data.total_bill >= 5.0

Unnamed: 0,total_bill
0,True
1,True
2,True
3,True
4,True
...,...
239,True
240,True
241,True
242,True


# DataFrames arithmetische Operatoren

In [38]:
data.total_bill.add(100)

Unnamed: 0,total_bill
0,116.99
1,110.34
2,121.01
3,123.68
4,124.59
...,...
239,129.03
240,127.18
241,122.67
242,117.82


In [39]:
data.total_bill.sub(100)

Unnamed: 0,total_bill
0,-83.01
1,-89.66
2,-78.99
3,-76.32
4,-75.41
...,...
239,-70.97
240,-72.82
241,-77.33
242,-82.18


In [40]:
data["total_bill"].divide(100)

Unnamed: 0,total_bill
0,0.1699
1,0.1034
2,0.2101
3,0.2368
4,0.2459
...,...
239,0.2903
240,0.2718
241,0.2267
242,0.1782


In [41]:
import numpy as np

data.apply(np.sum, axis=0)

Unnamed: 0,0
total_bill,4828.49
sex,FemaleMaleMaleMaleFemaleMaleMaleMaleMaleMaleMa...
smoker,NoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNo...
day,SunSunSunSunSunSunSunSunSunSunSunSunSunSunSunS...
time,DinnerDinnerDinnerDinnerDinnerDinnerDinnerDinn...
size,627
