In [98]:
import pandas as pd

# Source: the "tips" sample dataset hosted in the seaborn-data GitHub repository.
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv"

# Read the remote CSV into a DataFrame.
df = pd.read_csv(url)

# Preview the first five rows as the cell output.
df.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


- Explore the data

In [99]:
# A notebook cell only renders its LAST bare expression; earlier bare
# expressions are evaluated and silently discarded. Print the shape and the
# column names explicitly so they actually appear in the cell output.
print(f"shape: {df.shape}")
print(f"columns: {df.columns.tolist()}")

# info() writes its own summary (dtypes, non-null counts, memory usage) to stdout.
df.info()

# Last expression: summary statistics of the numeric columns, shown as the
# cell's rich output.
df.describe()

<class 'pandas.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    str    
 3   smoker      244 non-null    str    
 4   day         244 non-null    str    
 5   time        244 non-null    str    
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), str(4)
memory usage: 13.5 KB


Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [100]:
# Rows whose total bill is strictly greater than 20.
df.loc[df["total_bill"].gt(20)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
7,26.88,3.12,Male,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
237,32.83,1.17,Male,Yes,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2


In [101]:
# Average total bill for each day of the week (raw data).
df.groupby(by="day")["total_bill"].agg("mean")

day
Fri     17.151579
Sat     20.441379
Sun     21.410000
Thur    17.682742
Name: total_bill, dtype: float64

In [102]:
# Number of missing values in each column.
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

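- `dropna()` removes rows with missing values to ensure accurate analysis and model training. The null check above found no missing values, so this step is a safeguard and leaves the data unchanged.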

In [103]:
# Drop every row that has at least one missing value; the raw frame `df` is left untouched.
df_clean = df.dropna(axis="index", how="any")

- Remove duplicate rows with `drop_duplicates()` so that repeated records do not skew the analysis or model training.

In [104]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df_clean = df_clean.drop_duplicates(keep="first")

- Using the cleaned data, extract insights from the dataset.

In [105]:
# Overall average total bill in the cleaned data.
df_clean.loc[:, "total_bill"].mean()

np.float64(19.813868312757204)

In [106]:
# Sum of all tips in the cleaned data.
df_clean.loc[:, "tip"].sum()

np.float64(729.5799999999999)

In [107]:
# Average tip for each day of the week.
df_clean.groupby(by="day")["tip"].agg("mean")

day
Fri     2.734737
Sat     2.993103
Sun     3.255132
Thur    2.784098
Name: tip, dtype: float64

In [108]:
# Average total bill for each day of the week (cleaned data).
df_clean["total_bill"].groupby(df_clean["day"]).mean()


day
Fri     17.151579
Sat     20.441379
Sun     21.410000
Thur    17.759508
Name: total_bill, dtype: float64

- Which day has the highest average tip?

In [109]:
# Average tip per day; the day with the largest value answers the question above.
df_clean["tip"].groupby(df_clean["day"]).mean()

day
Fri     2.734737
Sat     2.993103
Sun     3.255132
Thur    2.784098
Name: tip, dtype: float64

- Do males or females tip more?


In [110]:
# Average tip for each value of the `sex` column.
df_clean["tip"].groupby(df_clean["sex"]).mean()

sex
Female    2.843140
Male      3.089618
Name: tip, dtype: float64

- Is the average dinner bill higher than the average lunch bill?

In [111]:
# Average total bill for each value of the `time` column (Dinner / Lunch).
df_clean.groupby(by="time")["total_bill"].agg("mean")

time
Dinner    20.797159
Lunch     17.230896
Name: total_bill, dtype: float64

In [112]:
# Order rows from largest to smallest tip, then keep the first five.
df_clean.sort_values(by="tip", ascending=False).iloc[:5]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
170,50.81,10.0,Male,Yes,Sat,Dinner,3
212,48.33,9.0,Male,No,Sat,Dinner,4
23,39.42,7.58,Male,No,Sat,Dinner,4
59,48.27,6.73,Male,No,Sat,Dinner,4
141,34.3,6.7,Male,No,Thur,Lunch,6


- The top 5 highest tips mostly occur on Saturday at dinner time (4 of the 5 rows above).

In [113]:
# Write the cleaned data to disk; index=False leaves the row index out of the file.
df_clean.to_csv(path_or_buf="cleaned_tips.csv", index=False)