In [None]:
# ========================
# 1. Import Libraries
# ========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Display settings
# Lift pandas' column display limit so wide frames print every column.
pd.set_option("display.max_columns", None)

# ========================
# 2. Load Dataset
# ========================
# Read the dataset from the current working directory.
df = pd.read_csv("tips.csv")

# Show first and last rows
print("First 5 rows:\n", df.head())
print("\nLast 5 rows:\n", df.tail())

# Info & Summary
print("\nData Info:")
# DataFrame.info() writes its report directly to stdout and returns None, so
# it must not be wrapped in print() -- that would emit a stray "None" line
# after the report.
df.info()
print("\nSummary Statistics:")
# include="all" summarises the non-numeric columns as well as the numeric ones.
print(df.describe(include="all"))

# ========================
# 3. Data Cleaning
# ========================
# Check missing values (reported per column; rows are not removed here)
print("\nMissing Values:\n", df.isnull().sum())

# Check duplicates: report how many fully identical rows exist, then drop them
print("\nDuplicate Rows:", df.duplicated().sum())
df = df.drop_duplicates()

# Detect outliers using boxplot & Z-score
plt.figure(figsize=(8, 5))
sns.boxplot(data=df[['total_bill', 'tip', 'size']])
plt.title("Boxplots for Numerical Variables")
plt.show()

# Missing values are only reported above, never removed. With scipy's default
# nan_policy="propagate", a single NaN turns every z-score of its column into
# NaN, and since NaN > 3 is False that column would silently report zero
# outliers. nan_policy="omit" computes each column's mean/std from the
# non-missing values instead; results are unchanged when no NaN is present.
z_scores = np.abs(
    stats.zscore(df[['total_bill', 'tip', 'size']], nan_policy="omit")
)
# A row counts as an outlier when any of the three columns has |z| > 3.
outliers = (z_scores > 3).any(axis=1)
print("\nNumber of Outliers Detected:", outliers.sum())

# ========================
# 4. Univariate Analysis
# ========================
# Numerical variables: each (label, function) pair below is evaluated and
# printed in order for every numeric column, followed by two plots.
numeric_stats = (
    ("Mean:", lambda s: s.mean()),
    ("Median:", lambda s: s.median()),
    ("Trimmed Mean:", lambda s: stats.trim_mean(s, 0.1)),  # 10% trimmed mean
    ("Range:", lambda s: s.max() - s.min()),
    ("Variance:", lambda s: s.var()),
    ("Standard Deviation:", lambda s: s.std()),
)

for col in ('total_bill', 'tip', 'size'):
    values = df[col]
    print(f"\n--- {col.upper()} ---")
    for label, compute in numeric_stats:
        print(label, compute(values))

    # Histogram with a KDE curve overlaid, 20 bins
    plt.figure(figsize=(6, 4))
    hist_ax = sns.histplot(values, kde=True, bins=20)
    hist_ax.set_title(f"Distribution of {col}")
    plt.show()

    # Horizontal boxplot of the same column
    plt.figure(figsize=(6, 2))
    box_ax = sns.boxplot(x=values)
    box_ax.set_title(f"Boxplot of {col}")
    plt.show()

# Categorical variables: frequency table, first modal value, and a count plot.
for col in ('sex', 'smoker', 'day', 'time'):
    levels = df[col]
    print(f"\n--- {col.upper()} ---")
    print(levels.value_counts())
    # mode() may return several values on ties; the first one is reported.
    print("Mode:", levels.mode()[0])

    plt.figure(figsize=(5, 4))
    count_ax = sns.countplot(data=df, x=col)
    count_ax.set_title(f"Frequency of {col}")
    plt.show()

# ========================
# 5. Bivariate / Multivariate Analysis
# ========================

# Scatter plot of tip against total_bill, points coloured by sex
plt.figure(figsize=(6, 5))
scatter_ax = sns.scatterplot(data=df, x="total_bill", y="tip", hue="sex")
scatter_ax.set_title("Total Bill vs Tip")
plt.show()

# Pairwise correlation of the three numeric columns
numeric_view = df[['total_bill', 'tip', 'size']]
corr = numeric_view.corr()
print("\nCorrelation Matrix:\n", corr)

# Same matrix as an annotated heatmap (two decimals per cell)
plt.figure(figsize=(6, 4))
heat_ax = sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
heat_ax.set_title("Correlation Heatmap")
plt.show()

# Tip boxplots grouped by sex, split by smoker within each group
plt.figure(figsize=(6, 4))
grouped_ax = sns.boxplot(data=df, x="sex", y="tip", hue="smoker")
grouped_ax.set_title("Tip Distribution by Sex & Smoker")
plt.show()

# Bar chart of the mean tip per day
mean_tip_by_day = df.groupby("day")["tip"].mean()
avg_tips = mean_tip_by_day.reset_index()  # back to columns "day" and "tip"
plt.figure(figsize=(6, 4))
bar_ax = sns.barplot(data=avg_tips, x="day", y="tip")
bar_ax.set_title("Average Tip by Day")
plt.show()

# ========================
# 6. Conclusion
# ========================
# Fixed summary text: these lines are printed verbatim and are not recomputed
# from df at run time.
key_insights = (
    "1. Tips are positively correlated with total bill (higher bill → higher tip).",
    "2. Majority of customers are non-smokers and males.",
    "3. Sunday has the highest average tips.",
    "4. Outliers exist in total_bill and tip (very high spending customers).",
    "5. Tip percentage is generally between 10% and 20% of the bill.",
)

print("\nKey Insights:")
for insight in key_insights:
    print(insight)
