In [2]:
import pandas as pd

# Mount Google Drive if your file is in Drive, otherwise upload directly.
# from google.colab import drive
# drive.mount('/content/drive')

# If the file is uploaded directly to Colab (e.g., drag and drop):
# Make sure to upload the 'tips (2) (1).csv' file to your Colab session.
df = pd.read_csv('tips (2) (1).csv')

# Display the first 5 rows of the DataFrame
print("First 5 rows of the dataset:")
print(df.head())

# Display the column names, non-null counts and data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print(): doing so appends a stray "None" line after
# the report.
print("\nColumn information:")
df.info()

First 5 rows of the dataset:
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4

Column information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB
None


In [3]:
# Numerical columns to summarise.
numerical_columns = ['total_bill', 'tip', 'size']

# Maps column name -> {statistic name -> value}.
stats_dict = {}

for col in numerical_columns:
    series = df[col]
    # Compute the mode once per column; the original evaluated .mode() twice
    # (once for the emptiness check and once for the value).
    modes = series.mode()
    stats_dict[col] = {
        'mean': series.mean(),
        'median': series.median(),
        # Several values can tie for the mode; report the first one returned,
        # or None when no mode is available.
        'mode': modes.iloc[0] if not modes.empty else None,
        'std': series.std(),
        'min': series.min(),
        'max': series.max(),
    }

# Build a DataFrame from the nested dict: one column per data column,
# one row per statistic.
stats_df = pd.DataFrame(stats_dict)

# Transpose so each data column becomes a row and each statistic a column.
stats_df_transposed = stats_df.T

# Render the table as left-aligned Markdown.
print("Descriptive Statistics for Key Numerical Columns:")
print(stats_df_transposed.to_markdown(numalign="left", stralign="left"))

Descriptive Statistics for Key Numerical Columns:
|            | mean    | median   | mode   | std     | min   | max   |
|:-----------|:--------|:---------|:-------|:--------|:------|:------|
| total_bill | 19.7859 | 17.795   | 13.42  | 8.90241 | 3.07  | 50.81 |
| tip        | 2.99828 | 2.9      | 2      | 1.38364 | 1     | 10    |
| size       | 2.56967 | 2        | 2      | 0.9511  | 1     | 6     |


In [4]:
# --- Missing-value audit ---
# Count null cells per column, print the per-column table, then print a
# one-line verdict based on the grand total.
print("--- Missing Values Check ---")
missing_values = df.isna().sum()
print("Number of missing values per column:")
print(missing_values)

has_missing = missing_values.sum() != 0
verdict = (
    "\nMissing values found in the dataset. Please review the table above."
    if has_missing
    else "\nNo missing values found in the dataset."
)
print(verdict)

--- Missing Values Check ---
Number of missing values per column:
total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

No missing values found in the dataset.


In [5]:
# --- Outlier scan based on the interquartile range ---
# For each numerical column, a row is flagged as an outlier when its value lies
# more than 1.5 * IQR below the first quartile or above the third quartile.
print("\n--- Outlier Check (using IQR method) ---")
numerical_columns = ['total_bill', 'tip', 'size']

for col in numerical_columns:
    values = df[col]

    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Boolean masks for the two tails, combined to pick the flagged rows.
    too_low = values < lower_bound
    too_high = values > upper_bound
    outliers = df.loc[too_low | too_high]

    # One report block per column, emitted as a single multi-line print.
    report = [
        f"\nColumn: '{col}'",
        f"  Q1 (25th percentile): {Q1:.2f}",
        f"  Q3 (75th percentile): {Q3:.2f}",
        f"  IQR: {IQR:.2f}",
        f"  Lower Bound for Outliers: {lower_bound:.2f}",
        f"  Upper Bound for Outliers: {upper_bound:.2f}",
        f"  Number of outliers detected: {len(outliers)}",
    ]
    print("\n".join(report))

    if outliers.empty:
        continue

    # Show at most the first five flagged rows, restricted to this column.
    print("  Outlier values (first 5 if more):")
    print(outliers[[col]].head(5))


--- Outlier Check (using IQR method) ---

Column: 'total_bill'
  Q1 (25th percentile): 13.35
  Q3 (75th percentile): 24.13
  IQR: 10.78
  Lower Bound for Outliers: -2.82
  Upper Bound for Outliers: 40.30
  Number of outliers detected: 9
  Outlier values (first 5 if more):
     total_bill
59        48.27
102       44.30
142       41.19
156       48.17
170       50.81

Column: 'tip'
  Q1 (25th percentile): 2.00
  Q3 (75th percentile): 3.56
  IQR: 1.56
  Lower Bound for Outliers: -0.34
  Upper Bound for Outliers: 5.91
  Number of outliers detected: 9
  Outlier values (first 5 if more):
       tip
23    7.58
47    6.00
59    6.73
141   6.70
170  10.00

Column: 'size'
  Q1 (25th percentile): 2.00
  Q3 (75th percentile): 3.00
  IQR: 1.00
  Lower Bound for Outliers: 0.50
  Upper Bound for Outliers: 4.50
  Number of outliers detected: 9
  Outlier values (first 5 if more):
     size
125     6
141     6
142     5
143     6
155     5


In [6]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assume 'df' DataFrame is already loaded from previous steps (e.g., df = pd.read_csv('tips (2) (1).csv'))

# Identify numerical and categorical columns.
# 'number' selects every numeric dtype, not only float64/int64, so columns
# stored as e.g. int32 or float32 are no longer skipped silently; 'category'
# is included alongside 'object' so categorical-dtype columns are covered too.
numerical_cols = df.select_dtypes(include='number').columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

print("--- Univariate Analysis and Visualization ---")

# --- Analysis for Numerical Columns ---
# For each numerical column: print describe() and save a three-panel figure
# (histogram with KDE, box plot, density plot) to '<col>_univariate_numerical.png'.
print("\nAnalyzing Numerical Columns:")
for col in numerical_cols:
    print(f"\nColumn: '{col}'")
    # Display descriptive statistics for the numerical column
    print(df[col].describe())

    # Explicit figure/axes handles instead of the implicit pyplot "current
    # axes", so each panel is addressed by name.
    fig, (ax_hist, ax_box, ax_kde) = plt.subplots(1, 3, figsize=(18, 5))

    # Panel 1: Histogram with KDE
    sns.histplot(df[col], kde=True, ax=ax_hist)
    ax_hist.set_title(f'Distribution of {col}', fontsize=14)
    ax_hist.set_xlabel(col, fontsize=12)
    ax_hist.set_ylabel('Frequency', fontsize=12)

    # Panel 2: Box Plot
    sns.boxplot(x=df[col], ax=ax_box)
    ax_box.set_title(f'Box Plot of {col}', fontsize=14)
    ax_box.set_xlabel(col, fontsize=12)

    # Panel 3: KDE Plot (Density Plot)
    sns.kdeplot(df[col], fill=True, ax=ax_kde)
    ax_kde.set_title(f'Density Plot of {col}', fontsize=14)
    ax_kde.set_xlabel(col, fontsize=12)
    ax_kde.set_ylabel('Density', fontsize=12)

    fig.tight_layout()
    fig.savefig(f'{col}_univariate_numerical.png')  # Save the plot
    plt.close(fig)  # Close the figure to free memory

# --- Analysis for Categorical Columns ---
# For each categorical column: print counts and proportions, and save a count
# plot (bars ordered by descending frequency) to '<col>_univariate_categorical.png'.
print("\nAnalyzing Categorical Columns:")
for col in categorical_cols:
    # Computed once and reused for both the printout and the bar order.
    counts = df[col].value_counts()

    print(f"\nColumn: '{col}'")
    print("Value Counts:")
    print(counts)
    print("\nProportions:")
    print(df[col].value_counts(normalize=True))

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=df, x=col, order=counts.index, ax=ax)
    ax.set_title(f'Count Plot of {col}', fontsize=14)
    ax.set_xlabel(col, fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    # Rotate labels for better readability if there are many categories
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    fig.savefig(f'{col}_univariate_categorical.png')  # Save the plot
    plt.close(fig)  # Close the figure to free memory

print("\nUnivariate analysis and visualizations completed. Check the generated PNG files for plots.")

--- Univariate Analysis and Visualization ---

Analyzing Numerical Columns:

Column: 'total_bill'
count    244.000000
mean      19.785943
std        8.902412
min        3.070000
25%       13.347500
50%       17.795000
75%       24.127500
max       50.810000
Name: total_bill, dtype: float64

Column: 'tip'
count    244.000000
mean       2.998279
std        1.383638
min        1.000000
25%        2.000000
50%        2.900000
75%        3.562500
max       10.000000
Name: tip, dtype: float64

Column: 'size'
count    244.000000
mean       2.569672
std        0.951100
min        1.000000
25%        2.000000
50%        2.000000
75%        3.000000
max        6.000000
Name: size, dtype: float64

Analyzing Categorical Columns:

Column: 'sex'
Value Counts:
sex
Male      157
Female     87
Name: count, dtype: int64

Proportions:
sex
Male      0.643443
Female    0.356557
Name: proportion, dtype: float64

Column: 'smoker'
Value Counts:
smoker
No     151
Yes     93
Name: count, dtype: int64

Proportio

In [7]:
def _finish_figure(fig, filename):
    """Apply a tight layout to ``fig``, save it to ``filename`` and close it."""
    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)  # release the figure's memory once it is on disk


def _label_axes(ax, title, xlabel, ylabel):
    """Set the title and axis labels with the font sizes used across this cell."""
    ax.set_title(title, fontsize=16)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)


def _save_scatter(data, x, y, title, xlabel, ylabel, filename, with_regression=False):
    """Save a scatter plot of ``y`` against ``x`` with a dashed grid.

    When ``with_regression`` is true, a red dashed regression line is drawn on
    top of the points.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(data=data, x=x, y=y, ax=ax)
    if with_regression:
        # scatter=False: draw only the fitted line, the points are already plotted.
        sns.regplot(data=data, x=x, y=y, scatter=False, color='red',
                    line_kws={"linestyle": "--"}, ax=ax)
    _label_axes(ax, title, xlabel, ylabel)
    ax.grid(True, linestyle='--', alpha=0.7)
    _finish_figure(fig, filename)


def _save_boxplot(data, x, y, title, xlabel, ylabel, filename, order=None, figsize=(8, 6)):
    """Save a box plot of numerical ``y`` grouped by categorical ``x``.

    ``order`` optionally fixes the left-to-right order of the categories.
    """
    fig, ax = plt.subplots(figsize=figsize)
    sns.boxplot(data=data, x=x, y=y, order=order, ax=ax)
    _label_axes(ax, title, xlabel, ylabel)
    _finish_figure(fig, filename)


print("--- Bivariate Analysis and Visualization ---")

# Define numerical and categorical columns
numerical_cols = ['total_bill', 'tip', 'size']
categorical_cols = ['sex', 'smoker', 'day', 'time']

# --- Numerical vs. Numerical Analysis ---

print("\n--- Numerical vs. Numerical Analysis ---")

# 1. Total Bill vs. Tip: Scatter Plot with Regression Line
_save_scatter(df, 'total_bill', 'tip',
              title='Total Bill vs. Tip',
              xlabel='Total Bill ($)', ylabel='Tip ($)',
              filename='total_bill_vs_tip_scatterplot.png',
              with_regression=True)

# Calculate and print correlation (Series.corr with its default method)
correlation = df['total_bill'].corr(df['tip'])
print(f"Correlation between Total Bill and Tip: {correlation:.2f}")

# 2. Party Size vs. Total Bill: Scatter Plot
_save_scatter(df, 'size', 'total_bill',
              title='Party Size vs. Total Bill',
              xlabel='Party Size', ylabel='Total Bill ($)',
              filename='size_vs_total_bill_scatterplot.png')

# 3. Party Size vs. Tip: Scatter Plot
_save_scatter(df, 'size', 'tip',
              title='Party Size vs. Tip',
              xlabel='Party Size', ylabel='Tip ($)',
              filename='size_vs_tip_scatterplot.png')


# --- Numerical vs. Categorical Analysis ---

print("\n--- Numerical vs. Categorical Analysis ---")

# 1. Total Bill by Sex: Box Plot
_save_boxplot(df, 'sex', 'total_bill',
              title='Total Bill by Sex',
              xlabel='Sex', ylabel='Total Bill ($)',
              filename='total_bill_by_sex_boxplot.png')

# 2. Tip by Smoker Status: Box Plot
_save_boxplot(df, 'smoker', 'tip',
              title='Tip by Smoker Status',
              xlabel='Smoker', ylabel='Tip ($)',
              filename='tip_by_smoker_boxplot.png')

# 3. Total Bill by Day of the Week: Box Plot (days in calendar order, wider figure)
_save_boxplot(df, 'day', 'total_bill',
              title='Total Bill by Day of the Week',
              xlabel='Day', ylabel='Total Bill ($)',
              filename='total_bill_by_day_boxplot.png',
              order=['Thur', 'Fri', 'Sat', 'Sun'],
              figsize=(10, 6))

# 4. Tip by Time of Day: Box Plot
_save_boxplot(df, 'time', 'tip',
              title='Tip by Time of Day',
              xlabel='Time', ylabel='Tip ($)',
              filename='tip_by_time_boxplot.png')


# --- Categorical vs. Categorical Analysis ---

print("\n--- Categorical vs. Categorical Analysis ---")

# 1. Sex vs. Smoker Status: Count Plot
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='sex', hue='smoker', palette='viridis', ax=ax)
_label_axes(ax, 'Count of Sex by Smoker Status', 'Sex', 'Count')
ax.legend(title='Smoker')
_finish_figure(fig, 'sex_vs_smoker_countplot.png')

# 2. Day vs. Time: Cross-tabulation and Heatmap
print("\nCross-tabulation of Day and Time:")
cross_tab_day_time = pd.crosstab(df['day'], df['time'])
print(cross_tab_day_time)

fig, ax = plt.subplots(figsize=(9, 7))
# fmt='d' annotates each cell with its integer count.
sns.heatmap(cross_tab_day_time, annot=True, fmt='d', cmap='YlGnBu', linewidths=.5, ax=ax)
_label_axes(ax, 'Heatmap of Meals by Day and Time', 'Time', 'Day')
_finish_figure(fig, 'day_vs_time_heatmap.png')

print("\nBivariate analysis and visualizations completed. Check the generated PNG files for plots.")

--- Bivariate Analysis and Visualization ---

--- Numerical vs. Numerical Analysis ---
Correlation between Total Bill and Tip: 0.68

--- Numerical vs. Categorical Analysis ---

--- Categorical vs. Categorical Analysis ---

Cross-tabulation of Day and Time:
time  Dinner  Lunch
day                
Fri       12      7
Sat       87      0
Sun       76      0
Thur       1     61

Bivariate analysis and visualizations completed. Check the generated PNG files for plots.
