In [22]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

# Loading the Data

In [23]:
# Read the ifood_df.csv file from the Kaggle input directory into a DataFrame
DATA_PATH = "/kaggle/input/marketing-data/ifood_df.csv"
df = pd.read_csv(DATA_PATH)

# Data Overview

In [24]:
# Peek at the top of the frame (first five records)
df.head(n=5)

Unnamed: 0,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,...,marital_Together,marital_Widow,education_2n Cycle,education_Basic,education_Graduation,education_Master,education_PhD,MntTotal,MntRegularProds,AcceptedCmpOverall
0,58138.0,0,0,58,635,88,546,172,88,88,...,0,0,0,0,1,0,0,1529,1441,0
1,46344.0,1,1,38,11,1,6,2,1,6,...,0,0,0,0,1,0,0,21,15,0
2,71613.0,0,0,26,426,49,127,111,21,42,...,1,0,0,0,1,0,0,734,692,0
3,26646.0,1,0,26,11,4,20,10,3,5,...,1,0,0,0,1,0,0,48,43,0
4,58293.0,1,0,94,173,43,118,46,27,15,...,0,0,0,0,0,0,1,407,392,0


In [25]:
# Print a concise summary of the frame: index range, column count,
# and the non-null count and dtype of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2205 entries, 0 to 2204
Data columns (total 39 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Income                2205 non-null   float64
 1   Kidhome               2205 non-null   int64  
 2   Teenhome              2205 non-null   int64  
 3   Recency               2205 non-null   int64  
 4   MntWines              2205 non-null   int64  
 5   MntFruits             2205 non-null   int64  
 6   MntMeatProducts       2205 non-null   int64  
 7   MntFishProducts       2205 non-null   int64  
 8   MntSweetProducts      2205 non-null   int64  
 9   MntGoldProds          2205 non-null   int64  
 10  NumDealsPurchases     2205 non-null   int64  
 11  NumWebPurchases       2205 non-null   int64  
 12  NumCatalogPurchases   2205 non-null   int64  
 13  NumStorePurchases     2205 non-null   int64  
 14  NumWebVisitsMonth     2205 non-null   int64  
 15  AcceptedCmp3         

In [None]:
# Summary statistics, transposed so that each described column becomes one row
df.describe().transpose()

In [None]:
# Tally the null entries column by column and print the resulting Series
missing_values = df.isna().sum()
print(missing_values)

# Data Preprocessing

In [None]:
# Remove the two Z_* columns from the working frame.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=['Z_CostContact', 'Z_Revenue'], errors='ignore')

In [None]:
# Quick line plots of a few columns against the frame's index, one figure per
# column. A loop replaces the three copy-pasted calls and gives every figure a
# title and axis labels so it can be read on its own.
for quick_col in ["Complain", "NumStorePurchases", "Customer_Days"]:
    quick_ax = df[[quick_col]].plot(title=f"{quick_col} by row index")
    quick_ax.set_xlabel("Row index")
    quick_ax.set_ylabel(quick_col)
plt.show()

In [None]:
# Columns to be treated as categorical rather than numeric. Casting them to
# 'category' keeps them out of the int64/float64 selection used later and puts
# them into the 'category' selection instead.
CATEGORY_COLUMNS = [
    'Kidhome',
    'Teenhome',
    'AcceptedCmp1',
    'AcceptedCmp2',
    'AcceptedCmp3',
    'AcceptedCmp4',
    'AcceptedCmp5',
    'Complain',
    'marital_Divorced',
    'marital_Married',
    'marital_Single',
    'marital_Together',
    'marital_Widow',
    'education_2n Cycle',
    'education_Basic',
    'education_Graduation',
    'education_Master',
    'education_PhD',
]

# One astype call with a column -> dtype mapping replaces eighteen
# near-identical assignment lines; columns not listed keep their dtype.
df = df.astype({column: 'category' for column in CATEGORY_COLUMNS})

In [None]:
# Summarise the frame again so the dtypes can be checked after the
# 'category' conversion in the previous cell
df.info()

# Data Visualization

In [None]:
# Histogram of the 'Income' column using 50 bins.
# matplotlib.pyplot is already imported as `plt` in the notebook's first cell,
# so the redundant mid-notebook import has been removed.
df['Income'].hist(bins=50)
plt.title('Income Distribution')
plt.xlabel('Income')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Box-and-whisker view of 'Income', labelled through the Axes that boxplot returns
income_ax = df.boxplot(column=['Income'])
income_ax.set_title('Income Box Plot')
income_ax.set_ylabel('Income')
plt.show()


In [None]:
# Frequency of each value found in the 'education_Master' column
education_master_counts = df['education_Master'].value_counts()

# Draw the frequencies as bars and label the resulting Axes
counts_ax = education_master_counts.plot.bar()
counts_ax.set_title('Counts of Education Master')
counts_ax.set_xlabel('Category')
counts_ax.set_ylabel('Counts')
plt.show()

In [None]:
# Plotting a box plot for the 'Age' column
df.boxplot(column=['Age'])
plt.title('Age Box Plot')
plt.ylabel('Age')
plt.show()

# Correlation Analysis

In [None]:
# Pairwise correlation of the frame's columns using DataFrame.corr() defaults.
# NOTE(review): several columns were cast to 'category' in an earlier cell;
# whether corr() includes, skips or rejects them depends on the installed
# pandas version's numeric_only default — confirm the matrix covers the
# columns you expect.
correlation_matrix = df.corr()

# Display the correlation matrix
print(correlation_matrix)

In [None]:
# Plotting correlation_matrix as a heatmap with each coefficient annotated.
# seaborn is already imported as `sns` in the notebook's first cell, so the
# redundant mid-notebook import has been removed.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.1)
plt.title('Correlation Matrix Heatmap')
plt.show()

# Descriptive Statistics

In [None]:
# Partition the column labels by dtype: category vs. int64/float64
categorical_vars = df.select_dtypes(include=['category']).columns
numerical_vars = df.select_dtypes(include=['int64', 'float64']).columns

# describe() summaries for each group of columns
numerical_stats = df.loc[:, numerical_vars].describe()
categorical_stats = df.loc[:, categorical_vars].describe()

# Only the numeric summary is printed in this cell
print(numerical_stats)

In [None]:
# Print the describe() summary of the category-typed columns computed in the previous cell
print(categorical_stats)

In [None]:
# Pairwise correlations restricted to the int64/float64 columns
corr_matrix_numerical = df[numerical_vars].corr()

# Render the matrix as an annotated heatmap on an explicit 12x8 Axes
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr_matrix_numerical,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix Heatmap for Numerical Variables')
plt.show()

# Pairplot and Distribution Plots

In [None]:
# Pair plot over the int64/float64 columns only
numeric_frame = df[numerical_vars]
sns.pairplot(numeric_frame)
plt.show()

In [None]:
# Histogram (no KDE overlay) for each numeric column, one 8x4 figure per column
for num_col in numerical_vars:
    hist_fig, hist_ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[num_col], kde=False, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {num_col}')
    plt.show()

# Count plot for each category column, with the x tick labels rotated 45 degrees
for cat_col in categorical_vars:
    count_fig, count_ax = plt.subplots(figsize=(8, 4))
    sns.countplot(x=df[cat_col], ax=count_ax)
    count_ax.set_title(f'Distribution of {cat_col}')
    plt.xticks(rotation=45)
    plt.show()

# Conclusion
This EDA provides insights into the distribution, relationships, and characteristics of the marketing dataset. Further analysis and modeling can be performed based on these observations.