In [5]:
import pandas as pd

# Load the dataset
df = pd.read_csv('wppool_growth_data_sample_20k.csv')
df.head(10)

Unnamed: 0,user_id,install_date,last_active_date,subscription_type,country,total_sessions,page_views,download_clicks,activation_status,days_active,pro_upgrade_date,plan_type,monthly_revenue,churned
0,1,6/29/2023,7/12/2023,Free,UK,3,15,1,1,13,,,0,1
1,2,4/10/2023,7/25/2023,Free,India,133,665,0,1,106,,,0,0
2,3,10/25/2023,12/7/2023,Free,USA,53,106,0,1,43,,,0,0
3,4,8/26/2023,11/9/2023,Pro,Canada,242,242,0,1,75,11/9/2023,Basic,49,0
4,5,5/14/2023,11/22/2023,Free,UK,12,48,0,1,192,,,0,0
5,6,12/17/2023,12/28/2023,Free,Canada,92,460,0,1,11,,,0,1
6,7,7/29/2023,11/26/2023,Free,UK,12,12,0,1,120,,,0,0
7,8,1/12/2023,1/24/2023,Free,USA,115,230,0,1,12,,,0,1
8,9,9/19/2023,12/2/2023,Free,USA,67,201,0,1,74,,,0,0
9,10,4/19/2023,8/21/2023,Free,Germany,46,138,0,1,124,,,0,0


In [6]:
# Step 1: Check for missing values
print("Missing values in each column:")
print(df.isnull().sum())


Missing values in each column:
user_id                  0
install_date             0
last_active_date         0
subscription_type        0
country                  0
total_sessions           0
page_views               0
download_clicks          0
activation_status        0
days_active              0
pro_upgrade_date     15971
plan_type            15971
monthly_revenue          0
churned                  0
dtype: int64


In [2]:
print(df.isnull())

       user_id  install_date  last_active_date  subscription_type  country  \
0        False         False             False              False    False   
1        False         False             False              False    False   
2        False         False             False              False    False   
3        False         False             False              False    False   
4        False         False             False              False    False   
...        ...           ...               ...                ...      ...   
19995    False         False             False              False    False   
19996    False         False             False              False    False   
19997    False         False             False              False    False   
19998    False         False             False              False    False   
19999    False         False             False              False    False   

       total_sessions  page_views  download_clicks  activation_

In [None]:
for column in df.columns:
    if df[column].dtype in ['int64', 'float64']:  # Numerical columns
        df[column].fillna(df[column].median(), inplace=True)
    else:  # Categorical columns
        df[column].fillna(df[column].mode()[0], inplace=True)

# Step 2: Check for duplicates
print("\nNumber of duplicate rows:", df.duplicated().sum())

In [4]:
# Remove duplicates
df.drop_duplicates(inplace=True)

In [7]:
# Step 3: Check for inconsistencies
# Example: Ensure 'subscription_type' only contains 'Free' or 'Pro'
print("\nUnique values in 'subscription_type':", df['subscription_type'].unique())


Unique values in 'subscription_type': ['Free' 'Pro']


In [15]:
print("\nDataset Summary:")
print(df.info())



Dataset Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_id            20000 non-null  int64 
 1   install_date       20000 non-null  object
 2   last_active_date   20000 non-null  object
 3   subscription_type  20000 non-null  object
 4   country            20000 non-null  object
 5   total_sessions     20000 non-null  int64 
 6   page_views         20000 non-null  int64 
 7   download_clicks    20000 non-null  int64 
 8   activation_status  20000 non-null  int64 
 9   days_active        20000 non-null  int64 
 10  pro_upgrade_date   20000 non-null  object
 11  plan_type          20000 non-null  object
 12  monthly_revenue    20000 non-null  int64 
 13  churned            20000 non-null  int64 
dtypes: int64(8), object(6)
memory usage: 2.1+ MB
None


In [13]:
print(df.describe(include='all'))

             user_id install_date last_active_date subscription_type country  \
count   20000.000000        20000            20000             20000   20000   
unique           NaN          366              357                 2       7   
top              NaN    8/13/2023         1/1/2024              Free   India   
freq             NaN           83              328             15971    2914   
mean    10000.500000          NaN              NaN               NaN     NaN   
std      5773.647028          NaN              NaN               NaN     NaN   
min         1.000000          NaN              NaN               NaN     NaN   
25%      5000.750000          NaN              NaN               NaN     NaN   
50%     10000.500000          NaN              NaN               NaN     NaN   
75%     15000.250000          NaN              NaN               NaN     NaN   
max     20000.000000          NaN              NaN               NaN     NaN   

        total_sessions    page_views  d

In [16]:
# Step 5: Distribution of Free vs. Pro users
subscription_distribution = df['subscription_type'].value_counts(normalize=True) * 100
print("\nDistribution of Free vs. Pro users:")
print(subscription_distribution)


Distribution of Free vs. Pro users:
subscription_type
Free    79.855
Pro     20.145
Name: proportion, dtype: float64


In [17]:
# Additional summary: Churn rate by subscription type
churn_rate = df.groupby('subscription_type')['churned'].mean() * 100
print("\nChurn rate by subscription type:")
print(churn_rate)


Churn rate by subscription type:
subscription_type
Free    28.595579
Pro     28.245222
Name: churned, dtype: float64


In [18]:
# Save the cleaned dataset (optional)
df.to_csv('cleaned_dataset.csv', index=False)

In [23]:
df2 = pd.read_csv('cleaned_dataset.csv')
df2.head()

Unnamed: 0,user_id,install_date,last_active_date,subscription_type,country,total_sessions,page_views,download_clicks,activation_status,days_active,pro_upgrade_date,plan_type,monthly_revenue,churned
0,1,6/29/2023,7/12/2023,Free,UK,3,15,1,1,13,12/31/2023,Basic,0,1
1,2,4/10/2023,7/25/2023,Free,India,133,665,0,1,106,12/31/2023,Basic,0,0
2,3,10/25/2023,12/7/2023,Free,USA,53,106,0,1,43,12/31/2023,Basic,0,0
3,4,8/26/2023,11/9/2023,Pro,Canada,242,242,0,1,75,11/9/2023,Basic,49,0
4,5,5/14/2023,11/22/2023,Free,UK,12,48,0,1,192,12/31/2023,Basic,0,0
