In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [5]:
# Load the raw streaming events export into a DataFrame.
streaming = pd.read_csv(filepath_or_buffer='streaming.csv')

In [13]:
# Normalise the column headers: lowercase, with spaces replaced by underscores.
# regex=False makes the replacement a literal string substitution.
streaming.columns = streaming.columns.str.lower().str.replace(' ', '_', regex=False)

# Coerce the value/identifier columns to numeric; errors='coerce' turns any
# entry that cannot be parsed as a number into NaN instead of raising.
# NOTE(review): the recorded output of this cell shows event_value with
# 0 non-null entries after coercion, so the column is either empty or holds
# non-numeric text in the source file -- confirm before relying on it.
numeric_cols = ['event_value', 'campaign_id', 'adset_id', 'ad_id']

for col in numeric_cols:
    streaming[col] = pd.to_numeric(streaming[col], errors='coerce')

# Store the descriptive string columns as pandas categoricals.
categorical_cols = ['event_name', 'media_source', 'channel', 'campaign', 'adset', 'ad', 
                    'country_code', 'city', 'device_type', 'device_category', 'platform']

for col in categorical_cols:
    streaming[col] = streaming[col].astype('category')

# Report dtypes, non-null counts and memory usage after the conversions.
# DataFrame.info() prints the report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
streaming.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137691 entries, 0 to 137690
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   attributed_touch_type  133410 non-null  object        
 1   attributed_touch_time  126653 non-null  datetime64[ns]
 2   install_time           137691 non-null  datetime64[ns]
 3   event_time             137691 non-null  datetime64[ns]
 4   event_name             137691 non-null  category      
 5   event_value            0 non-null       float64       
 6   media_source           137556 non-null  category      
 7   channel                71655 non-null   category      
 8   campaign               130867 non-null  category      
 9   campaign_id            83941 non-null   float64       
 10  adset                  121321 non-null  category      
 11  adset_id               82879 non-null   float64       
 12  ad                     55326 non-null   cate

In [16]:
# Preview the first five rows of the frame.
streaming.head(n=5)

Unnamed: 0,attributed_touch_type,attributed_touch_time,install_time,event_time,event_name,media_source,campaign,adset,ad,country_code,city,device_type,device_category,platform
0,click,2021-03-05 17:21:00,2021-03-05 17:22:00,2021-03-11 23:59:00,af_media_play,snapchat_int,SNAP_SA_IOS_ACQ_TELCOM_20200804,ALL_AR_TELCOM,AR_LAILAWANOOR_30S_VIDEO,SA,Jalajil,iPhone 11 Pro Max,phone,ios
1,click,2021-03-11 18:58:00,2021-03-11 18:58:00,2021-03-11 23:59:00,af_media_minutes,Apple Search Ads,APPLE_AE_BRAND_EN_ACQ,APPLE_AE_BRAND_EN_ACQ_IPHONE_EX,,AE,Al Qawz,"iPhone13,3",phone,ios
2,click,2020-10-25 02:08:00,2020-10-25 02:08:00,2021-03-11 23:59:00,af_media_play,Apple Search Ads,APPLE_SA_BRAND_AR_ACQ,APPLE_SA_BRAND_AR_ACQ_IPHONE_EX,,SA,Riyadh,iPhone 11 Pro,phone,ios
3,click,2021-01-07 23:37:00,2021-01-07 23:38:00,2021-03-11 23:59:00,af_content_view,Omantel,SMS,,,OM,Al Hamra,"iPhone12,8",phone,ios
4,click,2020-07-04 16:11:00,2020-07-04 16:12:00,2021-03-11 23:59:00,af_media_play,Apple Search Ads,APPLE_QA_Discovery,APPLE_QA_Discovery_ALL_ALL,,QA,Al Maamoura,"iPhone13,4",phone,ios


In [15]:
# Remove the columns that are not needed for the rest of the analysis.
streaming = streaming.drop(
    columns=['event_value', 'channel', 'campaign_id', 'ad_id', 'adset_id']
)

In [18]:
# Show the distinct event names first, then how many rows each one has.
print(
    streaming['event_name'].unique(),
    streaming['event_name'].value_counts(),
    sep='\n',
)


['af_media_play', 'af_media_minutes', 'af_content_view', 'AFEventDownload', 'push_screen_allow', ..., 'af_login_success', 'af_add_to_wishlist', 'af_renewal', 'af_subscribe', 'af_paying_conversion']
Length: 19
Categories (19, object): ['AFEventDownload', 'AFEventVIDEOPLAY', 'af_add_to_wishlist', 'af_content_view', ..., 'af_subscribe', 'push_screen', 'push_screen_allow', 'push_screen_maybe_later']
event_name
af_media_play              107069
af_media_minutes             8798
af_content_view              8421
push_screen                  3185
push_screen_maybe_later      2214
af_initiated_checkout        2177
AFEventVIDEOPLAY             1730
af_signup_success             930
af_media_downloaded           900
push_screen_allow             845
AFEventDownload               425
af_my_list                    328
af_purchase                   167
af_start_trial                163
af_login_success              163
af_renewal                     89
af_subscribe                   43
af_paying_co

In [27]:
# Count 'af_media_downloaded' events per media source, largest count first.
# `observed` is passed explicitly because the grouping column is categorical:
# observed=False keeps sources with zero matching events in the table (as in
# the recorded output) and stops the cell depending on pandas' implicit
# default for `observed`, which is deprecated and scheduled to change.
pivot = (
    streaming[streaming['event_name'] == 'af_media_downloaded']
    .pivot_table(
        index='media_source',
        values='event_name',
        aggfunc='count',
        observed=False,
    )
    .sort_values(by='event_name', ascending=False)
)

print(pivot)

                     event_name
media_source                   
Apple Search Ads            416
snapchat_int                159
restricted                  115
googleadwords_int            55
SMS                          36
Facebook Ads                 28
mobupps_int                  14
Social BIO                   14
wondermars_int               11
rtbhouse_int                 11
appfloodaff_int              10
Mobily KSA                    8
adtiming_int                  7
bytedanceglobal_int           4
taptica_int                   4
Etisalat                      3
appicmedia_int                2
email                         1
playdigo_int                  0
performante_int               0
xapads_int                    0
techido_int                   0
tyroo_int                     0
mobvista_int                  0
urbanconnection_int           0
mobrain_int                   0
mobligent_int                 0
mfaas_int                     0
volomedia_int                 0
inmobi_i

  .pivot_table(index='media_source', values='event_name', aggfunc='count')
