In [2]:
import pandas as pd

# Read the raw user-behaviour CSV into a DataFrame.
csv_path = 'user_behavior_dataset.csv'
df = pd.read_csv(csv_path)

# Preview the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,User ID,Device Model,Operating System,App Usage Time (min/day),Screen On Time (hours/day),Battery Drain (mAh/day),Number of Apps Installed,Data Usage (MB/day),Age,Gender,User Behavior Class
0,1,Google Pixel 5,Android,393,6.4,1872,67,1122,40,Male,4
1,2,OnePlus 9,Android,268,4.7,1331,42,944,47,Female,3
2,3,Xiaomi Mi 11,Android,154,4.0,761,32,322,42,Male,2
3,4,Google Pixel 5,Android,239,4.8,1676,56,871,20,Male,3
4,5,iPhone 12,iOS,187,4.3,1367,58,988,31,Female,3


In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['User ID', 'Device Model', 'Operating System',
       'App Usage Time (min/day)', 'Screen On Time (hours/day)',
       'Battery Drain (mAh/day)', 'Number of Apps Installed',
       'Data Usage (MB/day)', 'Age', 'Gender', 'User Behavior Class'],
      dtype='object')

In [4]:
# df.shape is (rows, columns); unpack once and report each dimension.
n_rows, n_cols = df.shape
print("Number of rows in Dataset: ", n_rows)
print("Number of Columns in Dataset: ", n_cols)

Number of rows in Dataset:  700
Number of Columns in Dataset:  11


In [5]:
# Print per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User ID                     700 non-null    int64  
 1   Device Model                700 non-null    object 
 2   Operating System            700 non-null    object 
 3   App Usage Time (min/day)    700 non-null    int64  
 4   Screen On Time (hours/day)  700 non-null    float64
 5   Battery Drain (mAh/day)     700 non-null    int64  
 6   Number of Apps Installed    700 non-null    int64  
 7   Data Usage (MB/day)         700 non-null    int64  
 8   Age                         700 non-null    int64  
 9   Gender                      700 non-null    object 
 10  User Behavior Class         700 non-null    int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 60.3+ KB


- The dataset contains 3 dtypes: int64, float64 and object (the object columns hold categorical text).
- Every column has 700 non-null values, so there are no missing values in the dataset.

In [6]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [7]:
# Count the missing values per column once and reuse the result for both axes.
null_counts = df.isnull().sum()

fig = px.bar(
    x=null_counts.index,
    y=null_counts.values,
    labels={'x': 'Name of Columns', 'y': 'Count of Null Values'},
    title='Null Value Count by Column',
)

# Centre the title above the plot.
centered = dict(x=0.5, xanchor='center')
fig.update_layout(title=centered)

fig.show()

- The bar chart confirms visually that there are no null or missing values in any column.

#### Different Usage Types

In [8]:
# Human-readable label for each numeric user-behaviour class code.
Behav_class = {
    1: 'Light Use',
    2: 'Moderate Use',
    3: 'High Use',
    4: 'Very High Use',
    5: 'Extreme Use',
}

In [9]:
# Swap the numeric class codes for their descriptive labels.
behavior_labels = df['User Behavior Class'].replace(Behav_class)
df['User Behavior Class'] = behavior_labels

In [10]:
# Number of users per behaviour class, largest class first.
behavior_counts = df['User Behavior Class'].value_counts()
behavior_counts.sort_values(ascending=False)

User Behavior Class
Moderate Use     146
High Use         143
Very High Use    139
Extreme Use      136
Light Use        136
Name: count, dtype: int64

In [11]:
# Users per behaviour class, drawn as a labelled bar chart.
BehaveClass_Count = df['User Behavior Class'].value_counts()

fig1 = px.bar(
    x=BehaveClass_Count.index,
    y=BehaveClass_Count.values,
    text=BehaveClass_Count.values,  # print the count on each bar
)

# Centred title plus readable axis titles.
fig1_title = dict(text='User Count of Different Behaviors', x=0.5, xanchor='center')
fig1.update_layout(
    title=fig1_title,
    xaxis_title='Usage Behavior Type',
    yaxis_title='Number of Users',
)

fig1.show()

In [12]:
# Youngest age present in the data.
df['Age'].min()

np.int64(18)

In [13]:
# Bucket ages into labelled groups.
# Both outer edges (and their labels) are taken from the data so that the
# lowest and highest ages always land in a bin; a hard-coded lower edge would
# silently turn any younger age into NaN.
age_min = int(df['Age'].min())
age_max = int(df['Age'].max())

bins = [age_min, 25, 35, 45, 54, age_max]
labels = [f'{age_min}-25', '26-35', '36-45', '46-54', f'55-{age_max}']

# right=True makes each bin (low, high]; include_lowest=True additionally
# closes the first bin on the left so that `age_min` itself is included.
df['Age_Group'] = pd.cut(
    df['Age'],
    bins=bins,
    labels=labels,
    include_lowest=True,
    right=True,
)

# Guard against rows that fell outside every bin.
assert df['Age_Group'].notna().all(), 'Some ages were not assigned to an age group'

In [14]:
# df.drop(columns='Age_Group',axis=1,inplace=True)

In [15]:
# Number of users in each age group.
df['Age_Group'].value_counts()

Age_Group
26-35    176
36-45    165
46-54    146
18-25    131
55-59     82
Name: count, dtype: int64

In [16]:
# Count users for every (age group, behaviour class) pair.
# 'Age_Group' is categorical (created by pd.cut), so `observed` is passed
# explicitly: this avoids the pandas FutureWarning about the changing default
# and keeps the current result, where every category combination is listed
# even when its count is 0.
d = (
    df.groupby(['Age_Group', 'User Behavior Class'], observed=False)['User Behavior Class']
    .agg(Count_Value='count')
    .reset_index()
)





In [17]:
# One bar-chart facet per age group showing the user count of each behaviour class.
# Axis titles are supplied through `labels` so Plotly Express applies them to
# every facet; update_layout(xaxis_title=...) only renames the first facet's
# x axis and leaves the raw column name on the others.
fig_Age_Behav = px.bar(
    data_frame=d,
    x='User Behavior Class',
    y='Count_Value',
    facet_col='Age_Group',
    labels={
        'User Behavior Class': 'Usage Behavior Type',
        'Count_Value': 'Number of Users',
    },
)

# Centre the figure title.
fig_Age_Behav.update_layout(
    title=dict(text='Age Group Wise Usage Behavior', x=0.5, xanchor='center')
)

fig_Age_Behav.show()

In [18]:
# Line chart of user counts across age groups, one line per behaviour class.
# A centred title and readable axis/legend labels are added so the figure
# stands alone, matching the other charts in this notebook.
fig_Age_Behav_Line = px.line(
    data_frame=d,
    x='Age_Group',
    y='Count_Value',
    color='User Behavior Class',
    labels={
        'Age_Group': 'Age Group',
        'Count_Value': 'Number of Users',
        'User Behavior Class': 'Usage Behavior Type',
    },
)

fig_Age_Behav_Line.update_layout(
    title=dict(text='Usage Behavior Across Age Groups', x=0.5, xanchor='center')
)

fig_Age_Behav_Line.show()

- Age group 26-35 has the highest number of Extreme users; paradoxically, the highest number of Light users is also in this group.

In [19]:
# One donut chart per behaviour class showing how its users split across age groups.
# make_subplots documents `subplot_titles` as a list of str, so the array
# returned by unique() is converted to a plain Python list first.
behaviors = d['User Behavior Class'].unique().tolist()
n_behaviors = len(behaviors)

fig_Behav_Age_Pie = make_subplots(
    rows=1,
    cols=n_behaviors,
    # Pie traces need 'domain'-type subplot cells rather than x/y axes.
    specs=[[{'type': 'domain'} for _ in range(n_behaviors)]],
    subplot_titles=behaviors,
)

# Add one pie per behaviour class; subplot columns are 1-based.
for col, behavior in enumerate(behaviors, start=1):
    filtered_data = d[d['User Behavior Class'] == behavior]

    fig_Behav_Age_Pie.add_trace(
        go.Pie(
            labels=filtered_data['Age_Group'],
            values=filtered_data['Count_Value'],
            name=behavior,
            hole=0.4,  # donut style
        ),
        row=1,
        col=col,
    )

fig_Behav_Age_Pie.update_layout(
    title=dict(text="User Behavior by Age Group", x=0.5, xanchor='center'),
    showlegend=True,
    legend_title="Age Group",
)

fig_Behav_Age_Pie.show()


- Apart from High Use and Very High Use, the 26-35 age group dominates the other usage types.
- Around 23.8% of the users who use their mobile highly (High Use) are in the 36-45 age group, while the largest shares of Very High Use come from the 18-25 and 46-54 age groups.
- We find two seemingly contradictory results: both Extreme and Light use occur most often in the ***26-35*** age group.

Work with 26-35 age group

In [20]:
# Re-list the columns; the frame now also carries the derived 'Age_Group' column.
df.columns

Index(['User ID', 'Device Model', 'Operating System',
       'App Usage Time (min/day)', 'Screen On Time (hours/day)',
       'Battery Drain (mAh/day)', 'Number of Apps Installed',
       'Data Usage (MB/day)', 'Age', 'Gender', 'User Behavior Class',
       'Age_Group'],
      dtype='object')

In [21]:
# Keep only the users in the 26-35 age group; reset_index() renumbers the rows
# and moves the previous index into an 'index' column.
in_26_35 = df['Age_Group'] == '26-35'
Age26_35_df = df.loc[in_26_35].reset_index()

In [22]:
# Remove the leftover 'index' column created by reset_index().
# Re-assigning (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError because the
# column is already gone.
Age26_35_df = Age26_35_df.drop(columns='index', errors='ignore')

In [23]:
# (rows, columns) of the 26-35 subset.
Age26_35_df.shape

(176, 12)

In [24]:
# Report how many users of each gender are in the 26-35 age group.
# (A stray `value_counts().index[0]` expression used to sit here; its result
# was never displayed or stored, so it has been removed.)
gender_counts = Age26_35_df['Gender'].value_counts()

for gender, count in gender_counts.items():
    print(f'Number of {gender} are in the age group is: ', count)

Number of Female are in the age group is:  94
Number of Male are in the age group is:  82


In [25]:
# Number of 26-35 users per device model.
Device_Model_26_35 = Age26_35_df['Device Model'].value_counts()

In [26]:
# Bar chart of user counts per device model within the 26-35 age group.
fig_device_model_ag = px.bar(
    data_frame=Device_Model_26_35,
    x=Device_Model_26_35.index,
    y=Device_Model_26_35.values,
    text=Device_Model_26_35.values,  # print the count on each bar
)

# Centred title and a readable y-axis title.
device_title = dict(
    text='Device wise user counts in Age between 26 and 35',
    x=0.5,
    xanchor='center',
)
fig_device_model_ag.update_layout(title=device_title, yaxis_title='User Count')

# Custom hover text; <extra></extra> hides the secondary hover box.
device_hover = 'Device: %{x}<br>User Count: %{y}<extra></extra>'
fig_device_model_ag.update_traces(hovertemplate=device_hover)

fig_device_model_ag.show()

By a narrow margin, ***Xiaomi Mi 11*** is the most used device in this age group.

In [27]:
# Display the 26-35 subset for inspection.
Age26_35_df

Unnamed: 0,User ID,Device Model,Operating System,App Usage Time (min/day),Screen On Time (hours/day),Battery Drain (mAh/day),Number of Apps Installed,Data Usage (MB/day),Age,Gender,User Behavior Class,Age_Group
0,5,iPhone 12,iOS,187,4.3,1367,58,988,31,Female,High Use,26-35
1,6,Google Pixel 5,Android,99,2.0,940,35,564,31,Male,Moderate Use,26-35
2,8,OnePlus 9,Android,543,11.4,2956,82,1702,31,Male,Extreme Use,26-35
3,11,Google Pixel 5,Android,53,1.4,435,17,162,34,Female,Light Use,26-35
4,19,Google Pixel 5,Android,81,1.4,558,16,297,26,Female,Light Use,26-35
...,...,...,...,...,...,...,...,...,...,...,...,...
171,682,Xiaomi Mi 11,Android,380,7.6,2354,77,1191,30,Male,Very High Use,26-35
172,690,Samsung Galaxy S21,Android,541,9.5,2424,98,1550,32,Male,Extreme Use,26-35
173,691,Google Pixel 5,Android,195,5.7,1447,48,679,30,Male,High Use,26-35
174,694,Xiaomi Mi 11,Android,505,8.6,2792,82,1709,31,Male,Extreme Use,26-35


In [28]:
# Min / max / mean daily app usage time per device model (26-35 age group).
App_Use_time = (
    Age26_35_df
    .groupby('Device Model')['App Usage Time (min/day)']
    .agg(Minimum='min', Maximum='max', Average='mean')
)

In [29]:
# Convert screen-on time from hours/day to minutes/day so it shares units
# with the app usage time column.
MINUTES_PER_HOUR = 60
screen_on_hours = Age26_35_df['Screen On Time (hours/day)']
Age26_35_df['Screen On Time (min/day)'] = screen_on_hours * MINUTES_PER_HOUR

In [30]:
# Min / max / mean daily screen-on time (in minutes) per device model.
Screen_on_time = (
    Age26_35_df
    .groupby('Device Model')['Screen On Time (min/day)']
    .agg(Minimum='min', Maximum='max', Average='mean')
)


In [31]:
# Min / max / mean number of installed apps per device model.
App_installed = (
    Age26_35_df
    .groupby('Device Model')['Number of Apps Installed']
    .agg(Minimum='min', Maximum='max', Average='mean')
)

In [32]:
# Join the app-usage and screen-on summaries on their shared 'Device Model'
# index; the suffixes disambiguate the identically named statistic columns.
merged_series_ = App_Use_time.merge(
    Screen_on_time,
    left_index=True,
    right_index=True,
    suffixes=['_App_Use_Time', '_Screen_On_Time'],
)

In [33]:
# Append the installed-apps summary, again matching rows on the device-model index.
final = merged_series_.merge(
    App_installed,
    left_index=True,
    right_index=True,
)

In [34]:
# The installed-apps columns kept their bare names after the second merge;
# give them the same suffix scheme as the other statistics.
app_installed_names = {
    'Minimum': 'Minimum_App_Installed',
    'Maximum': 'Maximum_App_Installed',
    'Average': 'Average_App_Installed',
}
final = final.rename(columns=app_installed_names)

In [35]:
# Display the combined per-device summary table.
final

Unnamed: 0_level_0,Minimum_App_Use_Time,Maximum_App_Use_Time,Average_App_Use_Time,Minimum_Screen_On_Time,Maximum_Screen_On_Time,Average_Screen_On_Time,Minimum_App_Installed,Maximum_App_Installed,Average_App_Installed
Device Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Google Pixel 5,53,595,240.4,66.0,678.0,267.6,14,97,44.628571
OnePlus 9,30,592,250.0625,66.0,720.0,282.375,10,95,45.75
Samsung Galaxy S21,30,593,237.84375,66.0,708.0,298.6875,11,98,45.21875
Xiaomi Mi 11,39,597,270.45,66.0,714.0,331.5,10,93,49.925
iPhone 12,41,597,296.459459,72.0,684.0,342.972973,10,98,55.216216


In [36]:
# Grouped bars comparing, per device model, the three per-device averages.
average_columns = ['Average_App_Installed', 'Average_App_Use_Time', 'Average_Screen_On_Time']

fig_Device_Average = px.bar(
    data_frame=final,
    x=final.index,
    y=average_columns,
    barmode='group',
    text_auto=True,  # print each bar's value on the bar
)

fig_Device_Average.update_layout(
    title=dict(text='Device wise Average Application, Usage Time', x=0.5, xanchor='center'),
    yaxis_title='Average Value of Different Factors',
)

# Custom hover text; <extra></extra> hides the secondary hover box.
fig_Device_Average.update_traces(
    hovertemplate='Device Model: %{x}<br>Average of Factor: %{y}<extra></extra>'
)

# Replace the raw column names shown in the legend with readable names.
legend_names = {
    'Average_App_Installed': 'Average no of App Installed',
    'Average_App_Use_Time': 'Average App Usage Time',
    'Average_Screen_On_Time': 'Average Screen On Time',
}


def _rename_trace(trace):
    """Give a trace its readable legend name."""
    return trace.update(name=legend_names[trace.name])


fig_Device_Average.for_each_trace(_rename_trace)

fig_Device_Average.show()

Hypothesis test: 
    - Females from the 26-35 age group have less screen time, on average, than males

H0: females' mean screen time >= males' mean screen time <br>
Ha: females' mean screen time < males' mean screen time

In [79]:
from scipy import stats

In [71]:
# Sample size, mean and standard deviation of daily screen-on minutes for each
# gender within the 26-35 age group. Each gender is filtered once and reused.
screen_col = 'Screen On Time (min/day)'

female_rows = Age26_35_df[Age26_35_df['Gender'] == 'Female']
Female_count = female_rows['Gender'].count()
Female_mean = female_rows[screen_col].mean()
Female_std = female_rows[screen_col].std()

male_rows = Age26_35_df[Age26_35_df['Gender'] == 'Male']
Male_count = male_rows['Gender'].count()
Male_mean = male_rows[screen_col].mean()
Male_std = male_rows[screen_col].std()

In [38]:
# Significance level that the p-value is compared against.
alpha = 0.05

In [72]:
# Two-sample t statistic with unpooled variances: difference of the means
# divided by the standard error of that difference.
standard_error = ((Female_std**2 / Female_count) + (Male_std**2 / Male_count))**0.5
t_stat = (Female_mean - Male_mean) / standard_error

In [77]:
# Degrees of freedom for the unequal-variance t statistic
# (Welch-Satterthwaite approximation).
female_var_term = Female_std**2 / Female_count
male_var_term = Male_std**2 / Male_count

dof = (female_var_term + male_var_term)**2 / (
    female_var_term**2 / (Female_count - 1) + male_var_term**2 / (Male_count - 1)
)

In [81]:
# Lower-tail probability of the t statistic: the alternative hypothesis is
# "females < males", so small (negative) t values count as evidence against H0.
p_value = stats.t.cdf(t_stat, dof)

In [83]:
# Compare the p-value with the significance level and report the decision.
decision = 'Reject the null hypothesis' if p_value < alpha else 'Fail to reject the null hypothesis'
print(decision)

Fail to reject the null hypothesis


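So, at the 5% significance level (95% confidence), there is not enough evidence to conclude that females in the 26-35 age group have less screen time than their male counterparts. Failing to reject H0 does not prove that females' screen time is greater than or equal to males'.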