In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Read the survey CSV into a DataFrame and preview the first rows
csv_path = '/kaggle/input/social-media-usage-trends-india/Social Media Usage India.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Display the column labels of the loaded frame
data.columns

In [None]:
# Replace the two verbose survey-question headers with short column names
short_column_names = {
    'How many followers do you have on Instagram? (In case of multiple accounts, please mention the one with the maximum)': 'Instagram Followers',
    'How many posts do you have on Instagram?': 'Instagram Posts',
}
data = data.rename(columns=short_column_names)
data.columns

In [None]:
# Primary analysis: print the frame summary (column names, non-null counts, dtypes)
data.info()

In [None]:
# (number of rows, number of columns) of the full frame
data.shape

In [None]:
# Sample row: show every column of the row whose index label is 0
data.loc[0]

In [None]:
# Number of rows for each distinct value of 'Current Status'
data['Current Status'].value_counts()

In [None]:
# Most of the numeric columns are stored as object (text), so this helper converts a single cell to a number
def convert_to_num(x):
    """Convert one cell value to a number.

    Strings have their thousands separators (commas) removed before parsing,
    e.g. '1,234' -> 1234. Values that are not strings (already-numeric cells,
    NaN for missing entries) have no commas to strip and are handed to
    ``pd.to_numeric`` unchanged instead of failing on ``.replace``.

    Parameters
    ----------
    x : str or scalar
        The raw cell value.

    Returns
    -------
    The numeric value produced by ``pd.to_numeric``.

    Raises
    ------
    ValueError
        If a string cannot be parsed as a number after removing commas.
    """
    if isinstance(x, str):
        x = x.replace(',', '')
    return pd.to_numeric(x)

<b>From our experience, we see that people tend to be online more often on the weekend than on the weekdays. The reason being obvious, that they are busy in schools or work.</b>
<br>
<i><u>My Hypothesis</u><br/>
    <b>Total Social Media usage on the weekend is more than that on weekdays</b></i>

In [None]:
# Numeric weekday / weekend usage for all respondents.
# The previous 'Working professional' filter was overwritten on the very next
# line, so it never took effect; it is dropped here to keep the results unchanged.
# Column-wise Series.map replaces the deprecated DataFrame.applymap.
data_c = data[['Total Week Usage', 'Total Weekend Usage']].apply(
    lambda column: column.map(convert_to_num)
)

Let's use a Histogram to visualize

In [None]:
import matplotlib.pyplot as plt

# Side-by-side histogram of weekday and weekend usage
usage_columns = ['Total Week Usage', 'Total Weekend Usage']
plt.hist([data_c[column] for column in usage_columns], label=usage_columns)
plt.legend(loc='upper right')
plt.show()

<p>The Histogram gives a good look at the Weekend Usage vs Weekly Usage and it seems our hypothesis is correct.</p>
<p>Let's use a Probability Mass Function to Visualize and be sure about our claim.</p>

In [None]:
# A Probability Mass Function would be more efficient

from collections import OrderedDict

def pmf(seq):
    """Return the probability mass function of ``seq``.

    Maps each distinct value in ``seq`` to its relative frequency
    (occurrences divided by ``len(seq)``). Keys appear in order of first
    occurrence. An empty sequence yields an empty dict.
    """
    counts = {}
    for value in seq:
        counts[value] = counts.get(value, 0) + 1
    total = len(seq)
    return {value: count / total for value, count in counts.items()}

# Relative frequency of every distinct usage value, for weekdays and weekends
pmf_week_usage = pmf(data_c['Total Week Usage'])
pmf_weekend_usage = pmf(data_c['Total Weekend Usage'])

# Sort by usage value so the step lines run left to right
pmf_week_usage_sorted = OrderedDict(sorted(pmf_week_usage.items()))
pmf_weekend_usage_sorted = OrderedDict(sorted(pmf_weekend_usage.items()))

plt.step(list(pmf_week_usage_sorted.keys()), list(pmf_week_usage_sorted.values()), label="Total Week Usage")
plt.step(list(pmf_weekend_usage_sorted.keys()), list(pmf_weekend_usage_sorted.values()), label="Total Weekend Usage")
plt.xlabel('Usage')
plt.ylabel('Probability')
# The labels passed to plt.step are only drawn once a legend is requested
plt.legend(loc='upper right')
plt.show()

<i><u>Conclusion</u></i><br/><b>It looks like our Hypothesis is True. From the PMF we can be sure that there are more of Weekend Usage than Weekly Usage</b>

<p><b>Let us now analyze the Instagram and Facebook Usage.</b></p>
<i><u>My Hypothesis</u><br/><b>My friends have been telling me that people are shifting to Instagram from Facebook as a new trend.Let's check that out</b></i>

In [None]:
# Get the appropriate data in a separate frame.
# Building the numeric frame in one expression avoids assigning into a slice
# of `data`, which is what raised SettingWithCopyWarning before.
usage_columns = ['Total Facebook Usage', 'Total Instagram Usage', 'Total Social Media Usage']
data_c = data[usage_columns].apply(lambda column: column.map(convert_to_num))

# One violin per column; violins sit at positions 1..n, so label them explicitly
plt.violinplot(data_c.to_numpy(), showmeans=True)
plt.xticks(range(1, len(usage_columns) + 1), usage_columns, rotation=20)
plt.ylabel('Usage')
plt.show()

<p>Looks like there are more users of Instagram than Facebook. To be more sure let us get rid of the Non Facebook and Instagram users, i.e. users who have 0 usage for Facebook or Instagram.</p>

In [None]:
# Row/column count before the zero-usage rows are filtered out
data_c.shape

In [None]:
# Keep only rows with non-zero usage on both Facebook and Instagram
uses_facebook = data_c['Total Facebook Usage'] > 0
uses_instagram = data_c['Total Instagram Usage'] > 0
data_c = data_c[uses_facebook & uses_instagram]
data_c.shape

In [None]:
# One box per column of data_c; boxes sit at positions 1..n, so name them on the axis
plt.boxplot(data_c.to_numpy(), showmeans=True)
plt.xticks(range(1, len(data_c.columns) + 1), data_c.columns, rotation=20)
plt.ylabel('Usage')
plt.show()

<i><u>Conclusion</u></i><br>
<b>Looking at both the Box Plot and Violin Plot, we can now be sure that people are indeed shifting to Instagram from facebook</b>

<b>Now let us have a analysis amongst Facebook Users, Instagram Users and if and how they relate to the number of Instagram Followers.</b>

In [None]:
# Let's take the complete data
profile_columns = ['Age', 'City', 'Current Status',
                   'Do you own multiple profiles on Instagram?', 'Gender',
                   'Highest Education', 'Location (City Airport Code)', 'Phone OS',
                   'State', 'Zone']
numeric_columns = ['Instagram Followers', 'Total Facebook Usage',
                   'Total Instagram Usage', 'Total Social Media Usage']

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `data` (the cause of SettingWithCopyWarning).
data_c = data[profile_columns + numeric_columns].copy()
for column in numeric_columns:
    # These columns are stored as text (with thousands separators); parse them to numbers
    data_c[column] = data_c[column].map(convert_to_num)
data_c.head()

In [None]:
# Pairwise scatter plots of followers vs. Facebook and Instagram usage
pair_columns = ['Instagram Followers', 'Total Facebook Usage', 'Total Instagram Usage']
axes = pd.plotting.scatter_matrix(data_c[pair_columns])
# Rotate the axis titles so the long column names do not overlap
for ax in axes.ravel():
    x_title, y_title = ax.xaxis.label, ax.yaxis.label
    x_title.set_rotation(90)
    y_title.set_rotation(0)
    y_title.set_ha('right')

In [None]:
# Pairwise correlation between followers and the two usage columns
correlation_columns = ['Instagram Followers', 'Total Facebook Usage', 'Total Instagram Usage']
data_c.loc[:, correlation_columns].corr()

<b>Although the Scatter Matrix did not tell us much, we can see from the Correlation Matrix that the number of Instagram Followers is negatively correlated with Facebook Usage. Hence, we can say that <i>the more Instagram Followers people get, the less they visit Facebook!</i><br>It is also intuitive, and validated by the Correlation Matrix, that more Instagram Followers leads to more Instagram Usage and vice-versa.</b>

<b>We see that parents scold their kids when they visit Social Media very often and ask them to concentrate on studies as Social Media might distract them. Let us validate their claims!<br/></b><i><u>My Hypothesis</u><br/><b>People with more degrees tend to have less Total Social Network Usage</b></i>

In [None]:
# Average usage per education level.
# Passing the raw rows to plt.bar draws one bar per row, stacked on top of each
# other at the same x position, so only the largest value per category stays
# visible. Aggregate first so each bar shows the group mean.
usage_by_education = data_c.groupby('Highest Education')['Total Social Media Usage'].mean()
plt.bar(usage_by_education.index, usage_by_education.values)
plt.xlabel('Highest Education')
plt.ylabel('Mean Total Social Media Usage')
plt.show()

<i><u>Conclusion</u></i><br><b>Now we have conclusive proof that our assumption might not be correct! We see that people with Post Graduate degrees tend to use social media more than people having Graduation degrees only.<br/>Can this be due to the fact that people with Graduation degrees tend to get into industrial jobs, whereas people with Post Graduation degrees tend to get into academics and go for further studies?!</b><br> Let's verify this.<br><br><p><i><u>My Hypothesis</u><br><b>People in Working Profession tend to visit Social Media less than others</b></i></p>

In [None]:
# Average usage per current status.
# plt.bar on the raw rows would overlay one bar per row at each category, which
# leaves only the largest value visible; plot the group mean instead.
usage_by_status = data_c.groupby('Current Status')['Total Social Media Usage'].mean()
plt.bar(usage_by_status.index, usage_by_status.values)
plt.xlabel('Current Status')
plt.ylabel('Mean Total Social Media Usage')
plt.xticks(rotation=20)
plt.show()

<i><u>Conclusion</u></i><br><b>We see from the bar chart that although Working Professionals visit social media less than Students (Intuitive!), it is not always true that Working Professionals tend to visit Social Media less than other people. We can see that people who have taken a Sabbatical visit social media less than Students and Working Professionals. People who are Self Employed are lesser. We can interpret and infer various reasons for this!</b>

<b>Let us go back to our Instagram Analogy. We saw the rise of Instagram and the shift to it from Facebook. Can we also say that Working Professionals are shifting to Instagram less than others?</b><p><i><u>Hypothesis</u><br><b>Working Professionals are not shifting to Instagram as much as others.</b></i></p>

In [None]:
# New column: Instagram usage minus Facebook usage (positive means more Instagram)
instagram_usage = data_c['Total Instagram Usage']
facebook_usage = data_c['Total Facebook Usage']
data_c['Insta-FB'] = instagram_usage.sub(facebook_usage)

In [None]:
# Average (Instagram - Facebook) usage per current status.
# plt.bar on the raw rows overlays one bar per row at each category, so only the
# extreme values would be visible; plot the group mean instead.
insta_fb_by_status = data_c.groupby('Current Status')['Insta-FB'].mean()
plt.bar(insta_fb_by_status.index, insta_fb_by_status.values)
# Reference line: above zero means more Instagram than Facebook usage on average
plt.axhline(0, color='black', linewidth=0.8)
plt.xlabel('Current Status')
plt.ylabel('Mean Insta-FB')
plt.xticks(rotation=20)
plt.show()

<i><u>Conclusion</u><br><b>We can see from the bar chart that Working Professionals are using as much Facebook as they are using Instagram. This is also almost true for people with Sabbatical. But we see a clear shift to Instagram for Students. The new generation seems to love Instagram!</b>

<b>Now let us talk about Instagram in more detail. How can one get a huge number of followers? Can we say that people tend to follow celebrities, and so people living in Metropolitan Cities have more followers than people in Non-Metropolitan Cities?</b> (A bit Stereotypical!)<p>Let's check this out!</p>

In [None]:
# Distinct city names present in the data
{city for city in data_c['City']}

In [None]:
# Lookup table: city name -> 1 if the city is flagged as metropolitan, 0 otherwise
metro_city_dict = {'Agra':0,
 'Ahmedabad':1,
 'Allahabad':0,
 'Amritsar':0,
 'Aurangabad':0,
 'Bagdogra':0,
 'Baroda':0,
 'Belgaum':0,
 'Bengaluru':1,
 'Bhavnagar':0,
 'Bhopal':0,
 'Bhubaneshwar':0,
 'Bhuj':0,
 'Chandigarh':0,
 'Chennai':1,
 'Coimbatore':0,
 'Cooch-behar':0,
 'DehraDun':0,
 'Delhi':1,
 'Dibrugarh':0,
 'Durgapur':0,
 'Goa':0,
 'Guwahati':0,
 'Gwalior':0,
 'Hyderabad':1,
 'Indore':0,
 'Jaipur':1,
 'Jammu':0,
 'Jorhat':0,
 'Kandla':0,
 'Kanpur':1,
 'Kochi':0,
 'Kolkata':1,
 'Kulu':0,
 'Lucknow':0,
 'Ludhiana':0,
 'Madurai':1,
 'Mangalore':0,
 'Mumbai':1,
 'Nagpur':1,
 'Nainital':0,
 'Pathankot':0,
 'Patna':1,
 'Pune':1,
 'Raipur':0,
 'Rajkot':0,
 'Ranchi':0,
 'Surat':1,
 'Thiruvananthapuram':0,
 'Tiruchirappalli':0,
 'Udaipur':0,
 'Varanasi':0,
 'Vishakhapatnam':1}
# Based on https://en.wikipedia.org/wiki/List_of_metropolitan_areas_in_India
# Series.map looks every city up in the dict; a city that is not a key becomes NaN.
# The flags come from data['City'] and are aligned to data_c on the row index.
data_c['Metropolitan'] = data['City'].map(metro_city_dict)
# Display the new flag column
data_c['Metropolitan']

In [None]:
# Let's take the complete data again, starting from a fresh selection of `data`
# (columns added to the previous data_c are not carried over)
profile_columns = ['Age', 'City', 'Current Status',
                   'Do you own multiple profiles on Instagram?', 'Gender',
                   'Highest Education', 'Location (City Airport Code)', 'Phone OS',
                   'State', 'Zone']
numeric_columns = ['Instagram Followers', 'Total Facebook Usage',
                   'Total Instagram Usage', 'Total Social Media Usage']

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `data` (the cause of SettingWithCopyWarning).
data_c = data[profile_columns + numeric_columns].copy()
for column in numeric_columns:
    # These columns are stored as text (with thousands separators); parse them to numbers
    data_c[column] = data_c[column].map(convert_to_num)
data_c.head()

In [None]:
# Flag each row as metropolitan (1) or not (0) by looking its city up in metro_city_dict
metro_flag_by_row = data['City'].map(metro_city_dict)
data_c['Metropolitan'] = metro_flag_by_row

In [None]:
# Mean Instagram follower count for each city
followersByCity = data_c['Instagram Followers'].groupby(data_c['City']).mean()
followersByCity

In [None]:
# Turn the per-city Series into a one-column DataFrame so more columns can be added
followersByCityDf = followersByCity.to_frame()
followersByCityDf

In [None]:
# The frame is indexed by city, so look the metro flag up from the index
city_metro_flags = followersByCityDf.index.map(metro_city_dict)
followersByCityDf['Metropolitan'] = city_metro_flags
followersByCityDf

In [None]:
# Average of the per-city means within each Metropolitan group (each city counts once)
followersByCityDf.groupby('Metropolitan')['Instagram Followers'].agg('mean')

<i><u>Conclusion</u><br><b>We see that our stereotypical claim is not true. Everyone can gather followers by talent irrespective of where they belong.</b>

<b>Let us check the same with respect to Standard Deviation.</b><p>It is up to you to decide whether less spread implies more certainty in gathering followers. You can analyze more on this!</p>

In [None]:
# Spread of the per-city means within each Metropolitan group
followersByCityDf.groupby('Metropolitan')['Instagram Followers'].agg('std')

<i><u>Conclusion</u><br><b>It seems that there is less spread in Metropolitan Cities for Instagram Followers Count</b></i>

<p>Let's check the same for the top 10 cities with respect to mean and standard deviation, to check for average followers and spread of followers respectively.</p><p><b>Top 10 Cities based on Mean of Instagram Followers</b></p>

In [None]:
# Ten cities with the highest mean follower count.
# The result holds means, so it is named accordingly rather than reusing the
# "...STD" name that belongs to the standard-deviation ranking.
followersByCityMean = (
    data_c.groupby('City')['Instagram Followers']
    .mean()
    .sort_values(ascending=False)
)
followersByCityMean.head(10)

<b>Top 10 Cities based on Standard Deviation of Instagram Followers</b>

In [None]:
# Per-city standard deviation of follower counts, smallest spread first
followersByCitySTD = (
    data_c.groupby('City')['Instagram Followers']
    .std()
    .sort_values()
)
followersByCitySTD.iloc[:10]

<b>Finally Let's see if number of Instagram Followers has anything to do with Gender</b>

In [None]:
# Mean Instagram follower count for each gender
followersByGender = data_c['Instagram Followers'].groupby(data_c['Gender']).mean()
followersByGender

<b>It is very clear that female respondents have a far higher average number of followers than male respondents</b>