[January 10, 2023]
Analyzing Survey Data:

In [None]:
import pandas as pd  # `pd` is the conventional short alias for the pandas library

# Load the public survey export into a DataFrame named `df`;
# the later cells of this notebook filter and summarise this frame.
df = pd.read_csv(filepath_or_buffer="survey_results_public.csv")

# Preview the top of the table. .head() returns the first five rows
# by default; the count is spelled out here for clarity.
df.head(5)

In [None]:
df.shape
# .shape is an attribute, not a method (no parentheses); it holds the
# size of the DataFrame as a (number_of_rows, number_of_columns) tuple.

In [None]:
# Analyzing Multiple Choice Survey Questions
# Yes or No Question: 'Do you think people born today will
# have a better life than their parents?'
df['BetterLife'].value_counts()
# df['BetterLife'] selects a single column of the DataFrame; a single
# column is a Series in pandas.
# Series.value_counts() counts how many times each unique entry
# occurs in that Series.

In [None]:
df['BetterLife'].value_counts(normalize=True)
# normalize=True: passed as an argument to value_counts() so each count
# is returned as a fraction of the total (a relative frequency between
# 0 and 1) rather than a raw count; multiply by 100 for a percentage.

In [None]:
# Yes or No Question: 'Do you believe that you need
# to be a manager to make more money?'
# normalize=True reports each answer as a fraction of all answers counted.
df['MgrMoney'].value_counts(normalize=True)

In [None]:
# Plotting Multiple Choice Answers:
%matplotlib inline
# %matplotlib inline is an instruction that tells Jupyter
# to display our charts inline.
df['SocialMedia'].value_counts().plot(kind="bar", figsize=(15,7), color="#61d199")
# .plot(kind='bar') tells python to plot results in a bar graph
# replacing 'bar' with 'pie' will chart results as a pie chart
# adjusting/modifying chart by adding arguements to .plot()
# figsize defines size of chart in the form of a 
# width and height in inches
# color defines the color of the bars

In [None]:
# Boolean mask: True for each row whose 'BetterLife' answer is exactly 'No'.
answered_no = df['BetterLife'] == 'No'
# Indexing df with the mask keeps only the rows where the mask is True.
said_no = df[answered_no]
# Show the first three matching rows.
said_no.head(n=3)

In [None]:
said_no.shape
# (rows, columns) of the filtered DataFrame: the row count is the number
# of respondents who answered 'No' to 'BetterLife'; every column of df
# is still present, not just 'BetterLife'.

In [None]:
# Sanity check on the filter: said_no was built from rows where
# 'BetterLife' == 'No', so 'No' should be the only value counted here.
said_no['BetterLife'].value_counts()

In [None]:
# Same filter as for said_no, this time keeping the 'Yes' answers.
answered_yes = df['BetterLife'] == 'Yes'
said_yes = df[answered_yes]
# Show the first three matching rows.
said_yes.head(n=3)

In [None]:
# (rows, columns) of the 'Yes' subset; the row count is the number of
# respondents who answered 'Yes' to 'BetterLife'.
said_yes.shape

In [None]:
# Sanity check on the filter: said_yes was built from rows where
# 'BetterLife' == 'Yes', so 'Yes' should be the only value counted here.
said_yes['BetterLife'].value_counts()

In [None]:
# Pull the two 'Age' columns once, then print the same four numbers in the
# same order: mean (No), mean (Yes), median (No), median (Yes).
ages_no = said_no['Age']
ages_yes = said_yes['Age']
print(ages_no.mean(), ages_yes.mean(), ages_no.median(), ages_yes.median())

In [None]:
# Age-bracket subsets. Both comparisons are inclusive: respondents aged
# exactly 50 land in `over50`, and those aged exactly 25 in `under25`.
over50 = df[df['Age'] >= 50]
under25 = df[df['Age'] <= 25]

# Headings state the inclusive bounds so they match the filters above.
# Each block shows the share of every 'BetterLife' answer within the bracket.
print('50 and over')
print(over50['BetterLife'].value_counts(normalize=True))
print()
print('25 and under')
print(under25['BetterLife'].value_counts(normalize=True))

In [None]:
# Number of rows in each age subset, one per line: over50 first, then under25.
print(len(over50), len(under25), sep='\n')

[January 16, 2023]
Filtering More Specific Subsets:

In [None]:
# Combining Boolean conditions with & and ~ for more specific subsets.
#   &  : element-wise AND - a row is kept only when both conditions are True.
#   &~ : "and not" - ~ inverts the Boolean that follows it, so a row is kept
#        only when the first condition is True and the second is False.
# Each comparison needs its own parentheses (or its own variable, as here)
# because & binds more tightly than ==.

is_optimistic = df['BetterLife'] == 'Yes'
lives_in_india = df['Country'] == 'India'
filtered_1 = df[is_optimistic & lives_in_india]
filtered_1

In [None]:
# Confirm the filter: print the value counts of both columns it used.
for column in ('BetterLife', 'Country'):
    print(filtered_1[column].value_counts())

In [None]:
# Keep only respondents who:
#   - answered Yes to the better life question
#   - are aged 50 or older (the comparison is >= 50)
#   - live in India
#   - do NOT code as a hobby
#   - did not answer 'Never' for open source contribution
# One condition per line; ~ negates the comparison that follows it.
filtered = df[
    (df['BetterLife'] == 'Yes')
    & (df['Age'] >= 50)
    & (df['Country'] == 'India')
    & ~(df['Hobbyist'] == "Yes")
    & ~(df["OpenSourcer"] == 'Never')
]
filtered

In [None]:
# Print the value counts of every column used in the filter, with a blank
# line between consecutive columns.
filter_columns = ('BetterLife', 'Age', 'Country', 'Hobbyist', 'OpenSourcer')
for position, name in enumerate(filter_columns):
    if position > 0:
        print()  # separator before every block except the first
    print(filtered[name].value_counts())

In [None]:
# Analyzing Multi-Answer Survey Questions
# Peek at the first five entries of the 'LanguageWorkedWith' column
# (.head() defaults to five) to see how the answers are stored.
df['LanguageWorkedWith'].head()

In [None]:
# Analyzing a multi-answer question for one specific answer:
# how many developers are using Python?
#
# Series.str.contains() checks each row of the 'LanguageWorkedWith' column
# for the string passed as its argument, giving True where it is found and
# False where it is not.
languages_worked_with = df["LanguageWorkedWith"]
python_bool = languages_worked_with.str.contains('Python')

# Share of each result as a fraction of the values counted.
# NOTE(review): rows with no answer are presumably missing rather than False,
# and value_counts() leaves missing values out by default - confirm.
python_bool.value_counts(normalize=True)

In [None]:
# Series.str.split() breaks each row apart at the delimiter passed to it;
# here the delimiter is ';'.
# expand=True returns the pieces as a new DataFrame with one column per
# position in the split, instead of a Series of lists.
lang_lists = df["LanguageWorkedWith"].str.split(pat=';', expand=True)
lang_lists.head(5)

In [None]:
# Count how many times each language was mentioned in total.
# DataFrame.stack() collapses the columns of lang_lists into one long Series
# holding the data points; value_counts() on that stacked Series gives the
# number of mentions per language.
language_mentions = lang_lists.stack().value_counts()
language_mentions.plot(kind='bar', figsize=(15,7), color='#61d199')

# Sign-off: completion date and the URL recorded as the project domain.
print('Project completed January 16, 2023')
print('Project domain: https://www.dataquest.io/blog/how-to-analyze-survey-data-python-beginner/')