In [1]:
import pandas as pd

In [2]:
# Survey responses, with the 'Respondent' column used as the row index.
df = pd.read_csv(
    'data/survey_results_public.csv',
    index_col='Respondent',
)

# Schema table, with the 'Column' column used as the row index.
schema_df = pd.read_csv(
    'data/survey_results_schema.csv',
    index_col='Column',
)

In [3]:
# Let pandas display up to 85 columns and 85 rows before truncating output.
pd.options.display.max_columns = 85
pd.options.display.max_rows = 85

In [None]:
# Preview the first three rows of the responses frame.
df.iloc[:3]

In [6]:
# Boolean mask: True for rows whose 'ConvertedComp' value is above 70000.
high_salary = df['ConvertedComp'].gt(70000)

In [None]:
# Keep only the rows flagged True by the high_salary mask (all columns).
df[high_salary]

In [None]:
# Same high_salary rows, narrowed to three columns of interest.
df.loc[
    high_salary,
    ['Country', 'ConvertedComp', 'LanguageWorkedWith'],
]

In [14]:
# Filtering by country.
#
# Collect the countries of interest in a list; it is then matched against the
# 'Country' column of the dataset to build a row filter.
countries = [
    'United States',
    'India',
    'United Kingdom',
    'Germany',
    'Canada',
]

In [15]:
# True for every row whose 'Country' value is one of the names in `countries`.
filt = df.loc[:, 'Country'].isin(countries)

In [None]:
# 'Country' values for just the rows that matched the country list.
df['Country'].loc[filt]

In [18]:
# Build a Boolean mask marking the rows whose 'LanguageWorkedWith' entry mentions Python.
#
# How the expression works:
#   * df['LanguageWorkedWith'] selects that column of df as a Series; the .str accessor
#     used next assumes the column holds strings / text data.
#   * .str exposes string operations on a Series (a column containing text data).
#   * .str.contains('Python') checks each element for the substring 'Python' and returns
#     a Boolean Series: True where the element contains 'Python', False otherwise.
#   * na=False controls how missing values (NaN) are treated: they come out as False,
#     i.e. a missing entry counts as "does not contain 'Python'".
#
# The resulting Series can then be used as a filter to select the rows of df where the
# condition is True.
#
# NOTE(review): the name `filter` shadows Python's built-in filter(); it is left
# unchanged here because the cells that follow refer to the mask by this name.
filter = (
    df['LanguageWorkedWith']
    .str.contains('Python', na=False)
)

In [None]:
# Display the Boolean mask itself (the Series produced by .str.contains above).
filter

In [None]:
# `filter` is the Boolean Series created earlier: True where the 'LanguageWorkedWith'
# column contains the substring 'Python'.
#
# Indexing df with it returns a new DataFrame holding only the rows flagged True.
df[filter]