# Pandas
Pandas is a fast, powerful, flexible, and easy-to-use open-source data analysis and manipulation tool,
built on top of the Python programming language.

In [1]:
import pandas as pd

In [16]:
# load the public survey results into a dataframe
df = pd.read_csv('data/survey_results_public.csv')

# load the survey schema, using its 'Column' field as the row index so that
# entries can be looked up by column name with .loc
schema_df = pd.read_csv('data/survey_results_schema.csv', index_col='Column')

In [3]:
# shape of the dataframe as a (number of rows, number of columns) tuple
df.shape

(64461, 61)

In [4]:
# print a summary of the dataframe: index range, plus the non-null count
# and dtype of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64461 entries, 0 to 64460
Data columns (total 61 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Respondent                    64461 non-null  int64  
 1   MainBranch                    64162 non-null  object 
 2   Hobbyist                      64416 non-null  object 
 3   Age                           45446 non-null  float64
 4   Age1stCode                    57900 non-null  object 
 5   CompFreq                      40069 non-null  object 
 6   CompTotal                     34826 non-null  float64
 7   ConvertedComp                 34756 non-null  float64
 8   Country                       64072 non-null  object 
 9   CurrencyDesc                  45472 non-null  object 
 10  CurrencySymbol                45472 non-null  object 
 11  DatabaseDesireNextYear        44070 non-null  object 
 12  DatabaseWorkedWith            49537 non-null  object 
 13  D

In [5]:
# Raise both display limits to 61 (the survey frame has 61 columns) so that
# wide or long outputs are shown in full instead of being elided with "...".
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 61)

In [6]:
# show the first 10 rows; head() returns 5 rows when n is not given
schema_df.head(n=10)


Unnamed: 0,Column,QuestionText
0,Respondent,Randomized respondent ID number (not in order ...
1,MainBranch,Which of the following options best describes ...
2,Hobbyist,Do you code as a hobby?
3,Age,What is your age (in years)? If you prefer not...
4,Age1stCode,At what age did you write your first line of c...
5,CompFreq,"Is that compensation weekly, monthly, or yearly?"
6,CompTotal,What is your current total compensation (salar...
7,ConvertedComp,Salary converted to annual USD salaries using ...
8,Country,Where do you live?
9,CurrencyDesc,Which currency do you use day-to-day? If your ...


### Series
A Series is a one-dimensional, array-like structure with an index, holding values of a single data type.

### Dataframes
A DataFrame is a two-dimensional, labeled table in which each column is a Series, so different columns can hold different data types.

In [7]:
# Build a DataFrame from a dictionary: each key becomes a column label and
# each list supplies that column's values (all lists have the same length).
people = dict(
    first=["Corey", 'Jane', 'John'],
    last=["Schafer", 'Doe', 'Doe'],
    email=["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com'],
)

df1 = pd.DataFrame(data=people)

# last expression of the cell, so the frame is rendered as the output
df1

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [8]:
# access a single column: one label inside [] returns a Series.
# print() is used because only the last expression of a cell is displayed.
print(df1['last'])

# access multiple columns: a list of labels inside [] returns a DataFrame
df1[['first','email']]

0    Schafer
1        Doe
2        Doe
Name: last, dtype: object


Unnamed: 0,first,email
0,Corey,CoreyMSchafer@gmail.com
1,Jane,JaneDoe@email.com
2,John,JohnDoe@email.com


In [9]:
# grab the column labels (returned as a pandas Index object)
df1.columns

Index(['first', 'last', 'email'], dtype='object')

#### loc and iloc

In [10]:
# iloc selects by integer position: rows at positions 0 and 2, column at
# position 2 (the 'email' column of df1). This must use df1, the small
# people frame; on the survey frame `df`, position 2 is the 'Hobbyist' column.
df1.iloc[[0, 2], 2]

0    CoreyMSchafer@gmail.com
2          JohnDoe@email.com
Name: email, dtype: object

In [11]:
# loc selects by label: rows labelled 0 and 1 (df1 still has its default
# integer index here), and the columns labelled 'last' and 'email'
df1.loc[[0,1], ['last','email']]

Unnamed: 0,last,email
0,Schafer,CoreyMSchafer@gmail.com
1,Doe,JaneDoe@email.com


In [12]:
# count how many survey respondents answered each value of 'Hobbyist'
# (i.e. whether they code as a hobby); missing answers are not counted
df['Hobbyist'].value_counts()

Yes    50388
No     14028
Name: Hobbyist, dtype: int64

In [13]:
# get the first 10 rows, columns 'Hobbyist' through 'CompFreq'.
# Unlike ordinary Python slices, .loc label slices include BOTH endpoints,
# so rows 0..9 are written as 0:9 (0:10 would return 11 rows), and the
# column slice includes 'CompFreq' itself.
df.loc[0:9, 'Hobbyist':'CompFreq']

Unnamed: 0,Hobbyist,Age,Age1stCode,CompFreq
0,Yes,,13,Monthly
1,No,,19,
2,Yes,,15,
3,Yes,25.0,18,
4,Yes,31.0,16,
5,No,,14,
6,Yes,,18,Monthly
7,Yes,36.0,12,Yearly
8,No,30.0,20,
9,Yes,22.0,14,Yearly


### Indexes

In [14]:
# Use the 'email' column as the row index.
# The frame is rebuilt from `people` and rebound instead of mutated with
# inplace=True, so this cell can be re-run safely: a second in-place
# set_index('email') would raise KeyError because 'email' is no longer a column.
df1 = pd.DataFrame(people).set_index('email')

print(df1.index)

# access a row by its index label (returned as a Series)
df1.loc['JohnDoe@email.com']

Index(['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com'], dtype='object', name='email')


first    John
last      Doe
Name: JohnDoe@email.com, dtype: object

In [19]:
# look up a single cell by (row label, column label): the question text
# recorded in the schema for the 'Hobbyist' column
schema_df.loc['Hobbyist', 'QuestionText']

'Do you code as a hobby?'

In [22]:
# sort index alphabetically.
# A cell only renders its last expression, so the ascending result is shown
# explicitly with display(); as a bare expression it would be computed and
# silently discarded.
display(schema_df.sort_index())

# sort in descending order (sort_index returns a new frame; schema_df itself
# is left unchanged)
schema_df.sort_index(ascending=False)

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
YearsCodePro,"NOT including education, how many years have y..."
YearsCode,"Including any education, how many years have y..."
WorkWeekHrs,"On average, how many hours per week do you wor..."
WelcomeChange,"Compared to last year, how welcome do you feel..."
WebframeWorkedWith,Which web frameworks have you done extensive d...
WebframeDesireNextYear,Which web frameworks have you done extensive d...
UndergradMajor,What was your primary field of study?
Trans,Are you transgender?
SurveyLength,How do you feel about the length of the survey...
SurveyEase,How easy or difficult was this survey to compl...
