# Pandas 101 (Corey Scfer Youtube Playlist)

In [1]:
# !pip install pandas
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv(r"./developer_survey_2019/survey_results_public.csv", index_col="Respondent")

In [5]:
df["Age"].nlargest(1000)

Respondent
9833     99.0
11332    99.0
11548    99.0
16214    99.0
20199    99.0
         ... 
81063    59.0
81320    59.0
82317    59.0
82465    59.0
84323    59.0
Name: Age, Length: 1000, dtype: float64

In [10]:
df["Age"].notnull().value_counts()

True     79210
False     9673
Name: Age, dtype: int64

### To set index

In [4]:
df_schema = pd.read_csv(r".\developer_survey_2019\survey_results_schema.csv", index_col="Column")
df_schema

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
Respondent,Randomized respondent ID number (not in order ...
MainBranch,Which of the following options best describes ...
Hobbyist,Do you code as a hobby?
OpenSourcer,How often do you contribute to open source?
OpenSource,How do you feel about the quality of open sour...
...,...
Sexuality,Which of the following do you currently identi...
Ethnicity,Which of the following do you identify as? Ple...
Dependents,"Do you have any dependents (e.g., children, el..."
SurveyLength,How do you feel about the length of the survey...


### Access by label using df.loc[]

In [5]:
df_schema.loc["EdLevel", "QuestionText"]

'Which of the following best describes the highest level of formal education that you’ve completed?'

In [6]:
df_schema.loc["Respondent":"OpenSource"]

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
Respondent,Randomized respondent ID number (not in order ...
MainBranch,Which of the following options best describes ...
Hobbyist,Do you code as a hobby?
OpenSourcer,How often do you contribute to open source?
OpenSource,How do you feel about the quality of open sour...


In [7]:
## Filtering and using contains method and cross checking with Re

In [8]:
py = df["LanguageWorkedWith"]
import re
patt = re.compile(r"Python")
retest = lambda x: re.search(patt, x)
retest("HTML/CSS;Java;JavaScript;")

In [9]:
# df["LanguageWorkedWith"] == [retest(i) for i in df["LanguageWorkedWith"]]
# df_filter1 = df["LanguageWorkedWith"]  == "Python"

In [10]:
## To replace nan values in a column with some other value

In [11]:
df["LanguageWorkedWith"].fillna("None", inplace =True)  # Replaces nan with string "None"

In [12]:
count= 0
nan_list = []
for index, i in enumerate(df.loc[:, "LanguageWorkedWith"]):
    # print(type(i))
    try:
        if retest(i):
            count += 1
    except Exception:
        nan_list.append((index, i))
#     if retest(i):
print(count)
# print(nan_list)
df_nan_list = pd.DataFrame(nan_list)
df_nan_list



36443


In [13]:
filter1 =( (df.LanguageWorkedWith.str.contains("Python")) &
           (df.LanguageWorkedWith.str.contains("SQL")) &
           (df.LanguageWorkedWith.str.contains("R")) &
           ((df.LanguageWorkedWith.str.len()== 12)))

In [14]:
type(df.LanguageWorkedWith.str)

pandas.core.strings.accessor.StringMethods

In [15]:
df.loc[~filter1, "LanguageWorkedWith"]

Respondent
1                          HTML/CSS;Java;JavaScript;Python
2                                      C++;HTML/CSS;Python
3                                                 HTML/CSS
4                                      C;C++;C#;Python;SQL
5              C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA
                               ...                        
88377                        HTML/CSS;JavaScript;Other(s):
88601                                                 None
88802                                                 None
88816                                                 None
88863    Bash/Shell/PowerShell;HTML/CSS;Java;JavaScript...
Name: LanguageWorkedWith, Length: 88726, dtype: object

## use `&` instead of python AND and `|` instead of python OR

### Use of __df.isin()__

In [16]:
countries = [
            "Sierra Leone",
            "Chad"
            ]
filter_countries = df["Country"].isin(countries)

In [17]:
df.loc[filter_countries, ["LanguageWorkedWith","Country"]]

Unnamed: 0_level_0,LanguageWorkedWith,Country
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1
559,Bash/Shell/PowerShell;C;C++;C#;HTML/CSS;Java;P...,Sierra Leone
44671,,Sierra Leone
81382,C;HTML/CSS;Java;JavaScript;PHP;SQL,Chad


In [18]:
df.columns

Index(['MainBranch', 'Hobbyist', 'OpenSourcer', 'OpenSource', 'Employment',
       'Country', 'Student', 'EdLevel', 'UndergradMajor', 'EduOther',
       'OrgSize', 'DevType', 'YearsCode', 'Age1stCode', 'YearsCodePro',
       'CareerSat', 'JobSat', 'MgrIdiot', 'MgrMoney', 'MgrWant', 'JobSeek',
       'LastHireDate', 'LastInt', 'FizzBuzz', 'JobFactors', 'ResumeUpdate',
       'CurrencySymbol', 'CurrencyDesc', 'CompTotal', 'CompFreq',
       'ConvertedComp', 'WorkWeekHrs', 'WorkPlan', 'WorkChallenge',
       'WorkRemote', 'WorkLoc', 'ImpSyn', 'CodeRev', 'CodeRevHrs', 'UnitTests',
       'PurchaseHow', 'PurchaseWhat', 'LanguageWorkedWith',
       'LanguageDesireNextYear', 'DatabaseWorkedWith',
       'DatabaseDesireNextYear', 'PlatformWorkedWith',
       'PlatformDesireNextYear', 'WebFrameWorkedWith',
       'WebFrameDesireNextYear', 'MiscTechWorkedWith',
       'MiscTechDesireNextYear', 'DevEnviron', 'OpSys', 'Containers',
       'BlockchainOrg', 'BlockchainIs', 'BetterLife', 'ITperson', 

## Change Headers or Column naemes in Pandas
> use `df.columns = headers_list` to modify all columns
> To change column name something to specific

In [19]:
# TO modify the header list in pandas to uppercase it
df.columns = [x.capitalize() for x in df.columns]

In [20]:
df.columns

Index(['Mainbranch', 'Hobbyist', 'Opensourcer', 'Opensource', 'Employment',
       'Country', 'Student', 'Edlevel', 'Undergradmajor', 'Eduother',
       'Orgsize', 'Devtype', 'Yearscode', 'Age1stcode', 'Yearscodepro',
       'Careersat', 'Jobsat', 'Mgridiot', 'Mgrmoney', 'Mgrwant', 'Jobseek',
       'Lasthiredate', 'Lastint', 'Fizzbuzz', 'Jobfactors', 'Resumeupdate',
       'Currencysymbol', 'Currencydesc', 'Comptotal', 'Compfreq',
       'Convertedcomp', 'Workweekhrs', 'Workplan', 'Workchallenge',
       'Workremote', 'Workloc', 'Impsyn', 'Coderev', 'Coderevhrs', 'Unittests',
       'Purchasehow', 'Purchasewhat', 'Languageworkedwith',
       'Languagedesirenextyear', 'Databaseworkedwith',
       'Databasedesirenextyear', 'Platformworkedwith',
       'Platformdesirenextyear', 'Webframeworkedwith',
       'Webframedesirenextyear', 'Misctechworkedwith',
       'Misctechdesirenextyear', 'Devenviron', 'Opsys', 'Containers',
       'Blockchainorg', 'Blockchainis', 'Betterlife', 'Itperson', 

In [21]:
df.rename(columns={"MAINBRANCH":"MAIN BRANCH"}, inplace=True)

In [22]:
df

Unnamed: 0_level_0,Mainbranch,Hobbyist,Opensourcer,Opensource,Employment,Country,Student,Edlevel,Undergradmajor,Eduother,...,Welcomechange,Sonewcontent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,Surveylength,Surveyease
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88377,,Yes,Less than once a month but more than once per ...,The quality of OSS and closed source software ...,"Not employed, and not looking for work",Canada,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",...,,Tech articles written by other developers;Tech...,,Man,No,,,No,Appropriate in length,Easy
88601,,No,Never,The quality of OSS and closed source software ...,,,,,,,...,,,,,,,,,,
88802,,No,Never,,Employed full-time,,,,,,...,,,,,,,,,,
88816,,No,Never,"OSS is, on average, of HIGHER quality than pro...","Independent contractor, freelancer, or self-em...",,,,,,...,,,,,,,,,,


In [23]:
df.columns.str.replace(" ","_",)

Index(['Mainbranch', 'Hobbyist', 'Opensourcer', 'Opensource', 'Employment',
       'Country', 'Student', 'Edlevel', 'Undergradmajor', 'Eduother',
       'Orgsize', 'Devtype', 'Yearscode', 'Age1stcode', 'Yearscodepro',
       'Careersat', 'Jobsat', 'Mgridiot', 'Mgrmoney', 'Mgrwant', 'Jobseek',
       'Lasthiredate', 'Lastint', 'Fizzbuzz', 'Jobfactors', 'Resumeupdate',
       'Currencysymbol', 'Currencydesc', 'Comptotal', 'Compfreq',
       'Convertedcomp', 'Workweekhrs', 'Workplan', 'Workchallenge',
       'Workremote', 'Workloc', 'Impsyn', 'Coderev', 'Coderevhrs', 'Unittests',
       'Purchasehow', 'Purchasewhat', 'Languageworkedwith',
       'Languagedesirenextyear', 'Databaseworkedwith',
       'Databasedesirenextyear', 'Platformworkedwith',
       'Platformdesirenextyear', 'Webframeworkedwith',
       'Webframedesirenextyear', 'Misctechworkedwith',
       'Misctechdesirenextyear', 'Devenviron', 'Opsys', 'Containers',
       'Blockchainorg', 'Blockchainis', 'Betterlife', 'Itperson', 

## In cell replacement

In [24]:
df.loc[147,"Languageworkedwith"] = "Jadoo"

In [25]:
df.loc[147]

Mainbranch                         I am a developer by profession
Hobbyist                                                       No
Opensourcer                                                 Never
Opensource      OSS is, on average, of HIGHER quality than pro...
Employment                                     Employed full-time
                                      ...                        
Sexuality                                 Straight / Heterosexual
Ethnicity                               Hispanic or Latino/Latina
Dependents                                                     No
Surveylength                                Appropriate in length
Surveyease                                                   Easy
Name: 147, Length: 84, dtype: object

In [26]:
# To replace teh entire slice

In [27]:
df.loc[147:150, "Languageworkedwith":"Misctechworkedwith"] = [1,2,3,4,5,6,7,8,9]

In [28]:
df.loc[147:151, "Languageworkedwith":"Misctechworkedwith"]

Unnamed: 0_level_0,Languageworkedwith,Languagedesirenextyear,Databaseworkedwith,Databasedesirenextyear,Platformworkedwith,Platformdesirenextyear,Webframeworkedwith,Webframedesirenextyear,Misctechworkedwith
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
147,1,2,3,4,5,6,7,8.0,9
148,1,2,3,4,5,6,7,8.0,9
149,1,2,3,4,5,6,7,8.0,9
150,1,2,3,4,5,6,7,8.0,9
151,Bash/Shell/PowerShell;C++;C#;HTML/CSS;Java;Jav...,Bash/Shell/PowerShell;C++;C#;Go;Java;Kotlin;Py...,Elasticsearch;MongoDB;Microsoft SQL Server;MySQL,Elasticsearch;MongoDB;Microsoft SQL Server,Android;AWS;Docker;Linux;Microsoft Azure;Windows,Android;AWS;Docker;Kubernetes;Linux;Microsoft ...,Angular/Angular.js;React.js,,.NET;.NET Core


## using df.at is fasteer for scaalar i.e single item and iloc and loc for multiple items

In [29]:
df.at[147,"Languageworkedwith"] = "IBLISH"
df.at[147,"Languageworkedwith"]

'IBLISH'

In [30]:
df.columns

Index(['Mainbranch', 'Hobbyist', 'Opensourcer', 'Opensource', 'Employment',
       'Country', 'Student', 'Edlevel', 'Undergradmajor', 'Eduother',
       'Orgsize', 'Devtype', 'Yearscode', 'Age1stcode', 'Yearscodepro',
       'Careersat', 'Jobsat', 'Mgridiot', 'Mgrmoney', 'Mgrwant', 'Jobseek',
       'Lasthiredate', 'Lastint', 'Fizzbuzz', 'Jobfactors', 'Resumeupdate',
       'Currencysymbol', 'Currencydesc', 'Comptotal', 'Compfreq',
       'Convertedcomp', 'Workweekhrs', 'Workplan', 'Workchallenge',
       'Workremote', 'Workloc', 'Impsyn', 'Coderev', 'Coderevhrs', 'Unittests',
       'Purchasehow', 'Purchasewhat', 'Languageworkedwith',
       'Languagedesirenextyear', 'Databaseworkedwith',
       'Databasedesirenextyear', 'Platformworkedwith',
       'Platformdesirenextyear', 'Webframeworkedwith',
       'Webframedesirenextyear', 'Misctechworkedwith',
       'Misctechdesirenextyear', 'Devenviron', 'Opsys', 'Containers',
       'Blockchainorg', 'Blockchainis', 'Betterlife', 'Itperson', 

In [31]:
## Replace an entire column
df["Trans"] = df["Trans"] + "majja"

In [32]:
df["Trans"].value_counts()

Nomajja     82576
Yesmajja     1031
Name: Trans, dtype: int64

## Apply
It is used to  apply a function
df

In [33]:
def makeold(x):
    """This functions makes you old by 5 Years.
     Blame Thanos"""
    return x + 5
def makeyoung(x):
    """This functions makes you young by 5 Years.
     Thanks Beuty cream"""
    return x - 5


In [34]:
makeold.__doc__

'This functions makes you old by 5 Years.\n     Blame Thanos'

In [35]:
df["Age"] = df["Age"].apply(makeold)

In [36]:
df.loc[1, "Age"]

19.0

In [37]:
df["Age"] = df["Age"].apply(makeyoung)

In [38]:
df.loc[1, "Age"]

14.0

In [39]:
# df[["Age","Hobbyist"]].apply()

In [40]:
help(df.apply)

Help on method apply in module pandas.core.frame:

apply(func: 'AggFuncType', axis: 'Axis' = 0, raw: 'bool' = False, result_type=None, args=(), **kwargs) method of pandas.core.frame.DataFrame instance
    Apply a function along an axis of the DataFrame.
    
    Objects passed to the function are Series objects whose index is
    either the DataFrame's index (``axis=0``) or the DataFrame's columns
    (``axis=1``). By default (``result_type=None``), the final return type
    is inferred from the return type of the applied function. Otherwise,
    it depends on the `result_type` argument.
    
    Parameters
    ----------
    func : function
        Function to apply to each column or row.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis along which the function is applied:
    
        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.
    
    raw : bool, default False
        Determines if row or column is passed as 

In [41]:
df.apply(len, axis="columns").head(5)  # by defaul axis ="rows

Respondent
1    84
2    84
3    84
4    84
5    84
dtype: int64

In [42]:
df["Age"].max()

99.0

In [43]:
df.applymap(len)

TypeError: object of type 'float' has no len()

## Using map in panda

In [None]:
replacing_dict = {"Yes": 1, "No":0}
df["Hobbyist"].map(replacing_dict)

In [None]:
df["Hobbyist"]

In [None]:
df.rename(columns={"Convertedcomp": "Paisa_Paisa"}, inplace = True)

In [None]:
df.columns

In [None]:
df["Hobbyist"] + df["Gender"]

In [None]:
df

In [None]:
df.iloc[-1]

In [None]:
df.append({"Hobbyist": True, "Gender":"Man", "Age": 45}, ignore_index=True).iloc[-1]

In [None]:
df.iloc[-1]

In [None]:
df.drop(index = [88863,88862], inplace=True)

In [None]:
df.loc[88863]

## To drop rows using filter

In [None]:
filter2 = df["Sotimesaved"] == "They were about the same"  # Creates a filter
filter2_index = df[filter2].index  # returns indexes where the filter is matched
# To drop the above filtered indexes use
df.drop(index=filter2_index, inplace=True)

## To Sort DataFrame

In [None]:
# Creats a smaller df for testing purposes
colummns1 = ["Student", "Hobbyist", "Employment", "Country", "Age", "LanguageWorkedWith", "ScreenName", "OpSys"]
testdf = df.loc[1:100, colummns1]

In [None]:
testdf

In [None]:
columns_list = list(df.columns)

In [None]:
Detail_columns_dict = {}
for column in columns_list:
    Detail_columns_dict[column] = df_schema.loc[column, "QuestionText"]

In [None]:
Detail_columns_dict["OpenSourcer"]

In [None]:
testdf

In [None]:
testdf.sort_values(by=["Country", "Hobbyist", "Age"])

In [None]:
test2df=testdf[["ScreenName","Country","Age"]]
test2df.sort_values(by=[ "Country", "ScreenName", "Age"], ascending=[True,True, False],inplace=True)

In [None]:
test2df.sort_index(inplace)

In [None]:
test2df.sort_index(ascending=False)

In [None]:
df[["Country","Student","Age", "ConvertedComp"]].sort_values(by=["Country","ConvertedComp"], ascending=[True, False]).head(50)

In [None]:
df[["ConvertedComp","Age"]].nlargest(5, columns=["ConvertedComp", "Age"])

In [None]:
df[["ConvertedComp","Age"]].nsmallest(5, columns=["ConvertedComp", "Age"])

In [None]:
df.head()

In [None]:
import seaborn as sns

In [None]:
df.rename(columns={"ConvertedComp":"salary"}, inplace=True)
df.rename(columns={"Age":"age"}, inplace=True)


In [None]:
df["salary"].median()

In [None]:
df["age"].median()

In [None]:
df.median()

In [None]:
df.describe()

In [None]:
df["salary"].count(), df["salary"].size

In [47]:
df["Hobbyist"].value_counts(normalize=True)

Yes    0.801694
No     0.198306
Name: Hobbyist, dtype: float64

In [50]:
social_media_valcounts = df["SocialMedia"].value_counts(normalize=True)*100

In [46]:
df_schema.loc["SocialMedia", "QuestionText"]

'What social media site do you use the most?'

In [51]:
social_media_valcounts

Reddit                      17.023343
YouTube                     16.379076
WhatsApp                    15.807051
Facebook                    15.606902
Twitter                     13.498822
Instagram                    7.414996
I don't use social media     6.577685
LinkedIn                     5.330602
WeChat 微信                    0.789938
Snapchat                     0.743750
VK ВКонта́кте                0.714142
Weibo 新浪微博                   0.066322
Youku Tudou 优酷               0.024871
Hello                        0.022502
Name: SocialMedia, dtype: float64

In [54]:
country_grp = df.groupby(["Country"])

In [55]:
country_grp.get_group("India")

Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,I code primarily as a hobby,Yes,Less than once per year,"OSS is, on average, of HIGHER quality than pro...","Not employed, but looking for work",India,,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...","Taught yourself a new language, framework, or ...",...,A lot more welcome now than last year,Tech articles written by other developers;Indu...,24.0,Man,No,Straight / Heterosexual,,,Appropriate in length,Neither easy nor difficult
10,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,India,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)",,,...,Somewhat less welcome now than last year,Tech articles written by other developers;Tech...,,,,,,Yes,Too long,Difficult
15,I am a student who is learning to code,Yes,Never,"OSS is, on average, of HIGHER quality than pro...","Not employed, but looking for work",India,"Yes, full-time","Secondary school (e.g. American high school, G...",,Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,20.0,Man,No,,,Yes,Too long,Neither easy nor difficult
50,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of LOWER quality than prop...",Employed full-time,India,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Another engineering discipline (ex. civil, ele...",Received on-the-job training in software devel...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Tech...,23.0,Man,No,,South Asian,No,Too long,Easy
65,I am a developer by profession,Yes,Never,,Employed full-time,India,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Information systems, information technology, o...",,...,A lot more welcome now than last year,,21.0,Man,No,,,Yes,Appropriate in length,Neither easy nor difficult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77339,,Yes,Less than once per year,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,India,"Yes, full-time","Bachelor’s degree (BA, BS, B.Eng., etc.)","Another engineering discipline (ex. civil, ele...",Taken an online course in programming or softw...,...,Not applicable - I did not use Stack Overflow ...,Tech articles written by other developers;Indu...,,,,,,,,
79795,,Yes,Less than once a month but more than once per ...,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,India,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Somewhat more welcome now than last year,Tech meetups or events in your area;Courses on...,,Man,No,Straight / Heterosexual,,No,Too long,Difficult
83862,,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,India,"Yes, full-time","Bachelor’s degree (BA, BS, B.Eng., etc.)",,Participated in a hackathon,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,24.0,Man,No,Straight / Heterosexual,,Yes,Too long,Neither easy nor difficult
84299,,Yes,Never,The quality of OSS and closed source software ...,Employed full-time,India,"Yes, full-time","Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,Taken an online course in programming or softw...,...,Somewhat more welcome now than last year,,,,,,,,,


In [None]:
country_grp["ConvertedComp"].median().loc["India"]

In [62]:
country_grp["ConvertedComp"].agg(["median", "mean", "max", "mean"]).loc["India"]

median    1.008000e+04
mean      2.805766e+04
max       2.000000e+06
mean      2.805766e+04
Name: India, dtype: float64

In [65]:
filt = (df["Country"] == "India" )
df.loc[filt]["LanguageWorkedWith"].str.contains("Python").value_counts()

False    5739
True     3105
Name: LanguageWorkedWith, dtype: int64

In [69]:
country_grp["LanguageWorkedWith"].size()

Country
Afghanistan                              44
Albania                                  86
Algeria                                 134
Andorra                                   7
Angola                                    5
                                       ... 
Venezuela, Bolivarian Republic of...     88
Viet Nam                                231
Yemen                                    19
Zambia                                   12
Zimbabwe                                 39
Name: LanguageWorkedWith, Length: 179, dtype: int64

In [None]:
def percent_lang(x):


In [81]:
percentdf = country_grp["LanguageWorkedWith"].apply(lambda x: x.str.contains("Python").sum())
percentdf

AttributeError: 'Series' object has no attribute 'columns'

In [80]:
countrydf = country_grp["Country"].value_counts()
countrydf.columns

AttributeError: 'Series' object has no attribute 'columns'

In [86]:
percentdf.index

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'United Kingdom', 'United Republic of Tanzania', 'United States',
       'Uruguay', 'Uzbekistan', 'Venezuela, Bolivarian Republic of...',
       'Viet Nam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', name='Country', length=179)

In [104]:
dfres = pd.DataFrame()
c,kp,tc,p, e =[], [], [], [], []

dfres.set_index("Serial_Number", inplace=True)

KeyError: "None of ['Serial_Number'] are in the columns"

In [150]:
import string
import random
str_list = []
no_of_items_on_list = 179
no_of_characters_in_each_string_in_list = 5
letters = string.ascii_letters
while no_of_items_on_list > 0:
    str_list.append(''.join(random.choice(letters) for i in range(no_of_characters_in_each_string_in_list))+"@"+''.join(random.choice(letters) for i in range(no_of_characters_in_each_string_in_list))+"."+random.choice(["com", "org", "edu"]))
    no_of_items_on_list -=1
print(len(str_list))


179


In [139]:
for i in range(len(percentdf.index)):
    # print(percentdf.index[i])
    perctage = (percentdf[i]/countrydf[i]) * 100
    c.append(percentdf.index[i])
    kp.append(percentdf[i])
    tc.append(countrydf[i])
    p.append(perctage)


In [149]:
dfres["CountryName"] = c
dfres["KnowPython"] = kp
dfres["TotalCount"] = tc
dfres["Percentage"] = p
dfres["E-mail"] = str_list

ValueError: Length of values (537) does not match length of index (179)

In [129]:
dfres.set_index("CountryName", inplace=True)

KeyError: "None of ['CountryName'] are in the columns"

In [130]:
dfres.sort_values(by="Percentage", ascending=False)
dfres

Unnamed: 0_level_0,KnowPython,TotalCount,Percentage
CountryName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,8,44,18.181818
Albania,23,86,26.744186
Algeria,40,134,29.850746
Andorra,0,7,0.000000
Angola,2,5,40.000000
...,...,...,...
"Venezuela, Bolivarian Republic of...",28,88,31.818182
Viet Nam,78,231,33.766234
Yemen,3,19,15.789474
Zambia,4,12,33.333333


In [None]:
dfres.loc("United States", axis=0)

In [131]:
dfres.nlargest(20, columns="KnowPython")

Unnamed: 0_level_0,KnowPython,TotalCount,Percentage
CountryName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
United States,10083,20949,48.131176
India,3105,9061,34.267741
Germany,2451,5866,41.783157
United Kingdom,2384,5737,41.55482
Canada,1558,3395,45.891016
France,1054,2391,44.081974
Australia,790,1903,41.5134
Brazil,767,1948,39.373717
Netherlands,767,1852,41.414687
Poland,751,1922,39.073881


In [182]:
df["Age"].mean()

30.336698649160446

In [152]:
dfres["E-mail"] = str_list

In [133]:
df.head(1)

Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult


In [153]:
dfres

Unnamed: 0_level_0,KnowPython,TotalCount,Percentage,E-mail
CountryName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,8,44,18.181818,pVFCn@luqqT.org
Albania,23,86,26.744186,gseip@Pnfzz.edu
Algeria,40,134,29.850746,KpRdy@EDPbJ.org
Andorra,0,7,0.000000,sMMze@hNNEU.com
Angola,2,5,40.000000,tjzbO@RRJBX.com
...,...,...,...,...
"Venezuela, Bolivarian Republic of...",28,88,31.818182,reDmS@IfZoH.org
Viet Nam,78,231,33.766234,ItZAa@FThGm.org
Yemen,3,19,15.789474,XnHWt@ZBOWZ.edu
Zambia,4,12,33.333333,XQAzX@hVrhI.org


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [157]:
dftest = df.head(200)

In [158]:
dftest

Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,I am a developer by profession,No,Never,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Australia,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","A natural science (ex. biology, chemistry, phy...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,40.0,Man,No,Straight / Heterosexual,White or of European descent,Yes,Appropriate in length,Easy
198,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...","Independent contractor, freelancer, or self-em...",France,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)","Computer science, computer engineering, or sof...",Taken a part-time in-person course in programm...,...,Just as welcome now as I felt last year,,30.0,Man,No,Straight / Heterosexual,White or of European descent,Yes,Appropriate in length,Neither easy nor difficult
199,"I am not primarily a developer, but I write co...",Yes,Less than once per year,"OSS is, on average, of HIGHER quality than pro...","Not employed, but looking for work",Netherlands,,"Other doctoral degree (Ph.D, Ed.D., etc.)",Mathematics or statistics,,...,,,,Man,,,White or of European descent,,,
200,I am a developer by profession,No,Less than once per year,The quality of OSS and closed source software ...,Employed part-time,Germany,No,"Professional degree (JD, MD, etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Cour...,43.0,Man,No,,,Yes,Appropriate in length,Easy


In [161]:
dftest.to_csv("dftest.csv")

In [162]:
dftest= pd.read_csv("dftest.csv")

In [172]:
dftest["Age"].median()

29.0

In [174]:
import numpy as np

In [175]:
dftest["Age"].replace(np.nan,29, inplace = True )

In [176]:
dftest

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
0,1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
1,2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
3,4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
4,5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,197,I am a developer by profession,No,Never,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Australia,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","A natural science (ex. biology, chemistry, phy...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,40.0,Man,No,Straight / Heterosexual,White or of European descent,Yes,Appropriate in length,Easy
196,198,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...","Independent contractor, freelancer, or self-em...",France,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,,30.0,Man,No,Straight / Heterosexual,White or of European descent,Yes,Appropriate in length,Neither easy nor difficult
197,199,"I am not primarily a developer, but I write co...",Yes,Less than once per year,"OSS is, on average, of HIGHER quality than pro...","Not employed, but looking for work",Netherlands,,"Other doctoral degree (Ph.D, Ed.D., etc.)",Mathematics or statistics,...,,,29.0,Man,,,White or of European descent,,,
198,200,I am a developer by profession,No,Less than once per year,The quality of OSS and closed source software ...,Employed part-time,Germany,No,"Professional degree (JD, MD, etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Cour...,43.0,Man,No,,,Yes,Appropriate in length,Easy


In [180]:
dftest["Age"].dtypes

dtype('float64')

In [184]:
df.astype("object")

Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88377,,Yes,Less than once a month but more than once per ...,The quality of OSS and closed source software ...,"Not employed, and not looking for work",Canada,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",...,,Tech articles written by other developers;Tech...,,Man,No,,,No,Appropriate in length,Easy
88601,,No,Never,The quality of OSS and closed source software ...,,,,,,,...,,,,,,,,,,
88802,,No,Never,,Employed full-time,,,,,,...,,,,,,,,,,
88816,,No,Never,"OSS is, on average, of HIGHER quality than pro...","Independent contractor, freelancer, or self-em...",,,,,,...,,,,,,,,,,


In [201]:
df["YearsCode"] = df["YearsCode"].astype("float")

In [200]:
listy =[i for i in df["YearsCode"] if i

Series([], Name: YearsCode, dtype: object)

In [209]:
median_ = df["YearsCode"].median()
mean_ = df["YearsCode"].mean()

In [208]:
df["YearsCode"].replace("Less than 1 year", np.nan, inplace=True)
df["YearsCode"].replace("More than 50 years", np.nan, inplace=True)
df["YearsCode"].replace(np.nan, np.nan, inplace=True)


In [210]:
df["YearsCode"].unique()

array([ 4., nan,  3., 16., 13.,  6.,  8., 12.,  2.,  5., 17., 10., 14.,
       35.,  7., 30.,  9., 26., 40., 19., 15., 20., 28., 25.,  1., 22.,
       11., 33., 50., 41., 18., 34., 24., 23., 42., 27., 21., 36., 32.,
       39., 38., 31., 37., 29., 44., 45., 48., 46., 43., 47., 49.])

In [216]:
mean = [1,2,3]
mean_ = [1,2,3]
mean_.append(1)
mean == mean_

False

In [217]:
mean, mean_

([1, 2, 3], [1, 2, 3, 1])

In [218]:
df["YearsCode"].mean()

11.789195442188674