In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

# Countries

In [None]:
# Read the per-country counts; the path is relative to the notebook's working directory.
df_country = pd.read_csv('so_yearly/so_countries_2015-20.csv')

In [None]:
# Names of the ten countries with the largest 2020 counts, largest first.
countries = (
    df_country
    .query("year == 2020")
    .sort_values(by='count', ascending=False)
    .country[:10]
    .values
    .tolist()
)

# Keep the rows of every year for those countries and animate the counts by year.
graph_df = df_country.loc[df_country.country.isin(countries)]
px.line(
    graph_df,
    x="country",
    y="count",
    hover_name="country",
    animation_frame="year",
    range_y=[0, 25000],
)

In [None]:
# Preview the first rows of the country table.
df_country.head()

Unnamed: 0,country,count,year
0,United States,4745,2015
1,India,2461,2015
2,United Kingdom,2403,2015
3,Germany,1976,2015
4,Poland,833,2015


# Databases

In [None]:
# Read the per-database counts and preview the first rows.
df_database = pd.read_csv('so_yearly/so_databases_2017-20.csv')
df_database.head()

Unnamed: 0,database,count,year
0,MySQL,16375,2017
1,SQLite,7838,2017
2,MongoDB,6192,2017
3,Redis,4143,2017
4,SQL Server,11358,2017


In [None]:
# Lowercase keywords; databases_validator matches them as substrings of the lowercased database name.
selected_databases = ['oracle', 'mysql', 'redis', 'cassandra', 'postgresql']

# Accumulator that databases_validator appends to; re-created empty each time this cell runs.
filtered_list = []

def databases_validator(row):
    """Record ``row`` once for every selected database keyword it mentions.

    Each keyword in the module-level ``selected_databases`` is tested as a
    substring of the lowercased ``row.database``. Every hit appends a
    ``{'database', 'count', 'year'}`` record (labelled with the keyword, not
    the original name) to the module-level ``filtered_list``.

    Returns None; the function is called for its side effect only.
    """
    for keyword in selected_databases:
        if keyword not in row.database.lower():
            continue
        record = {
            'database': keyword,
            'count': row['count'],
            'year': row['year'],
        }
        filtered_list.append(record)

# Apply databases_validator to each row (axis=1). Only its appends to filtered_list matter,
# so the returned object is discarded into `_`.
_ = df_database.agg(databases_validator, axis=1)
# Number of records collected (one per row/keyword match).
print(len(filtered_list))

20


In [None]:
# Total count per (database keyword, year), flattened back to ordinary columns.
df_filtered_databases = (
    pd.DataFrame(filtered_list)
    .groupby(['database', 'year'])
    .agg({'count': 'sum'})
    .reset_index()
)

In [None]:

# Animated line chart: one frame per year, selected databases on the x axis.
px.line(
    df_filtered_databases,
    x="database",
    y="count",
    hover_name="database",
    animation_frame="year",
    range_y=[0, 50000],
)

# Dev-Types

In [None]:
# Read the per-developer-type counts and preview the first rows.
df_devtype = pd.read_csv('so_yearly/so_devtypes_2015-20.csv')
df_devtype.head()

Unnamed: 0,dev_type,count,year
0,Full-stack web developer,6765,2017
1,Student,2845,2017
2,Back-end web developer,2104,2017
3,Desktop developer,1735,2017
4,Front-end web developer,1242,2017


In [None]:
# Lowercase keywords; skills_validator matches them as substrings of the lowercased dev_type.
common_skills = ['back-end', 'front-end', 'machine learning', 'manager', 'student', 'devops', 'mobile', 'database']
# Accumulator that skills_validator appends to; re-created empty each time this cell runs.
filtered_list = []

def skills_validator(skill):
    """Record ``skill`` once for every common-skill keyword in its dev type.

    Each keyword in the module-level ``common_skills`` is tested as a
    substring of the lowercased ``skill.dev_type``. Every hit appends a
    ``{'devtype', 'count', 'year'}`` record (labelled with the keyword) to
    the module-level ``filtered_list``.

    Returns None; the function is called for its side effect only.
    """
    for keyword in common_skills:
        if keyword not in skill.dev_type.lower():
            continue
        record = {
            'devtype': keyword,
            'count': skill['count'],
            'year': skill['year'],
        }
        filtered_list.append(record)

# Apply skills_validator to each row (axis=1). Only its appends to filtered_list matter,
# so the returned object is discarded into `_`.
_ = df_devtype.agg(skills_validator, axis=1)
# Number of records collected (one per row/keyword match).
print(len(filtered_list))

53


In [None]:
from sklearn.preprocessing import StandardScaler

# Normalize the count with respect to each year group.

def normalize(feature_name):
    """Build a min-max scaler for a single DataFrame column.

    NOTE: this notebook defines ``normalize`` a second time further down (with
    a ``+ 1`` offset); whichever definition ran last is the one in effect.

    Args:
        feature_name: Name of the column to rescale.

    Returns:
        A function that takes a DataFrame and returns a copy in which
        ``feature_name`` is mapped to ``(x - min) / (max - min)``. The input
        frame is not modified. When all values are equal the column becomes
        0.0 instead of NaN.
    """

    def feature(df):
        result = df.copy()
        low = df[feature_name].min()
        span = df[feature_name].max() - low
        shifted = df[feature_name] - low
        if span == 0:
            # A group with a single row (or all-equal counts) would otherwise
            # compute 0 / 0 and fill the column with NaN. Multiplying by 0.0
            # keeps the float dtype and leaves any existing NaN in place.
            result[feature_name] = shifted * 0.0
        else:
            result[feature_name] = shifted / span
        return result

    return feature

# Total count per (devtype keyword, year), flattened back to ordinary columns.
df_filtered_dev_types = (
    pd.DataFrame(filtered_list)
    .groupby(['devtype', 'year'])
    .agg({'count': 'sum'})
    .reset_index()
)
# Rescale 'count' separately within each year, then drop the group index.
df_normalized_devtype = (
    df_filtered_dev_types
    .groupby("year")
    .apply(normalize('count'))
    .reset_index(drop=True)
)

In [None]:
# Animated line chart of the raw (un-normalized) dev-type counts, one frame per year.
px.line(
    df_filtered_dev_types,
    x="devtype",
    y="count",
    hover_name="devtype",
    animation_frame="year",
    range_y=[0, 80000],
)

In [None]:
# Animated bar chart of the per-year normalized dev-type counts.
# The y axis is pinned to [0, 3] so the scale does not jump between frames.
px.bar(
    df_normalized_devtype,
    x="devtype",
    y="count",
    hover_name="devtype",
    animation_frame="year",
    range_y=[0, 3],
    title='Normalized Dev Types(Counts)',
)


# Gender

In [None]:
# Read the per-gender counts and preview the first rows.
df_gender = pd.read_csv('so_yearly/so_gender_2015-20.csv')
df_gender.head()

Unnamed: 0,gender,count,year
0,Male,23699,2015
1,Female,1480,2015
2,Prefer not to disclose,437,2015
3,Other,128,2015
4,Male,51388,2016


In [None]:
# Keep only the rows whose gender is exactly 'Male' or 'Female'.
gendy = df_gender.loc[df_gender.gender.isin(['Male', 'Female'])]
# Animated bar chart of those two categories, one frame per year.
px.bar(
    gendy,
    x="gender",
    y="count",
    hover_name="gender",
    animation_frame="year",
    range_y=[0, 70000],
)

# px.bar(gendy, x='year', y='count')

# Languages

In [None]:
# Read the per-language counts and preview the first rows.
df_language = pd.read_csv('so_yearly/so_languages-2016-20.csv')
df_language.head()

Unnamed: 0,languages,count,year
0,iOS,4498,2017
1,Objective-C,3202,2017
2,Android,8601,2017
3,Arduino / Raspberry Pi,3797,2017
4,AngularJS,8823,2017


In [None]:
# Lowercase language names; languages_validator compares them for exact equality
# with the lowercased `languages` value of each row (not as substrings).
languages = ['python', 'javascript','java', 'scala', 'php', 'go', 'c++', 'c#']

# Accumulator that languages_validator appends to; re-created empty each time this cell runs.
filtered_list = []

def languages_validator(row):
    """Record ``row`` when its language is one of the selected languages.

    The lowercased ``row.languages`` is compared for exact equality with each
    entry of the module-level ``languages`` list. Every equal entry appends a
    ``{'language', 'count', 'year'}`` record to the module-level
    ``filtered_list``.

    Returns None; the function is called for its side effect only.
    """
    for wanted in languages:
        if wanted != row.languages.lower():
            continue
        record = {
            'language': wanted,
            'count': row['count'],
            'year': row['year'],
        }
        filtered_list.append(record)

# Apply languages_validator to each row (axis=1). Only its appends to filtered_list matter,
# so the returned object is discarded into `_`.
_ = df_language.agg(languages_validator, axis=1)
# Number of records collected (one per row/language match).
print(len(filtered_list))

40


In [None]:
from sklearn.preprocessing import StandardScaler

# Normalize the count with respect to each year group.

def normalize(feature_name):
    """Build a shifted min-max scaler for a single DataFrame column.

    NOTE: this redefines the ``normalize`` declared earlier in the notebook.
    Unlike that one, it adds 1 after scaling, so values land in [1, 2].

    Args:
        feature_name: Name of the column to rescale.

    Returns:
        A function that takes a DataFrame and returns a copy in which
        ``feature_name`` is mapped to ``(x - min) / (max - min) + 1``. The
        input frame is not modified. When all values are equal the column
        becomes 1.0 instead of NaN.
    """

    def feature(df):
        result = df.copy()
        low = df[feature_name].min()
        span = df[feature_name].max() - low
        shifted = df[feature_name] - low
        if span == 0:
            # A group with a single row (or all-equal counts) would otherwise
            # compute 0 / 0 and fill the column with NaN. Multiplying by 0.0
            # keeps the float dtype and leaves any existing NaN in place.
            result[feature_name] = shifted * 0.0 + 1
        else:
            result[feature_name] = shifted / span + 1
        return result

    return feature

# Total count per (language, year), flattened back to ordinary columns.
df_filtered_languages = (
    pd.DataFrame(filtered_list)
    .groupby(['language', 'year'])
    .agg({'count': 'sum'})
    .reset_index()
)
# Rescale 'count' separately within each year, then drop the group index.
df_normalized_languages = (
    df_filtered_languages
    .groupby("year")
    .apply(normalize('count'))
    .reset_index(drop=True)
)

In [None]:
# List the distinct language labels that survived the filter.
# NOTE(review): the saved output includes 'c', which is not in the `languages` list defined
# in this notebook; the output looks stale, so re-run from a fresh kernel to confirm.
df_filtered_languages.language.unique()

array(['c', 'c#', 'c++', 'go', 'java', 'javascript', 'php', 'python',
       'scala'], dtype=object)

In [None]:
# Names of the ten languages with the largest 2020 counts, largest first.
language = (
    df_language
    .query("year == 2020")
    .sort_values(by='count', ascending=False)
    .languages[:10]
    .values
    .tolist()
)

# NOTE(review): `language` and `lang_df` are computed here but the chart below
# plots df_filtered_languages instead; confirm which frame was intended.
lang_df = df_language.loc[df_language.languages.isin(language)]
px.bar(
    df_filtered_languages,
    x="language",
    y="count",
    hover_name="language",
    animation_frame="year",
    range_y=[0, 100000],
    title='Popularity of Languages: 2017 - 2020',
)

In [None]:
# Animated bar chart of the per-year normalized language counts.
# The y axis is pinned to [0, 3] so the scale does not jump between frames.
px.bar(
    df_normalized_languages,
    x="language",
    y="count",
    hover_name="language",
    animation_frame="year",
    range_y=[0, 3],
    title='Normalized Popularity of Languages: 2017 - 2020',
)

# Platforms

In [None]:
# Read the per-platform counts and preview the first rows.
df_platforms = pd.read_csv('so_yearly/so_platforms_2017-20.csv')
df_platforms.head()

Unnamed: 0,platform,count,year
0,iOS,4782,2017
1,Amazon Web Services (AWS),8183,2017
2,Windows Desktop,11949,2017
3,Linux Desktop,9593,2017
4,Mac OS,5363,2017


In [None]:
# Names of the ten platforms with the largest 2020 counts, largest first.
# NOTE(review): nothing else in the visible notebook reads `platforms`.
platforms = (
    df_platforms
    .query("year == 2020")
    .sort_values(by='count', ascending=False)
    .platform[:10]
    .values
    .tolist()
)

In [None]:
# Lowercase keywords; platforms_validator matches them as substrings of the lowercased platform name.
selected_platforms = ['windows', 'mac', 'linux', ]

# Accumulator that platforms_validator appends to; re-created empty each time this cell runs.
filtered_list = []

def platforms_validator(row):
    """Record ``row`` once for every selected platform keyword in its name.

    The platform name is stripped and lowercased, then each keyword in the
    module-level ``selected_platforms`` is tested as a substring. Every hit
    appends a ``{'platform', 'count', 'year'}`` record (labelled with the
    keyword) to the module-level ``filtered_list``. 'Windows Phone' rows are
    skipped so they are not counted under the desktop 'windows' keyword.

    Args:
        row: A row exposing ``platform`` as an attribute and ``'count'`` /
            ``'year'`` by key.

    Returns:
        None; the function is called for its side effect only.
    """
    name = row.platform.strip().lower()
    # The exclusion must be tested against the normalized name. The previous
    # check compared a lowercased string with 'Windows Phone', which can never
    # be equal, so Windows Phone rows were silently included.
    if name == 'windows phone':
        return
    for platform in selected_platforms:
        if platform in name:
            filtered_list.append({'platform': platform, 'count': row['count'], 'year': row['year']})

# Apply platforms_validator to each row (axis=1). Only its appends to filtered_list matter,
# so the returned object is discarded into `_`.
_ = df_platforms.agg(platforms_validator, axis=1)
# Number of records collected (one per row/keyword match).
print(len(filtered_list))

14


In [None]:
# Total count per (platform keyword, year), flattened back to ordinary columns.
df_selected_platforms = (
    pd.DataFrame(filtered_list)
    .groupby(['platform', 'year'])
    .agg({'count': 'sum'})
    .reset_index()
)

In [None]:
# Animated bar chart of the selected platforms, one frame per year.
px.bar(
    df_selected_platforms,
    x="platform",
    y="count",
    hover_name="platform",
    animation_frame="year",
    range_y=[0, 50000],
)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=74369ce2-fcab-46fa-9e44-ec96a35fc584' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>